Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
db2e416
Add semantic exchange layer for D4D ↔ RO-Crate transformations
realmarcin Mar 13, 2026
ef5afc4
Fix Copilot review issues on semantic exchange implementation
realmarcin Mar 13, 2026
4dcdac1
Integrate FAIRSCAPE RO-Crate reference implementation alignment
realmarcin Mar 18, 2026
4721fc3
Fix Copilot review issues (9 new issues from FAIRSCAPE alignment)
realmarcin Mar 18, 2026
10ecded
Organize documentation: Move .md files to notes/
realmarcin Mar 18, 2026
c8f345a
Add FAIRSCAPE Pydantic models integration for D4D to RO-Crate conversion
realmarcin Mar 20, 2026
c35a5e4
Document FAIRSCAPE JSON vs Pydantic classes relationship
realmarcin Mar 20, 2026
4ac667a
Add SSSOM mapping generation and complete slot_uri coverage
realmarcin Mar 20, 2026
1e2c227
Implement FAIRSCAPE → D4D reverse converter (bidirectional transformation)
realmarcin Mar 20, 2026
875e5c2
Improve round-trip fidelity with namespaced properties (CM4AI example)
realmarcin Mar 20, 2026
718bd9e
Complete D4D ↔ RO-Crate SKOS and SSSOM mappings
realmarcin Mar 20, 2026
eca117d
Add D4D schema path and RO-Crate JSON path columns to SSSOM mappings
realmarcin Mar 20, 2026
2a1bb6e
Add URI-level SSSOM mapping (D4D slot URIs ↔ RO-Crate property URIs)
realmarcin Mar 20, 2026
9aa9cc6
Add D4D URI coverage analysis report and recommendations
realmarcin Mar 20, 2026
53e638d
Add comprehensive SSSOM mapping covering ALL 270 D4D attributes
realmarcin Mar 20, 2026
02625e8
Add comprehensive URI-level SSSOM for all 270 D4D attributes
realmarcin Mar 20, 2026
02b7f97
Merge main into semantic_xchange to include PR #134 (94 slot_uri definitions)
realmarcin Mar 24, 2026
43c3a44
Merge main into semantic_xchange
realmarcin Mar 24, 2026
15da612
Fix invalid ARK identifiers with trailing commas
realmarcin Mar 24, 2026
4b1cd45
Fix SKOS alignment inconsistencies with FAIRSCAPE reference
realmarcin Mar 24, 2026
082332c
Remove unused FAIRSCAPE imports and dead code
realmarcin Mar 24, 2026
3bd1a2d
Rename vulnerable_populations to at_risk_populations throughout schema
realmarcin Mar 24, 2026
95d40cd
Update D4D-RO-Crate mapping TSV to use at_risk_populations
realmarcin Mar 24, 2026
c4a9443
Update remaining vulnerable_populations references to at_risk_populations
realmarcin Mar 24, 2026
1c32a0d
Consolidate all SSSOM mapping files to data/mappings/
realmarcin Mar 24, 2026
5f7ca6c
Update mappings README to document all SSSOM files
realmarcin Mar 24, 2026
5e6b2e2
Rename mappings/ to linkml_mappings/
realmarcin Mar 24, 2026
95f4e8e
Add d4d_module column to all SSSOM mapping files
realmarcin Mar 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
406 changes: 406 additions & 0 deletions .claude/agents/scripts/auto_process_rocrates.py

Large diffs are not rendered by default.

319 changes: 319 additions & 0 deletions .claude/agents/scripts/d4d_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do think that much of this functionality can be handled by linkml-map, particularly for things like type conversions, datetime strings, etc. If this works for now then this is fine as a description of what needs to happen in order to get from RO-Crate metadata to D4D.

"""
D4D Builder - Construct D4D YAML structure from RO-Crate metadata.

This module builds the D4D datasheet structure by mapping RO-Crate properties
to D4D classes and fields according to the TSV mapping specification.
"""

from datetime import datetime
from typing import Any, Dict, List, Optional, Union


class D4DBuilder:
    """Build D4D YAML structure from mapped RO-Crate data.

    Walks the D4D fields covered by a TSV mapping specification, pulls the
    corresponding RO-Crate property values from a parser, and applies
    per-field type transformations (dates, ints, lists, enums, URIs,
    Person/Organization entities, booleans) to produce a flat dict suitable
    for serialization as a D4D Dataset.
    """

    def __init__(self, mapping_loader):
        """
        Initialize D4D builder with mapping loader.

        Args:
            mapping_loader: MappingLoader instance with field mappings
        """
        self.mapping = mapping_loader
        # Accumulated D4D field -> transformed value; reset by build_dataset().
        self.d4d_data: Dict[str, Any] = {}

    def build_dataset(self, rocrate_parser) -> Dict[str, Any]:
        """
        Build complete D4D Dataset from RO-Crate parser.

        Args:
            rocrate_parser: ROCrateParser instance with loaded RO-Crate data

        Returns:
            Dict with D4D Dataset structure
        """
        self.d4d_data = {}

        # Get all covered D4D fields
        covered_fields = self.mapping.get_covered_fields()

        print(f"\nBuilding D4D dataset from {len(covered_fields)} mapped fields...")

        # Map each covered field
        mapped_count = 0
        for d4d_field in covered_fields:
            rocrate_property = self.mapping.get_rocrate_property(d4d_field)
            if not rocrate_property:
                continue

            # A D4D field may map to several RO-Crate properties
            # (comma-separated in the TSV); the first non-None value wins.
            rocrate_props = [p.strip() for p in rocrate_property.split(',')]

            value = None
            for rc_prop in rocrate_props:
                value = rocrate_parser.get_property(rc_prop)
                if value is not None:
                    break

            if value is not None:
                # Apply transformations based on field type
                transformed_value = self.apply_field_transformation(d4d_field, value)
                self.d4d_data[d4d_field] = transformed_value
                mapped_count += 1

        print(f"Successfully mapped {mapped_count}/{len(covered_fields)} fields")

        return self.d4d_data

    def apply_field_transformation(self, field_name: str, value: Any) -> Any:
        """
        Apply field-specific transformations to values.

        Dispatch order matters: date, int, list, enum, URI, person/org,
        bool, then plain string as the fallback.

        Args:
            field_name: D4D field name
            value: Raw value from RO-Crate

        Returns:
            Transformed value appropriate for D4D field
        """
        # Get mapping info for this field; without it, pass through unchanged.
        mapping_info = self.mapping.get_mapping_info(field_name)
        if not mapping_info:
            return value

        field_type = mapping_info.get('Type', '').lower()

        # Date transformations
        if 'date' in field_type or field_name in ['created_on', 'last_updated_on', 'issued', 'distribution_dates']:
            return self._transform_date(value)

        # Integer transformations
        if field_type in ['int', 'integer']:
            return self._transform_int(value)

        # List transformations
        if 'list' in field_type or isinstance(value, list):
            return self._transform_list(value, field_name)

        # Enum transformations
        if 'enum' in field_type:
            return self._transform_enum(value, field_name)

        # URI transformations
        if field_type == 'uri' or field_name in ['doi', 'download_url', 'publisher', 'status', 'conforms_to']:
            return self._transform_uri(value)

        # Person/Organization transformations
        if field_name in ['creators', 'created_by', 'modified_by', 'funders']:
            return self._transform_person_org(value)

        # Boolean transformations
        if field_type in ['bool', 'boolean']:
            return self._transform_bool(value)

        # String is default - handle None
        if value is None:
            return None

        # Return as string
        return str(value)

    def _transform_date(self, value: Any) -> Optional[str]:
        """Transform date values to D4D Date format (YYYY-MM-DD).

        Accepts ISO 8601 datetimes (with or without a trailing 'Z') and
        plain YYYY-MM-DD strings; anything unparseable is returned as-is.
        """
        if value is None:
            return None

        value_str = str(value)

        # Handle ISO 8601 datetime strings ('Z' is not accepted by
        # fromisoformat on older Pythons, so normalize it to +00:00).
        if 'T' in value_str:
            try:
                dt = datetime.fromisoformat(value_str.replace('Z', '+00:00'))
                return dt.strftime('%Y-%m-%d')
            except ValueError:
                pass

        # Handle YYYY-MM-DD format (already correct); truncate any suffix.
        if len(value_str) >= 10 and value_str[4] == '-' and value_str[7] == '-':
            return value_str[:10]

        # Return as-is if can't parse
        return value_str

    def _transform_int(self, value: Any) -> Optional[int]:
        """Transform values to integer; None when conversion fails."""
        if value is None:
            return None

        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    def _transform_list(self, value: Any, field_name: str) -> Optional[Union[List, str]]:
        """Transform list values.

        Scalars are wrapped into a one-element list; keywords become a list
        of strings; lists of Person/Organization dicts are reduced to names.
        """
        if value is None:
            return None

        if not isinstance(value, list):
            return [value]

        # For keywords, return list of strings
        if field_name == 'keywords':
            return [str(item) for item in value]

        # For complex objects, extract relevant info
        if all(isinstance(item, dict) for item in value):
            # Person/Organization lists
            if field_name in ['creators', 'created_by', 'funders']:
                return [self._extract_name_from_entity(item) for item in value]

        return value

    def _transform_enum(self, value: Any, field_name: str) -> Optional[str]:
        """Transform enum values.

        Currently only the 'compression' field has a dedicated mapping
        (MIME types / common names -> CompressionEnum); everything else
        is stringified.
        """
        if value is None:
            return None

        # CompressionEnum values
        if field_name == 'compression':
            compression_map = {
                'gzip': 'GZIP',
                'tar': 'TAR',
                'zip': 'ZIP',
                'bzip2': 'BZIP2',
                'application/gzip': 'GZIP',
                'application/zip': 'ZIP',
                'application/x-tar': 'TAR',
            }
            value_lower = str(value).lower()
            for key, enum_value in compression_map.items():
                if key in value_lower:
                    return enum_value

        return str(value)

    def _transform_uri(self, value: Any) -> Optional[str]:
        """Transform URI values; bare DOIs ('10.x/...') become doi.org URLs."""
        if value is None:
            return None

        value_str = str(value)

        # Ensure proper URI format
        if not value_str.startswith(('http://', 'https://', 'doi:', 'urn:')):
            # DOI special case
            if value_str.startswith('10.'):
                return f"https://doi.org/{value_str}"

        return value_str

    def _transform_person_org(self, value: Any) -> Optional[str]:
        """Transform Person/Organization entities to string representation.

        Lists are joined into a comma-separated string of extracted names;
        non-dict scalars are stringified.
        """
        if value is None:
            return None

        if isinstance(value, dict):
            return self._extract_name_from_entity(value)

        if isinstance(value, list):
            names = [self._extract_name_from_entity(item) for item in value if isinstance(item, dict)]
            return ', '.join(filter(None, names)) if names else None

        return str(value)

    def _transform_bool(self, value: Any) -> Optional[bool]:
        """Transform boolean values; unrecognized strings yield None."""
        if value is None:
            return None

        if isinstance(value, bool):
            return value

        value_str = str(value).lower()
        if value_str in ['true', 'yes', '1']:
            return True
        elif value_str in ['false', 'no', '0']:
            return False

        return None

    def _extract_name_from_entity(self, entity: Dict[str, Any]) -> Optional[str]:
        """Extract a display name from a Person or Organization entity.

        Priority: explicit 'name'; then 'givenName familyName' combined when
        both are present (previously the combined branch was unreachable
        because the field loop returned 'givenName' alone first); then either
        single name part; finally a non-URL '@id'.
        """
        if not isinstance(entity, dict):
            return None

        # Explicit name wins outright.
        if 'name' in entity:
            return str(entity['name'])

        # Combine givenName and familyName if both present — must be checked
        # before the single-field fallbacks or it can never fire.
        given = entity.get('givenName')
        family = entity.get('familyName')
        if given and family:
            return f"{given} {family}"

        # Fall back to whichever single name part exists.
        for field in ['givenName', 'familyName']:
            if field in entity:
                return str(entity[field])

        # Last resort: a non-URL identifier (URLs are not useful as names).
        if '@id' in entity and not str(entity['@id']).startswith(('http://', 'https://')):
            return str(entity['@id'])

        return None

    def set_field(self, field_name: str, value: Any):
        """
        Manually set a D4D field value.

        Args:
            field_name: D4D field name
            value: Value to set
        """
        self.d4d_data[field_name] = value

    def get_field(self, field_name: str) -> Optional[Any]:
        """
        Get a D4D field value.

        Args:
            field_name: D4D field name

        Returns:
            Field value, or None if not set
        """
        return self.d4d_data.get(field_name)

    def get_dataset(self) -> Dict[str, Any]:
        """
        Get the complete D4D dataset structure.

        Returns:
            Dict with D4D Dataset data (a shallow copy)
        """
        return self.d4d_data.copy()


if __name__ == "__main__":
# Test the D4D builder
import sys
from mapping_loader import MappingLoader
from rocrate_parser import ROCrateParser

if len(sys.argv) < 3:
print("Usage: python d4d_builder.py <mapping_tsv> <rocrate_json>")
sys.exit(1)

mapping = MappingLoader(sys.argv[1])
parser = ROCrateParser(sys.argv[2])

builder = D4DBuilder(mapping)
dataset = builder.build_dataset(parser)

print("\n=== Built D4D Dataset ===")
print(f"Total fields: {len(dataset)}")
print("\nSample fields:")
for key in list(dataset.keys())[:10]:
value = dataset[key]
value_str = str(value)[:60]
if len(str(value)) > 60:
value_str += "..."
print(f" {key}: {value_str}")
Loading
Loading