diff --git a/fairscape_models/__init__.py b/fairscape_models/__init__.py index 6ec60e4..4a29b4c 100644 --- a/fairscape_models/__init__.py +++ b/fairscape_models/__init__.py @@ -3,7 +3,7 @@ from fairscape_models.annotation import Annotation from fairscape_models.biochem_entity import BioChemEntity from fairscape_models.computation import Computation -from fairscape_models.dataset import Dataset +from fairscape_models.dataset import Dataset, Split, SplitType from fairscape_models.software import Software from fairscape_models.mlmodel import MLModel from fairscape_models.fairscape_base import IdentifierValue, IdentifierPropertyValue, DEFAULT_ARK_NAAN, DEFAULT_LICENSE, DEFAULT_CONTEXT diff --git a/fairscape_models/conversion/converter.py b/fairscape_models/conversion/converter.py index c58e0c5..af21b61 100644 --- a/fairscape_models/conversion/converter.py +++ b/fairscape_models/conversion/converter.py @@ -62,6 +62,13 @@ def _map_source_to_args(self, source_dict: Dict[str, Any], mapping_def: Dict[str value = source_dict.get(spec["source_key"]) if "parser" in spec and value is not None: value = spec["parser"](value) + # Backward compat fallback (e.g., additionalProperty) + if value is None and "fallback_source_key" in spec: + fallback_value = source_dict.get(spec["fallback_source_key"]) + if fallback_value is not None and "fallback_parser" in spec: + value = spec["fallback_parser"](fallback_value) + elif fallback_value is not None: + value = fallback_value if value is not None: target_args[target_key] = value return target_args diff --git a/fairscape_models/conversion/mapping/AIReady.py b/fairscape_models/conversion/mapping/AIReady.py index 8efa5c2..434bbe5 100644 --- a/fairscape_models/conversion/mapping/AIReady.py +++ b/fairscape_models/conversion/mapping/AIReady.py @@ -310,58 +310,60 @@ def _score_ethics(ethics: EthicsScore, root_data: Dict[str, Any]): collection = root_data.get("rai:dataCollection", "") if collection and str(collection).strip(): details.append(f"Data 
collection: {collection}") - - addl_props = root_data.get("additionalProperty", []) - if isinstance(addl_props, list): - for prop in addl_props: + + hs_val = root_data.get("humanSubjects") + if not hs_val: + for prop in (root_data.get("additionalProperty") or []): if isinstance(prop, dict) and prop.get("name") == "Human Subject": hs_val = prop.get("value") - if hs_val: - details.append(f"Human subject info: {hs_val}") - break - + break + if hs_val: + details.append(f"Human subject info: {hs_val}") + if details: ethics.ethically_acquired = SubCriterionScore( has_content=True, details=", ".join(details) ) - + details = [] ethical_review = root_data.get("ethicalReview", "") if ethical_review and str(ethical_review).strip(): details.append(f"Ethical review: {ethical_review}") - - if isinstance(addl_props, list): - for prop in addl_props: + + gov_val = root_data.get("dataGovernanceCommittee") + if not gov_val: + for prop in (root_data.get("additionalProperty") or []): if isinstance(prop, dict) and prop.get("name") == "Data Governance Committee": gov_val = prop.get("value") - if gov_val: - details.append(f"Governance: {gov_val}") - break - + break + if gov_val: + details.append(f"Governance: {gov_val}") + if details: ethics.ethically_managed = SubCriterionScore( has_content=True, details=", ".join(details) ) - + details = [] license_val = root_data.get("license", "") if license_val: details.append(f"License: {license_val}") - + psi = root_data.get("rai:personalSensitiveInformation", "") if psi and str(psi).strip(): details.append(f"Sensitive info: {psi}") - - if isinstance(addl_props, list): - for prop in addl_props: + + pu_val = root_data.get("prohibitedUses") + if not pu_val: + for prop in (root_data.get("additionalProperty") or []): if isinstance(prop, dict) and prop.get("name") == "Prohibited Uses": pu_val = prop.get("value") - if pu_val: - details.append(f"Prohibited uses: {pu_val}") - break - + break + if pu_val: + details.append(f"Prohibited uses: {pu_val}") + if 
details: ethics.ethically_disseminated = SubCriterionScore( has_content=True, @@ -397,17 +399,17 @@ def _score_sustainability(sustainability: SustainabilityScore, root_data: Dict[s details="Maintenance plan: " + maint ) - addl_props = root_data.get("additionalProperty", []) - if isinstance(addl_props, list): - for prop in addl_props: + gov_val = root_data.get("dataGovernanceCommittee") + if not gov_val: + for prop in (root_data.get("additionalProperty") or []): if isinstance(prop, dict) and prop.get("name") == "Data Governance Committee": gov_val = prop.get("value") - if gov_val: - sustainability.well_governed = SubCriterionScore( - has_content=True, - details=f"Governance committee: {gov_val}" - ) - break + break + if gov_val: + sustainability.well_governed = SubCriterionScore( + has_content=True, + details=f"Governance committee: {gov_val}" + ) def _score_computability(computability: ComputabilityScore, root_data: Dict[str, Any], metadata_graph: List[Dict]): """Score Computability criteria.""" diff --git a/fairscape_models/conversion/mapping/FairscapeDatasheet.py b/fairscape_models/conversion/mapping/FairscapeDatasheet.py index 0465815..9bcb67c 100644 --- a/fairscape_models/conversion/mapping/FairscapeDatasheet.py +++ b/fairscape_models/conversion/mapping/FairscapeDatasheet.py @@ -76,16 +76,16 @@ def _extract_id(value: Any) -> Optional[str]: "funding": {"source_key": "funder", "parser": _as_list_str}, "keywords": {"source_key": "keywords"}, - # human-subjects & governance (via additionalProperty) - "human_subject": {"source_key": "additionalProperty", "parser": from_additional_property("Human Subject")}, - "human_subject_research": {"source_key": "additionalProperty", "parser": from_additional_property("Human Subject Research", "")}, - "human_subject_exemptions": {"source_key": "additionalProperty", "parser": from_additional_property("Human Subjects Exemptions", "")}, - "deidentified_samples": {"source_key": "additionalProperty", "parser": 
from_additional_property("De-identified Samples", "")}, - "fda_regulated": {"source_key": "additionalProperty", "parser": from_additional_property("FDA Regulated", "")}, - "irb": {"source_key": "additionalProperty", "parser": from_additional_property("IRB", "")}, - "irb_protocol_id": {"source_key": "additionalProperty", "parser": from_additional_property("IRB Protocol ID", "")}, - "data_governance": {"source_key": "additionalProperty", "parser": from_additional_property("Data Governance Committee")}, - "completeness": {"source_key": "additionalProperty", "parser": from_additional_property("Completeness")}, + # human-subjects & governance — top-level fields with additionalProperty fallback + "human_subject": {"source_key": "humanSubjects", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject")}, + "human_subject_research": {"source_key": "humanSubjectResearch", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject Research", "")}, + "human_subject_exemptions": {"source_key": "humanSubjectExemption", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subjects Exemptions", "")}, + "deidentified_samples": {"source_key": "deidentified", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("De-identified Samples", "")}, + "fda_regulated": {"source_key": "fdaRegulated", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("FDA Regulated", "")}, + "irb": {"source_key": "irb", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB", "")}, + "irb_protocol_id": {"source_key": "irbProtocolId", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB Protocol ID", "")}, + "data_governance": {"source_key": "dataGovernanceCommittee","fallback_source_key": "additionalProperty", 
"fallback_parser": from_additional_property("Data Governance Committee")}, + "completeness": {"source_key": "completeness", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Completeness")}, # related pubs "related_publications": {"source_key": "associatedPublication", "parser": _as_list_str}, @@ -112,7 +112,7 @@ def _extract_id(value: Any) -> Optional[str]: USECASES_MAPPING = { "intended_use": {"source_key": "rai:dataUseCases"}, "limitations": {"source_key": "rai:dataLimitations"}, - "prohibited_uses": {"source_key": "additionalProperty", "parser": from_additional_property("Prohibited Uses")}, + "prohibited_uses": {"source_key": "prohibitedUses", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Prohibited Uses")}, "potential_sources_of_bias": {"source_key": "rai:dataBiases"}, "maintenance_plan": {"source_key": "rai:dataReleaseMaintenancePlan"}, diff --git a/fairscape_models/dataset.py b/fairscape_models/dataset.py index 7949875..21fb290 100644 --- a/fairscape_models/dataset.py +++ b/fairscape_models/dataset.py @@ -1,9 +1,51 @@ -from pydantic import Field, ConfigDict, AliasChoices, model_validator +from pydantic import BaseModel, Field, ConfigDict, AliasChoices, model_validator from typing import Optional, List, Union +from enum import Enum from fairscape_models.fairscape_base import IdentifierValue, DATASET_TYPE from fairscape_models.digital_object import DigitalObject + +class SplitType(str, Enum): + """Croissant-aligned split type semantics. + + Maps to: + cr:TrainingSplit -> "train" + cr:ValidationSplit -> "validation" + cr:TestSplit -> "test" + custom -> "other" + """ + TRAIN = "train" + VALIDATION = "validation" + TEST = "test" + OTHER = "other" + + +class Split(BaseModel): + """A named partition or subset of a Dataset. + + Unifies concepts from D4D DataSubset/SamplingStrategy and Croissant cr:Split. 
+ """ + model_config = ConfigDict(extra="allow", populate_by_name=True) + + # Identity + name: str + description: Optional[str] = Field(default=None) + + # Croissant split semantics (maps to cr:TrainingSplit, etc.) + splitType: Optional[SplitType] = Field(default=None) + + # Query information SQL or croissant extract transform + query: Optional[str] = Field(default=None) + queryType: Optional[str] = Field(default=None) + + sourceDatasets: Optional[List[IdentifierValue]] = Field(default=None) + + # D4D sampling strategy (flat, all optional) + isSample: Optional[bool] = Field(default=None) + isRandom: Optional[bool] = Field(default=None) + samplingStrategy: Optional[str] = Field(default=None) + class Dataset(DigitalObject): metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#Dataset"], alias="@type") additionalType: Optional[str] = Field(default=DATASET_TYPE) @@ -17,6 +59,7 @@ class Dataset(DigitalObject): ) generatedBy: Optional[Union[IdentifierValue, List[IdentifierValue]]] = Field(default=[]) derivedFrom: Optional[List[IdentifierValue]] = Field(default=[]) + splits: Optional[List[Split]] = Field(default=None) @model_validator(mode='after') def populate_prov_fields(self): diff --git a/fairscape_models/rocrate.py b/fairscape_models/rocrate.py index ccebf89..bc3a239 100644 --- a/fairscape_models/rocrate.py +++ b/fairscape_models/rocrate.py @@ -85,22 +85,56 @@ class ROCrateMetadataElem(BaseModel): ``` """ model_config = ConfigDict(extra="allow") - + + # Core identity guid: str = Field(alias="@id") metadataType: List[str] = Field(alias="@type") name: str description: str keywords: List[str] - isPartOf: Optional[List[IdentifierValue]] = Field(default=[]) version: str + datePublished: Optional[str] = Field(default=None) + + # relationships + isPartOf: Optional[List[IdentifierValue]] = Field(default=[]) hasPart: List[IdentifierValue] + + # Attribution author: Union[str, List[str]] - dataLicense: Optional[str] = 
Field(alias="license") + publisher: Optional[str] = Field(default=None) + principalInvestigator: Optional[str] = Field(default=None) + funder: Optional[str] = Field(default=None) + contactEmail: Optional[str] = Field(default=None) + citation: Optional[str] = Field(default=None) associatedPublication: Optional[Union[str, List[str]]] = Field(default=None) + identifier: Optional[str] = Field(default=None) + + # Licensing + dataLicense: Optional[str] = Field(alias="license") conditionsOfAccess: Optional[str] = Field(default=None) copyrightNotice: Optional[str] = Field(default=None) - fairscapeVersion: str = __version__ - + + # Content info + contentSize: Optional[str] = Field(default=None) + usageInfo: Optional[str] = Field(default=None) + hasSummaryStatistics: Optional[Union[str, IdentifierValue]] = Field(default=None) + additionalProperty: Optional[List[Dict[str, Any]]] = Field(default=None) + + # Compliance / ethics + ethicalReview: Optional[str] = Field(default=None) + confidentialityLevel: Optional[str] = Field(default=None) + irb: Optional[str] = Field(default=None) + irbProtocolId: Optional[str] = Field(default=None) + humanSubjectExemption: Optional[str] = Field(default=None) + fdaRegulated: Optional[bool] = Field(default=None) + deidentified: Optional[bool] = Field(default=None) + humanSubjects: Optional[str] = Field(alias="humanSubjects", default=None) + humanSubjectResearch: Optional[str] = Field(default=None) + dataGovernanceCommittee: Optional[str] = Field(default=None) + completeness: Optional[str] = Field(alias="completeness", default=None) + prohibitedUses: Optional[str] = Field(alias="prohibitedUses", default=None) + + # RAI fields rai_data_limitations: Optional[str] = Field(alias="rai:dataLimitations", default=None) rai_data_biases: Optional[str] = Field(alias="rai:dataBiases", default=None) rai_data_use_cases: Optional[str] = Field(alias="rai:dataUseCases", default=None) @@ -132,17 +166,52 @@ class ROCrateMetadataElem(BaseModel): evi_total_entities: 
Optional[int] = Field(alias="evi:totalEntities", default=None) evi_formats: Optional[List[str]] = Field(alias="evi:formats", default=None) - def generateFileElem(self)->ROCrateMetadataFileElem: - return ROCrateMetadataFileElem.validate({ - "@id": "ro-crate-metadata.json", - "@type": "CreativeWork", - "about": { - "@id": self.guid, - "@type": ["EVI:Dataset", "EVI:ROCrate"], - "name": self.name - }, - "conformsTo": {"@id": "https://w3id.org/ro/crate/1.2"} - }) + #D4D Placeholders + addressingGaps : Optional[str] = Field(alias="d4d:addressingGaps", default=None) + dataAnomalies : Optional[str] = Field(alias="d4d:dataAnomalies", default=None) + contentWarning : Optional[str] = Field(alias="d4d:contentWarning", default=None) + informedConsent : Optional[str] = Field(alias="d4d:informedConsent", default=None) + atRiskPopulations : Optional[str] = Field(alias="d4d:atRiskPopulations", default=None) + + def get_aiready_warnings(self) -> List[str]: + """Return a list of warnings for properties recommended for AI-Ready scoring that are missing.""" + warnings = [] + + # Fairness / Sustainability + if not self.identifier: + warnings.append("Missing 'identifier' (DOI) — affects Findability and Sustainability scoring") + if not self.dataLicense: + warnings.append("Missing 'license' — affects Reusability and Ethics scoring") + + # Provenance + if not self.publisher and not self.principalInvestigator: + warnings.append("Missing 'publisher' or 'principalInvestigator' — affects Provenance and Computability scoring") + + # Characterization + if not self.rai_data_biases: + warnings.append("Missing 'rai:dataBiases' — affects Characterization: potential_sources_of_bias") + if not self.rai_data_collection_missing_data: + warnings.append("Missing 'rai:dataCollectionMissingData' — affects Characterization: data_quality") + if not self.contentSize and not self.hasSummaryStatistics: + warnings.append("Missing 'contentSize' and 'hasSummaryStatistics' — affects Characterization: statistics") + 
+ # Pre-model explainability + if not self.rai_data_use_cases and not self.rai_data_limitations: + warnings.append("Missing 'rai:dataUseCases' and 'rai:dataLimitations' — affects Pre-model: fit_for_purpose") + + # Ethics + if not self.rai_data_collection: + warnings.append("Missing 'rai:dataCollection' — affects Ethics: ethically_acquired") + if not self.ethicalReview: + warnings.append("Missing 'ethicalReview' — affects Ethics: ethically_managed") + if not self.confidentialityLevel: + warnings.append("Missing 'confidentialityLevel' — affects Ethics: secure") + + # Sustainability + if not self.rai_data_release_maintenance_plan: + warnings.append("Missing 'rai:dataReleaseMaintenancePlan' — affects Sustainability: domain_appropriate") + + return warnings class ROCrateDistribution(BaseModel): diff --git a/pyproject.toml b/pyproject.toml index 9567528..16c357d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fairscape-models" -version = "1.0.24" +version = "1.0.23" description = "Fairscape pydantic models" readme = "README.md" authors = [ diff --git a/tests/test_rocrate_validation.py b/tests/test_rocrate_validation.py index 7be8079..d94bd1d 100644 --- a/tests/test_rocrate_validation.py +++ b/tests/test_rocrate_validation.py @@ -336,6 +336,74 @@ def test_clean_identifiers_with_annotation(): assert annotation.generated[0].guid == "ark:59852/test-output" +def _minimal_rocrate_elem(**kwargs) -> ROCrateMetadataElem: + """Build a minimal ROCrateMetadataElem, overridable via kwargs.""" + base = { + "@id": "ark:59852/test-crate", + "@type": ["Dataset", "https://w3id.org/EVI#ROCrate"], + "name": "Test", "description": "Test", "keywords": [], + "version": "1.0", "author": "tester", "license": None, + "hasPart": [] + } + base.update(kwargs) + return ROCrateMetadataElem.model_validate(base) + + +def test_get_aiready_warnings_all_missing(): + """All recommended fields absent → all 11 warnings returned.""" +
elem = _minimal_rocrate_elem() + warnings = elem.get_aiready_warnings() + assert len(warnings) == 11 + texts = "\n".join(warnings) + assert "identifier" in texts + assert "license" in texts + assert "publisher" in texts + assert "rai:dataBiases" in texts + assert "rai:dataCollectionMissingData" in texts + assert "contentSize" in texts + assert "rai:dataUseCases" in texts + assert "rai:dataCollection" in texts + assert "ethicalReview" in texts + assert "confidentialityLevel" in texts + assert "rai:dataReleaseMaintenancePlan" in texts + + +def test_get_aiready_warnings_all_present(): + """All recommended fields present → empty warnings list.""" + elem = _minimal_rocrate_elem(**{ + "identifier": "https://doi.org/10.1234/test", + "license": "MIT", + "publisher": "Test Publisher", + "rai:dataBiases": "None known", + "rai:dataCollectionMissingData": "No missing data", + "contentSize": "1GB", + "rai:dataUseCases": "Training ML models", + "rai:dataCollection": "Prospective study", + "ethicalReview": "IRB approved", + "confidentialityLevel": "Public", + "rai:dataReleaseMaintenancePlan": "Annual updates", + }) + warnings = elem.get_aiready_warnings() + assert warnings == [] + + +def test_get_aiready_warnings_partial(): + """publisher present suppresses that warning; others still fire.""" + elem = _minimal_rocrate_elem(publisher="UCSD") + warnings = elem.get_aiready_warnings() + texts = "\n".join(warnings) + assert "publisher" not in texts + assert "identifier" in texts + + +def test_get_aiready_warnings_missing_license(): + """No license → license warning fires.""" + elem = _minimal_rocrate_elem(**{"license": None}) + warnings = elem.get_aiready_warnings() + texts = "\n".join(warnings) + assert "license" in texts + + def test_clean_identifiers_with_experiment(): """Test cleanIdentifiers with Experiment elements.""" data = {