Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fairscape_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from fairscape_models.annotation import Annotation
from fairscape_models.biochem_entity import BioChemEntity
from fairscape_models.computation import Computation
from fairscape_models.dataset import Dataset
from fairscape_models.dataset import Dataset, Split, SplitType
from fairscape_models.software import Software
from fairscape_models.mlmodel import MLModel
from fairscape_models.fairscape_base import IdentifierValue, IdentifierPropertyValue, DEFAULT_ARK_NAAN, DEFAULT_LICENSE, DEFAULT_CONTEXT
Expand Down
7 changes: 7 additions & 0 deletions fairscape_models/conversion/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ def _map_source_to_args(self, source_dict: Dict[str, Any], mapping_def: Dict[str
value = source_dict.get(spec["source_key"])
if "parser" in spec and value is not None:
value = spec["parser"](value)
# Backward compat fallback (e.g., additionalProperty)
if value is None and "fallback_source_key" in spec:
fallback_value = source_dict.get(spec["fallback_source_key"])
if fallback_value is not None and "fallback_parser" in spec:
value = spec["fallback_parser"](fallback_value)
elif fallback_value is not None:
value = fallback_value
if value is not None:
target_args[target_key] = value
return target_args
Expand Down
70 changes: 36 additions & 34 deletions fairscape_models/conversion/mapping/AIReady.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,58 +310,60 @@ def _score_ethics(ethics: EthicsScore, root_data: Dict[str, Any]):
collection = root_data.get("rai:dataCollection", "")
if collection and str(collection).strip():
details.append(f"Data collection: {collection}")
addl_props = root_data.get("additionalProperty", [])
if isinstance(addl_props, list):
for prop in addl_props:

hs_val = root_data.get("humanSubjects")
if not hs_val:
for prop in (root_data.get("additionalProperty") or []):
if isinstance(prop, dict) and prop.get("name") == "Human Subject":
hs_val = prop.get("value")
if hs_val:
details.append(f"Human subject info: {hs_val}")
break
break
if hs_val:
details.append(f"Human subject info: {hs_val}")

if details:
ethics.ethically_acquired = SubCriterionScore(
has_content=True,
details=", ".join(details)
)

details = []
ethical_review = root_data.get("ethicalReview", "")
if ethical_review and str(ethical_review).strip():
details.append(f"Ethical review: {ethical_review}")

if isinstance(addl_props, list):
for prop in addl_props:

gov_val = root_data.get("dataGovernanceCommittee")
if not gov_val:
for prop in (root_data.get("additionalProperty") or []):
if isinstance(prop, dict) and prop.get("name") == "Data Governance Committee":
gov_val = prop.get("value")
if gov_val:
details.append(f"Governance: {gov_val}")
break
break
if gov_val:
details.append(f"Governance: {gov_val}")

if details:
ethics.ethically_managed = SubCriterionScore(
has_content=True,
details=", ".join(details)
)

details = []
license_val = root_data.get("license", "")
if license_val:
details.append(f"License: {license_val}")

psi = root_data.get("rai:personalSensitiveInformation", "")
if psi and str(psi).strip():
details.append(f"Sensitive info: {psi}")

if isinstance(addl_props, list):
for prop in addl_props:

pu_val = root_data.get("prohibitedUses")
if not pu_val:
for prop in (root_data.get("additionalProperty") or []):
if isinstance(prop, dict) and prop.get("name") == "Prohibited Uses":
pu_val = prop.get("value")
if pu_val:
details.append(f"Prohibited uses: {pu_val}")
break
break
if pu_val:
details.append(f"Prohibited uses: {pu_val}")

if details:
ethics.ethically_disseminated = SubCriterionScore(
has_content=True,
Expand Down Expand Up @@ -397,17 +399,17 @@ def _score_sustainability(sustainability: SustainabilityScore, root_data: Dict[s
details="Maintenance plan: " + maint
)

addl_props = root_data.get("additionalProperty", [])
if isinstance(addl_props, list):
for prop in addl_props:
gov_val = root_data.get("dataGovernanceCommittee")
if not gov_val:
for prop in (root_data.get("additionalProperty") or []):
if isinstance(prop, dict) and prop.get("name") == "Data Governance Committee":
gov_val = prop.get("value")
if gov_val:
sustainability.well_governed = SubCriterionScore(
has_content=True,
details=f"Governance committee: {gov_val}"
)
break
break
if gov_val:
sustainability.well_governed = SubCriterionScore(
has_content=True,
details=f"Governance committee: {gov_val}"
)

def _score_computability(computability: ComputabilityScore, root_data: Dict[str, Any], metadata_graph: List[Dict]):
"""Score Computability criteria."""
Expand Down
22 changes: 11 additions & 11 deletions fairscape_models/conversion/mapping/FairscapeDatasheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,16 @@ def _extract_id(value: Any) -> Optional[str]:
"funding": {"source_key": "funder", "parser": _as_list_str},
"keywords": {"source_key": "keywords"},

# human-subjects & governance (via additionalProperty)
"human_subject": {"source_key": "additionalProperty", "parser": from_additional_property("Human Subject")},
"human_subject_research": {"source_key": "additionalProperty", "parser": from_additional_property("Human Subject Research", "")},
"human_subject_exemptions": {"source_key": "additionalProperty", "parser": from_additional_property("Human Subjects Exemptions", "")},
"deidentified_samples": {"source_key": "additionalProperty", "parser": from_additional_property("De-identified Samples", "")},
"fda_regulated": {"source_key": "additionalProperty", "parser": from_additional_property("FDA Regulated", "")},
"irb": {"source_key": "additionalProperty", "parser": from_additional_property("IRB", "")},
"irb_protocol_id": {"source_key": "additionalProperty", "parser": from_additional_property("IRB Protocol ID", "")},
"data_governance": {"source_key": "additionalProperty", "parser": from_additional_property("Data Governance Committee")},
"completeness": {"source_key": "additionalProperty", "parser": from_additional_property("Completeness")},
# human-subjects & governance — top-level fields with additionalProperty fallback
"human_subject": {"source_key": "humanSubjects", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject")},
"human_subject_research": {"source_key": "humanSubjectResearch", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject Research", "")},
"human_subject_exemptions": {"source_key": "humanSubjectExemption", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subjects Exemptions", "")},
"deidentified_samples": {"source_key": "deidentified", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("De-identified Samples", "")},
"fda_regulated": {"source_key": "fdaRegulated", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("FDA Regulated", "")},
"irb": {"source_key": "irb", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB", "")},
"irb_protocol_id": {"source_key": "irbProtocolId", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB Protocol ID", "")},
"data_governance": {"source_key": "dataGovernanceCommittee","fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Data Governance Committee")},
"completeness": {"source_key": "completeness", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Completeness")},

# related pubs
"related_publications": {"source_key": "associatedPublication", "parser": _as_list_str},
Expand All @@ -112,7 +112,7 @@ def _extract_id(value: Any) -> Optional[str]:
USECASES_MAPPING = {
"intended_use": {"source_key": "rai:dataUseCases"},
"limitations": {"source_key": "rai:dataLimitations"},
"prohibited_uses": {"source_key": "additionalProperty", "parser": from_additional_property("Prohibited Uses")},
"prohibited_uses": {"source_key": "prohibitedUses", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Prohibited Uses")},
"potential_sources_of_bias": {"source_key": "rai:dataBiases"},
"maintenance_plan": {"source_key": "rai:dataReleaseMaintenancePlan"},

Expand Down
45 changes: 44 additions & 1 deletion fairscape_models/dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,51 @@
from pydantic import Field, ConfigDict, AliasChoices, model_validator
from pydantic import BaseModel, Field, ConfigDict, AliasChoices, model_validator
from typing import Optional, List, Union
from enum import Enum

from fairscape_models.fairscape_base import IdentifierValue, DATASET_TYPE
from fairscape_models.digital_object import DigitalObject


class SplitType(str, Enum):
    """Kind of partition that a dataset split represents (Croissant-aligned).

    Each member's value is the plain string stored in metadata; because the
    class also derives from ``str``, members compare equal to those strings.

    Croissant correspondence:
        cr:TrainingSplit   -> "train"
        cr:ValidationSplit -> "validation"
        cr:TestSplit       -> "test"
        custom             -> "other"
    """

    TRAIN = "train"            # cr:TrainingSplit
    VALIDATION = "validation"  # cr:ValidationSplit
    TEST = "test"              # cr:TestSplit
    OTHER = "other"            # any custom split


class Split(BaseModel):
    """Describe one named partition (subset) of a Dataset.

    Brings together the D4D DataSubset/SamplingStrategy concepts and
    Croissant's cr:Split in a single flat model. Only ``name`` is required;
    every other field defaults to ``None``.
    """

    # extra="allow" accepts keys that are not declared below;
    # populate_by_name permits populating fields by their attribute names.
    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # --- Identity ---
    name: str
    description: Optional[str] = None

    # --- Croissant split semantics (maps to cr:TrainingSplit, etc.) ---
    splitType: Optional[SplitType] = None

    # --- Query that selects the split: SQL or a Croissant extract/transform ---
    query: Optional[str] = None
    queryType: Optional[str] = None

    # --- Datasets the split draws from ---
    sourceDatasets: Optional[List[IdentifierValue]] = None

    # --- D4D sampling strategy (flat, all optional) ---
    isSample: Optional[bool] = None
    isRandom: Optional[bool] = None
    samplingStrategy: Optional[str] = None

class Dataset(DigitalObject):
metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#Dataset"], alias="@type")
additionalType: Optional[str] = Field(default=DATASET_TYPE)
Expand All @@ -17,6 +59,7 @@ class Dataset(DigitalObject):
)
generatedBy: Optional[Union[IdentifierValue, List[IdentifierValue]]] = Field(default=[])
derivedFrom: Optional[List[IdentifierValue]] = Field(default=[])
splits: Optional[List[Split]] = Field(default=None)

@model_validator(mode='after')
def populate_prov_fields(self):
Expand Down
101 changes: 85 additions & 16 deletions fairscape_models/rocrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,56 @@ class ROCrateMetadataElem(BaseModel):
```
"""
model_config = ConfigDict(extra="allow")


# Core identity
guid: str = Field(alias="@id")
metadataType: List[str] = Field(alias="@type")
name: str
description: str
keywords: List[str]
isPartOf: Optional[List[IdentifierValue]] = Field(default=[])
version: str
datePublished: Optional[str] = Field(default=None)

# relationships
isPartOf: Optional[List[IdentifierValue]] = Field(default=[])
hasPart: List[IdentifierValue]

# Attribution
author: Union[str, List[str]]
dataLicense: Optional[str] = Field(alias="license")
publisher: Optional[str] = Field(default=None)
principalInvestigator: Optional[str] = Field(default=None)
funder: Optional[str] = Field(default=None)
contactEmail: Optional[str] = Field(default=None)
citation: Optional[str] = Field(default=None)
associatedPublication: Optional[Union[str, List[str]]] = Field(default=None)
identifier: Optional[str] = Field(default=None)

# Licensing
dataLicense: Optional[str] = Field(alias="license")
conditionsOfAccess: Optional[str] = Field(default=None)
copyrightNotice: Optional[str] = Field(default=None)
fairscapeVersion: str = __version__


# Content info
contentSize: Optional[str] = Field(default=None)
usageInfo: Optional[str] = Field(default=None)
hasSummaryStatistics: Optional[Union[str, IdentifierValue]] = Field(default=None)
additionalProperty: Optional[List[Dict[str, Any]]] = Field(default=None)

# Compliance / ethics
ethicalReview: Optional[str] = Field(default=None)
confidentialityLevel: Optional[str] = Field(default=None)
irb: Optional[str] = Field(default=None)
irbProtocolId: Optional[str] = Field(default=None)
humanSubjectExemption: Optional[str] = Field(default=None)
fdaRegulated: Optional[bool] = Field(default=None)
deidentified: Optional[bool] = Field(default=None)
humanSubjects: Optional[str] = Field(alias="humanSubjects", default=None)
humanSubjectResearch: Optional[str] = Field(default=None)
dataGovernanceCommittee: Optional[str] = Field(default=None)
completeness: Optional[str] = Field(alias="completeness", default=None)
prohibitedUses: Optional[str] = Field(alias="prohibitedUses", default=None)

# RAI fields
rai_data_limitations: Optional[str] = Field(alias="rai:dataLimitations", default=None)
rai_data_biases: Optional[str] = Field(alias="rai:dataBiases", default=None)
rai_data_use_cases: Optional[str] = Field(alias="rai:dataUseCases", default=None)
Expand Down Expand Up @@ -132,17 +166,52 @@ class ROCrateMetadataElem(BaseModel):
evi_total_entities: Optional[int] = Field(alias="evi:totalEntities", default=None)
evi_formats: Optional[List[str]] = Field(alias="evi:formats", default=None)

def generateFileElem(self) -> ROCrateMetadataFileElem:
    """Build the ``ro-crate-metadata.json`` descriptor element for this crate.

    Returns:
        A ROCrateMetadataFileElem whose ``about`` entry carries this
        element's ``guid`` and ``name`` and which declares conformance to
        the RO-Crate 1.2 profile.
    """
    # model_validate is the pydantic v2 replacement for the deprecated
    # BaseModel.validate classmethod (this module already uses v2 APIs such
    # as ConfigDict).
    return ROCrateMetadataFileElem.model_validate({
        "@id": "ro-crate-metadata.json",
        "@type": "CreativeWork",
        "about": {
            "@id": self.guid,
            "@type": ["EVI:Dataset", "EVI:ROCrate"],
            "name": self.name,
        },
        "conformsTo": {"@id": "https://w3id.org/ro/crate/1.2"},
    })
# D4D placeholders
addressingGaps : Optional[str] = Field(alias="d4d:addressingGaps", default=None)
dataAnomalies : Optional[str] = Field(alias="d4d:dataAnomalies", default=None)
contentWarning : Optional[str] = Field(alias="d4d:contentWarning", default=None)
informedConsent : Optional[str] = Field(alias="d4d:informedConsent", default=None)
atRiskPopulations : Optional[str] = Field(alias="d4d:atRiskPopulations", default=None)

def get_aiready_warnings(self) -> List[str]:
    """Collect warnings for missing properties recommended for AI-Ready scoring.

    Returns:
        One message per unmet recommendation, in a fixed order. The list is
        empty when every recommended property is populated.
    """
    # Each rule pairs a group of attribute names with its warning text.
    # A rule fires only when every attribute in the group is falsy.
    rules = (
        # Findability / Sustainability
        (("identifier",),
         "Missing 'identifier' (DOI) — affects Findability and Sustainability scoring"),
        (("dataLicense",),
         "Missing 'license' — affects Reusability and Ethics scoring"),
        # Provenance
        (("publisher", "principalInvestigator"),
         "Missing 'publisher' or 'principalInvestigator' — affects Provenance and Computability scoring"),
        # Characterization
        (("rai_data_biases",),
         "Missing 'rai:dataBiases' — affects Characterization: potential_sources_of_bias"),
        (("rai_data_collection_missing_data",),
         "Missing 'rai:dataCollectionMissingData' — affects Characterization: data_quality"),
        (("contentSize", "hasSummaryStatistics"),
         "Missing 'contentSize' and 'hasSummaryStatistics' — affects Characterization: statistics"),
        # Pre-model explainability
        (("rai_data_use_cases", "rai_data_limitations"),
         "Missing 'rai:dataUseCases' and 'rai:dataLimitations' — affects Pre-model: fit_for_purpose"),
        # Ethics
        (("rai_data_collection",),
         "Missing 'rai:dataCollection' — affects Ethics: ethically_acquired"),
        (("ethicalReview",),
         "Missing 'ethicalReview' — affects Ethics: ethically_managed"),
        (("confidentialityLevel",),
         "Missing 'confidentialityLevel' — affects Ethics: secure"),
        # Sustainability
        (("rai_data_release_maintenance_plan",),
         "Missing 'rai:dataReleaseMaintenancePlan' — affects Sustainability: domain_appropriate"),
    )

    return [
        message
        for attr_names, message in rules
        if not any(getattr(self, attr) for attr in attr_names)
    ]


class ROCrateDistribution(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "fairscape-models"
version = "1.0.24"
version = "1.0.23"
description = "Fairscape pydantic models"
readme = "README.md"
authors = [
Expand Down
Loading