From 8e8fb2b389d1f4f258828bd70db71ae6d87f1197 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 8 Jan 2026 11:45:15 -0500 Subject: [PATCH] prov types --- fairscape_models/activity.py | 2 +- fairscape_models/biochem_entity.py | 4 +- fairscape_models/computation.py | 2 +- fairscape_models/dataset.py | 2 +- fairscape_models/digital_object.py | 2 +- fairscape_models/experiment.py | 2 +- fairscape_models/instrument.py | 2 +- fairscape_models/mlmodel.py | 2 +- fairscape_models/model_card.py | 39 +++++++- fairscape_models/rocrate.py | 9 +- fairscape_models/sample.py | 2 +- fairscape_models/software.py | 4 +- tests/test_model_card.py | 141 +++++++++++++++++++++++++++++ 13 files changed, 192 insertions(+), 21 deletions(-) create mode 100644 tests/test_model_card.py diff --git a/fairscape_models/activity.py b/fairscape_models/activity.py index 2363adb..417db62 100644 --- a/fairscape_models/activity.py +++ b/fairscape_models/activity.py @@ -7,7 +7,7 @@ class Activity(BaseModel): """Base class for Activity types (Computation, Annotation, Experiment)""" guid: str = Field(alias="@id") name: str - metadataType: Optional[str] = Field(default=None, alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Activity'], alias="@type") description: str = Field(min_length=10) associatedPublication: Optional[str] = Field(default=None) generated: Optional[List[IdentifierValue]] = Field(default=[]) diff --git a/fairscape_models/biochem_entity.py b/fairscape_models/biochem_entity.py index dcaa06a..727ffcd 100644 --- a/fairscape_models/biochem_entity.py +++ b/fairscape_models/biochem_entity.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, Field, ConfigDict -from typing import Optional, List +from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue, IdentifierPropertyValue @@ -9,7 +9,7 @@ class BioChemEntity(BaseModel): This class can apply to Protiens, Genes, Chemical Entities, or Biological Samples """ guid: str = Field(alias="@id") - metadataType: Optional[str] = Field(default="BioChemEntity", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', 'evi:BioChemEntity'], alias="@type") name: str identifier: Optional[List[IdentifierPropertyValue]] = Field(default=[]) associatedDisease: Optional[IdentifierValue] = Field(default=None) diff --git a/fairscape_models/computation.py b/fairscape_models/computation.py index 6d98179..2391f23 100644 --- a/fairscape_models/computation.py +++ b/fairscape_models/computation.py @@ -5,7 +5,7 @@ from fairscape_models.activity import Activity class Computation(Activity): - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Computation", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Activity', "https://w3id.org/EVI#Computation"], alias="@type") additionalType: Optional[str] = Field(default=COMPUTATION_TYPE) runBy: Union[str, IdentifierValue] dateCreated: str diff --git a/fairscape_models/dataset.py b/fairscape_models/dataset.py index cb5fa89..7949875 100644 --- a/fairscape_models/dataset.py +++ b/fairscape_models/dataset.py @@ -5,7 +5,7 @@ from fairscape_models.digital_object import DigitalObject class Dataset(DigitalObject): - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Dataset", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#Dataset"], alias="@type") additionalType: Optional[str] = Field(default=DATASET_TYPE) datePublished: str = Field(...) keywords: List[str] = Field(...) diff --git a/fairscape_models/digital_object.py b/fairscape_models/digital_object.py index 7fb61bd..8e8c207 100644 --- a/fairscape_models/digital_object.py +++ b/fairscape_models/digital_object.py @@ -7,7 +7,7 @@ class DigitalObject(BaseModel): """Base class for DigitalObject types (Dataset, Software, MLModel)""" guid: str = Field(alias="@id") name: str - metadataType: Optional[str] = Field(default=None, alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#DigitalObject"], alias="@type") author: Union[str, IdentifierValue, List[Union[str, IdentifierValue]]] description: str = Field(min_length=10) version: str = Field(default="0.1.0") diff --git a/fairscape_models/experiment.py b/fairscape_models/experiment.py index 600aee4..8feb738 100644 --- a/fairscape_models/experiment.py +++ b/fairscape_models/experiment.py @@ -4,7 +4,7 @@ from fairscape_models.activity import Activity class Experiment(Activity): - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Experiment", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Activity', "https://w3id.org/EVI#Experiment"], alias="@type") experimentType: str runBy: Union[str, IdentifierValue] datePerformed: str diff --git a/fairscape_models/instrument.py b/fairscape_models/instrument.py index fcbc9ec..fd6b9f5 100644 --- a/fairscape_models/instrument.py +++ b/fairscape_models/instrument.py @@ -5,7 +5,7 @@ class Instrument(BaseModel): guid: str = Field(alias="@id") name: str - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Instrument", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#Instrument"], alias="@type") manufacturer: str = Field(min_length=4) model: str description: str = Field(min_length=10) diff --git a/fairscape_models/mlmodel.py b/fairscape_models/mlmodel.py index 0b95bff..31e853e 100644 --- a/fairscape_models/mlmodel.py +++ b/fairscape_models/mlmodel.py @@ -5,7 +5,7 @@ from fairscape_models.digital_object import DigitalObject class MLModel(DigitalObject): - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#MLModel", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#MLModel"], alias="@type") additionalType: Optional[str] = Field(default=MLMODEL_TYPE) dateModified: Optional[str] = Field(default=None) fileFormat: str = Field(alias="format") diff --git a/fairscape_models/model_card.py b/fairscape_models/model_card.py index 960297e..0ee4aca 100644 --- a/fairscape_models/model_card.py +++ b/fairscape_models/model_card.py @@ -1,16 +1,18 @@ from typing import List, Optional, Union -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field, ConfigDict, model_validator from fairscape_models.fairscape_base import IdentifierValue +from fairscape_models.digital_object import DigitalObject -class ModelCard(BaseModel): +class ModelCard(DigitalObject): """Model Card for ML models as RO-Crate Dataset elements""" model_config = ConfigDict(extra="allow") guid: str = Field(alias="@id") - metadataType: Union[str, List[str]] = Field(alias="@type",default="https://w3id.org/EVI#MLModel") + + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#MLModel"], alias="@type") name: str description: str author: Union[str, List[str]] @@ -22,6 +24,7 @@ class ModelCard(BaseModel): modelFormat: Optional[Union[str, List[str]]] = Field(default=None) trainingDataset: Optional[Union[str, List[IdentifierValue]]] = Field(default=None) generatedBy: Optional[IdentifierValue] = Field(default=None) + derivedFrom: Optional[List[IdentifierValue]] = Field(default=[]) parameters: Optional[float] = Field(default=None) inputSize: Optional[str] = Field(default=None) @@ -37,3 +40,33 @@ class ModelCard(BaseModel): citation: Optional[str] = Field(default=None) isPartOf: Optional[List[IdentifierValue]] = Field(default=[]) + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields from EVI fields""" + + # Map generatedBy → prov:wasGeneratedBy + if self.generatedBy: + self.wasGeneratedBy = [self.generatedBy] + else: + self.wasGeneratedBy = [] + + if self.trainingDataset and self.derivedFrom == []: + if isinstance(self.trainingDataset, list): + self.derivedFrom = self.trainingDataset + else: + self.derivedFrom = [self.trainingDataset] + + # Map derivedFrom → prov:wasDerivedFrom + self.wasDerivedFrom = self.derivedFrom or [] + + # Map author → prov:wasAttributedTo + if self.author: + if isinstance(self.author, str): + self.wasAttributedTo = [self.author] + elif isinstance(self.author, list): + self.wasAttributedTo = [a for a in self.author] + else: + self.wasAttributedTo = [] + + return self \ No newline at end of file diff --git a/fairscape_models/rocrate.py b/fairscape_models/rocrate.py index b904933..523aee0 100644 --- a/fairscape_models/rocrate.py +++ b/fairscape_models/rocrate.py @@ -193,16 +193,13 @@ def normalize_type(type_str): item_type = item["@type"] if isinstance(item_type, list): - normalized_types = [normalize_type(t) for t in item_type] - if "ROCrate" in normalized_types or "Dataset" in normalized_types: - new_graph.append(ROCrateMetadataElem.model_validate(item)) - continue + item_type = item_type[-1] - elif isinstance(item_type, str): + if isinstance(item_type, str): normalized_type = normalize_type(item_type) model_class_to_use = type_map.get(normalized_type) - # If we found a specific class, use it. Let it raise a + # If we found a specific class, use it. if model_class_to_use: new_graph.append(model_class_to_use.model_validate(item)) # Only if no specific class was matched, use the generic one. diff --git a/fairscape_models/sample.py b/fairscape_models/sample.py index b3d88df..6278c67 100644 --- a/fairscape_models/sample.py +++ b/fairscape_models/sample.py @@ -5,7 +5,7 @@ class Sample(BaseModel): guid: str = Field(alias="@id") name: str - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Sample", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#Sample"], alias="@type") author: Union[str, List[str]] description: str = Field(min_length=1) keywords: List[str] = Field(...) diff --git a/fairscape_models/software.py b/fairscape_models/software.py index 2531b42..7d6ad1a 100644 --- a/fairscape_models/software.py +++ b/fairscape_models/software.py @@ -1,11 +1,11 @@ from pydantic import Field, ConfigDict, model_validator -from typing import Optional, List +from typing import Optional, List, Union from fairscape_models.fairscape_base import IdentifierValue, SOFTWARE_TYPE from fairscape_models.digital_object import DigitalObject class Software(DigitalObject): - metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Software", alias="@type") + metadataType: Optional[Union[List[str], str]] = Field(default=['prov:Entity', "https://w3id.org/EVI#Software"], alias="@type") additionalType: Optional[str] = Field(default=SOFTWARE_TYPE) dateModified: Optional[str] = None fileFormat: str = Field(title="fileFormat", alias="format") diff --git a/tests/test_model_card.py b/tests/test_model_card.py new file mode 100644 index 0000000..de78f14 --- /dev/null +++ b/tests/test_model_card.py @@ -0,0 +1,141 @@ +import pytest +from pydantic import ValidationError +from fairscape_models.model_card import ModelCard +from fairscape_models.fairscape_base import IdentifierValue + +@pytest.fixture +def model_card_minimal_data(): + """Minimal data for a valid ModelCard.""" + return { + "@id": "ark:59852/test-model-card", + "name": "Test Model Card", + "author": "Test Model Card Author", + "description": "This is a test model card with sufficient description.", + "keywords": ["machine learning", "test"], + "version": "1.0.0" + } + +def test_model_card_instantiation(model_card_minimal_data): + """Test successful instantiation of a ModelCard model.""" + model_card = ModelCard.model_validate(model_card_minimal_data) + assert model_card.guid == model_card_minimal_data["@id"] + assert model_card.name == model_card_minimal_data["name"] + + # Test PROV field auto-population + assert len(model_card.wasAttributedTo) == 1 + assert isinstance(model_card.wasAttributedTo[0], str) + assert model_card.wasAttributedTo[0] == model_card_minimal_data["author"] + +def test_model_card_missing_required_field(model_card_minimal_data): + """Test ValidationError for missing a required field.""" + del model_card_minimal_data["author"] + with pytest.raises(ValidationError): + ModelCard.model_validate(model_card_minimal_data) + +def test_model_card_with_multiple_authors(model_card_minimal_data): + """Test PROV field population with multiple authors.""" + model_card_minimal_data["author"] = ["Card Author 1", "Card Author 2"] + + model_card = ModelCard.model_validate(model_card_minimal_data) + + # Test PROV:wasAttributedTo handles list of authors + assert len(model_card.wasAttributedTo) == 2 + assert all(isinstance(item, str) for item in model_card.wasAttributedTo) + author_ids = [item for item in model_card.wasAttributedTo] + assert "Card Author 1" in author_ids + assert "Card Author 2" in author_ids + +def test_model_card_with_generated_by_single(model_card_minimal_data): + """Test PROV field population with single generatedBy.""" + model_card_minimal_data["generatedBy"] = {"@id": "ark:59852/computation-1"} + + model_card = ModelCard.model_validate(model_card_minimal_data) + + # Test PROV:wasGeneratedBy with single value + assert len(model_card.wasGeneratedBy) == 1 + assert isinstance(model_card.wasGeneratedBy[0], IdentifierValue) + assert model_card.wasGeneratedBy[0].guid == "ark:59852/computation-1" + + +def test_model_card_with_training_dataset_as_string(model_card_minimal_data): + """Test PROV field population with trainingDataset as string.""" + model_card_minimal_data["trainingDataset"] = "ark:59852/training-data" + + model_card = ModelCard.model_validate(model_card_minimal_data) + + # Test trainingDataset maps to derivedFrom and wasDerivedFrom + assert len(model_card.derivedFrom) == 1 + assert model_card.derivedFrom[0] == "ark:59852/training-data" + assert len(model_card.wasDerivedFrom) == 1 + +def test_model_card_with_training_dataset_as_list(model_card_minimal_data): + """Test PROV field population with trainingDataset as list.""" + model_card_minimal_data["trainingDataset"] = [ + {"@id": "ark:59852/training-data-1"}, + {"@id": "ark:59852/training-data-2"} + ] + + model_card = ModelCard.model_validate(model_card_minimal_data) + + # Test trainingDataset maps to derivedFrom and wasDerivedFrom + assert len(model_card.derivedFrom) == 2 + assert all(isinstance(item, IdentifierValue) for item in model_card.derivedFrom) + assert len(model_card.wasDerivedFrom) == 2 + +def test_model_card_with_derived_from(model_card_minimal_data): + """Test PROV field population with derivedFrom.""" + model_card_minimal_data["derivedFrom"] = [ + {"@id": "ark:59852/model-source"} + ] + + model_card = ModelCard.model_validate(model_card_minimal_data) + + # Test PROV:wasDerivedFrom + assert len(model_card.wasDerivedFrom) == 1 + assert isinstance(model_card.wasDerivedFrom[0], IdentifierValue) + assert model_card.wasDerivedFrom[0].guid == "ark:59852/model-source" + +def test_model_card_derived_from_takes_precedence(model_card_minimal_data): + """Test that derivedFrom takes precedence over trainingDataset.""" + model_card_minimal_data["trainingDataset"] = [{"@id": "ark:59852/training-data"}] + model_card_minimal_data["derivedFrom"] = [{"@id": "ark:59852/model-source"}] + + model_card = ModelCard.model_validate(model_card_minimal_data) + + # derivedFrom should remain as specified, not be overwritten by trainingDataset + assert len(model_card.derivedFrom) == 1 + assert model_card.derivedFrom[0].guid == "ark:59852/model-source" + +def test_model_card_edge_case_empty_author(): + """Test PROV field population when author is falsy (defensive code path).""" + # Test with empty list for author (valid but falsy) + model_card_data = { + "@id": "ark:59852/test-model-card", + "name": "Test Model Card", + "author": [], + "description": "This is a test model card with sufficient description.", + "keywords": ["test"], + "version": "1.0.0" + } + + model_card = ModelCard.model_validate(model_card_data) + + # Should hit the else clause and set wasAttributedTo to empty list + assert model_card.wasAttributedTo == [] + +def test_model_card_edge_case_no_generated_by(): + """Test PROV field population when generatedBy is None.""" + model_card_data = { + "@id": "ark:59852/test-model-card", + "name": "Test Model Card", + "author": "Test Author", + "description": "This is a test model card with sufficient description.", + "keywords": ["test"], + "version": "1.0.0", + "generatedBy": None + } + + model_card = ModelCard.model_validate(model_card_data) + + # Should set wasGeneratedBy to empty list + assert model_card.wasGeneratedBy == []