diff --git a/README.md b/README.md index 6ff267db..f53ec3a5 100644 --- a/README.md +++ b/README.md @@ -341,6 +341,36 @@ media.claims.add(Item(prop_nr='P180', value='Q3146211')) media.write() ``` +## Entity validation +Two different validators for entities are available. + +### Entityshape +This is in a beta state. For simple entity schemas it has proven reliable. + +See https://github.com/dpriskorn/entityshape#limitations for a list of limitations. + +```python +from wikibaseintegrator import WikibaseIntegrator + +wbi = WikibaseIntegrator() +item = wbi.item.get('Q1') +result = item.entityshape_schema_validator(entity_schema_id="E1") +print(result) +``` + +### PyShex +This is considered to be in a highly experimental/alpha state. +We have not yet been able to successfully validate an item or lexeme with this library. + +```python +from wikibaseintegrator import WikibaseIntegrator + +wbi = WikibaseIntegrator() +item = wbi.item.get('Q1') +result = item.pyshex_schema_validator(entity_schema_id="E1") +print(result) +``` + # More than Wikibase # WikibaseIntegrator natively supports some extensions: diff --git a/requirements.txt b/requirements.txt index 4f33e295..ffcd737e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,7 @@ oauthlib~=3.2.2 requests~=2.31.0 requests-oauthlib~=1.3.1 ujson~=5.8.0 +entityshape~=0.1.0 +pyshex~=0.8.1 +rdflib~=6.3.2 +pydantic~=1.10.9 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index cb904dd2..8a465ad6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,6 +40,7 @@ install_requires = requests>=2.27.1,<2.29.0 requests-oauthlib~=1.3.1 ujson>=5.4,<5.6 + entityshape~=0.1.0 python_requires = >=3.8, <3.13 [options.extras_require] diff --git a/setup.py b/setup.py index e2d48c03..92beea53 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,8 @@ "oauthlib ~= 3.2.0", "requests >= 2.27.1,< 2.32.0", "requests-oauthlib ~= 1.3.1", - "ujson >= 5.4,< 5.9" + "ujson >= 5.4,< 5.9", + "entityshape ~= 0.1.0" ], extras_require={ "dev": [ diff --git
a/test/test_entity_item.py b/test/test_entity_item.py index a83ed3aa..7ee17a0a 100644 --- a/test/test_entity_item.py +++ b/test/test_entity_item.py @@ -107,3 +107,53 @@ def test_new_lines(self): item.claims.add(MonolingualText(prop_nr=123, text="Multi\r\nline")) item.claims.add(MonolingualText(prop_nr=123, text="Multi\rline")) item.claims.add(MonolingualText(prop_nr=123, text="Multi\nline")) + + def test_entityshape_entity_validation(self): + random_campsite = wbi.item.get('Q119156070') + assert random_campsite.entityshape_schema_validator(entity_schema_id="E376").is_valid + assert random_campsite.entityshape_schema_validator(entity_schema_id="376").is_valid + assert random_campsite.entityshape_schema_validator(entity_schema_id=376).is_valid + assert not wbi.item.get('Q582').entityshape_schema_validator(entity_schema_id="E376").is_valid + + def test_pyshex_entity_validation(self): + # TODO find a combination of shex and entity that is valid + # danish noun + result = wbi.lexeme.get('L41172').pyshex_schema_validator(entity_schema_id="E34") + assert result.valid is False + # This error makes no sense TODO report upstream to pyshex + assert result.reason == 'Import failure on https://www.wikidata.org/wiki/Special:EntitySchemaText/E68' + random_campsite = wbi.item.get('Q119156070') + result = random_campsite.pyshex_schema_validator(entity_schema_id="E376") + assert result.valid is False + assert result.reason == (' Testing wd:Q119156070 against shape campsite\n' + ' Datatype constraint (http://www.w3.org/2001/XMLSchema#string) does not ' + 'match URIRef ' + '\n' + ' Testing wd:Q119156070 against shape campsite\n' + ' Datatype constraint (http://www.w3.org/2001/XMLSchema#string) does not ' + 'match URIRef ' + '\n' + ' Testing wd:Q119156070 against shape campsite\n' + ' No matching triples found for predicate wdt:P31') + assert random_campsite.pyshex_schema_validator(entity_schema_id="376").valid is False + assert 
random_campsite.pyshex_schema_validator(entity_schema_id=376).valid is False + result2 = wbi.item.get('Q582').pyshex_schema_validator(entity_schema_id="E376") + assert not result2.valid + assert result2.reason == (' Testing wd:Q582 against shape campsite\n' + ' Triples:\n' + ' wd:Q582 wdt:P31 wd:Q1549591 .\n' + ' wd:Q582 wdt:P31 wd:Q484170 .\n' + ' 2 triples exceeds max {1,1}\n' + ' Testing wd:Q582 against shape campsite\n' + ' Node: wd:Q1549591 not in value set:\n' + '\t {"values": ["http://www.wikidata.org/entity/Q832778", "http:...\n' + ' Testing wd:Q582 against shape campsite\n' + ' Node: wd:Q484170 not in value set:\n' + '\t {"values": ["http://www.wikidata.org/entity/Q832778", "http:...\n' + ' Testing wd:Q582 against shape campsite\n' + ' Triples:\n' + ' wd:Q582 wdt:P31 wd:Q1549591 .\n' + ' wd:Q582 wdt:P31 wd:Q484170 .\n' + ' 2 triples exceeds max {1,1}\n' + ' Testing wd:Q582 against shape campsite\n' + ' No matching triples found for predicate wdt:P31') diff --git a/test/test_entityshape.py b/test/test_entityshape.py new file mode 100644 index 00000000..d0d12dfa --- /dev/null +++ b/test/test_entityshape.py @@ -0,0 +1,52 @@ +from unittest import TestCase + +from entityshape import EntityShape + +from wikibaseintegrator import WikibaseIntegrator +from wikibaseintegrator.wbi_config import config as wbi_config +from wikibaseintegrator.wbi_helpers import execute_sparql_query + +wbi_config['USER_AGENT'] = 'WikibaseIntegrator-pytest/1.0 (test_entity_item.py)' +wbi = WikibaseIntegrator() +class TestEntityShape(TestCase): + def test_validate_one_item(self): + item = wbi.item.get("Q96620548") + #item.validate(eid="E1", lang="en") + e = EntityShape(qid=item.id, eid="E376", lang="en") + result = e.get_result() + assert result.is_valid is False + assert result.required_properties_that_are_missing == ["P137"] + + def test_validate_all_campsite_shelter_items(self): + # This query was build in a few seconds using https://query.wikidata.org/querybuilder/?uselang=en :) + 
results = execute_sparql_query(""" + SELECT DISTINCT ?item ?itemLabel WHERE { + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". } + { + SELECT DISTINCT ?item WHERE { + ?item p:P31 ?statement0. + ?statement0 (ps:P31/(wdt:P279*)) wd:Q96620652. + } + LIMIT 100 + } + } + """) + bindings = results["results"]["bindings"] + print(f"Found {len(bindings)} results") + count = 1 + for result in bindings: + qid = result["itemLabel"]["value"] + print(f"Working on: {qid}") + #print(result) + item = wbi.item.get(qid) + e = EntityShape(qid=item.id, eid="E376", lang="en") + result = e.get_result() + # Ignore the invalid shelters missing an operator P137 + if result.is_valid is False and result.required_properties_that_are_missing == {"P137"}: + print("Skipping campsite only missing and operator") + elif result.is_valid is True: + print("Skipping valid campsite - they are boring!") + else: + print(f"is_valid: {result.is_valid}, required_properties_that_are_missing:{result.required_properties_that_are_missing}, statements_with_property_that_is_not_allowed:{result.statements_with_property_that_is_not_allowed}, properties_with_too_many_statements:{result.properties_with_too_many_statements}, see {item.get_entity_url()}") + # assert result.is_valid is False + # assert result.required_properties_that_are_missing == ["P137"] diff --git a/wikibaseintegrator/entities/baseentity.py b/wikibaseintegrator/entities/baseentity.py index b032da62..437a892b 100644 --- a/wikibaseintegrator/entities/baseentity.py +++ b/wikibaseintegrator/entities/baseentity.py @@ -1,14 +1,23 @@ from __future__ import annotations import logging +import re from copy import copy from typing import TYPE_CHECKING, Any +import requests +from entityshape import EntityShape, Result +from pydantic import BaseModel +from pyshex import ShExEvaluator +from pyshex.shex_evaluator import EvaluationResult +from rdflib import Graph + from wikibaseintegrator import wbi_fastrun from 
wikibaseintegrator.datatypes import BaseDataType from wikibaseintegrator.models.claims import Claim, Claims +from wikibaseintegrator.wbi_config import config from wikibaseintegrator.wbi_enums import ActionIfExists -from wikibaseintegrator.wbi_exceptions import MissingEntityException +from wikibaseintegrator.wbi_exceptions import EntitySchemaDownloadError, MissingEntityException, TtlDownloadError from wikibaseintegrator.wbi_helpers import delete_page, edit_entity, mediawiki_api_call_helper from wikibaseintegrator.wbi_login import _Login @@ -18,6 +27,17 @@ log = logging.getLogger(__name__) +class PyshexResult(BaseModel): + reason: str + valid: bool + + def __str__(self): + return ( + f"Valid: {self.valid}\n" + f"Reason: {self.reason}" + ) + + class BaseEntity: ETYPE = 'base-entity' subclasses: list[type[BaseEntity]] = [] @@ -299,6 +319,93 @@ def get_entity_url(self, wikibase_url: str | None = None) -> str: raise ValueError('wikibase_url or entity ID is null.') + def _get_valid_entity_schema_id(self, entity_schema_id) -> str: + if isinstance(entity_schema_id, str): + pattern = re.compile(r'^(?:[a-zA-Z]+:)?E?([0-9]+)$') + matches = pattern.match(entity_schema_id) + + if not matches: + raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'") + + entity_schema_id = f'E{matches.group(1)}' + elif isinstance(entity_schema_id, int): + entity_schema_id = f'E{entity_schema_id}' + else: + raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'") + return entity_schema_id + + def _get_ttl_data(self) -> str: + """Download the entity data in turtle format (ttl)""" + api_endpoint = 'https://www.wikidata.org/wiki/Special:EntityData/' + api_url = f'{api_endpoint}{self.id}.ttl' + # TODO fix timeout + response = requests.get(api_url, timeout=10) + if response.status_code == 200: + return response.text + else: + raise TtlDownloadError() + + def _get_schema_text(self, entity_schema_id) -> str: + """ + Downloads the 
schema from wikidata + + :param entity_schema_id: the entityschema id to be downloaded + """ + url: str = f"https://www.wikidata.org/wiki/EntitySchema:{entity_schema_id}?action=raw" + response = requests.get(url, timeout=10) + if response.status_code == 200: + json_text: dict = response.json() + return json_text["schemaText"] + else: + raise EntitySchemaDownloadError() + + # TODO make an interface for the validator so the user + # does not have to think about how the internals of the validators work + # The users should get similar output no matter which validator they choose + def entityshape_schema_validator(self, entity_schema_id: str, language: str | None = None) -> Result: + entity_schema_id = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id) + language = str(language or config['DEFAULT_LANGUAGE']) + return EntityShape(qid=self.id, eid=entity_schema_id, lang=language).validate_and_get_result() + + def pyshex_schema_validator(self, entity_schema_id: str) -> PyshexResult: + entity_schema_id = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id) + return self._check_shex_conformance(entity_schema_id=entity_schema_id) + + def _check_shex_conformance(self, entity_schema_id: str= "", data: str= "") -> PyshexResult: + """ + Static method which can be used to check for conformance of a Wikidata item to an EntitySchema any SPARQL query + + :param entity_schema_id: The URI prefixes required for an endpoint, default is the Wikidata specific prefixes + :param data: Turtle data to be validated (Optional) + :return: The results of the query are an instance of PyshexResult + """ + # load the string of ttl data into a rdf graph to please ShExEvaluator + rdfdata = Graph() + if not data: + # This downloads the ttl data + data = self._get_ttl_data() + # print(data) + # exit() + rdfdata.parse(data=data) + else: + rdfdata.parse(data=data) + for result in ShExEvaluator(rdf=rdfdata, schema=self._get_schema_text(entity_schema_id=entity_schema_id), 
focus=f"http://www.wikidata.org/entity/{self.id}").evaluate(): + result: EvaluationResult + # convert named tuple to pydantic class which is way nicer + # class EvaluationResult(NamedTuple): + # result: bool + # focus: Optional[URIRef] + # start: Optional[URIRef] + # reason: Optional[str] + # We return early because we expect only one result from ShExEvaluator + return PyshexResult( + valid=result[0], + # We ignore these for now as they seem overcomplicated + #focus=result[1], + #start=result[2], + reason=result[3], + ) + def __repr__(self): """A mixin implementing a simple __repr__.""" return "<{klass} @{id:x} {attrs}>".format( # pylint: disable=consider-using-f-string diff --git a/wikibaseintegrator/wbi_exceptions.py b/wikibaseintegrator/wbi_exceptions.py index 8440f8fc..ac312286 100644 --- a/wikibaseintegrator/wbi_exceptions.py +++ b/wikibaseintegrator/wbi_exceptions.py @@ -99,3 +99,11 @@ class MissingEntityException(Exception): class SearchError(Exception): pass + + +class TtlDownloadError(BaseException): + pass + + +class EntitySchemaDownloadError(BaseException): + pass