diff --git a/README.md b/README.md index 7baadb18..f6a4cc66 100644 --- a/README.md +++ b/README.md @@ -869,7 +869,15 @@ for entrez_id, ensembl in raw_data.items(): Note: Fastrun mode checks for equality of property/value pairs, qualifiers (not including qualifier attributes), labels, aliases and description, but it ignores references by default! -References can be checked in fast run mode by setting `use_refs` to `True`. +References can be checked in fast run mode by setting `use_references` to `True`. + +# Statistics # + +| Dataset | partial fastrun | fastrun without qualifiers/references | fastrun with qualifiers | fastrun with qualifiers/references | +|:----------------------------|----------------:|--------------------------------------:|------------------------:|-----------------------------------:| +| Communes (34990 elements) | ? | 7min | 30s | 60s | +| Cantons (2042 elements) | ? | ? | ? | ? | +| Départements (100 elements) | 70min | 1s | 30s | 60s | # Debugging # diff --git a/pyproject.toml b/pyproject.toml index 3a85ae01..5b135170 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,3 +116,4 @@ disable = [ [tool.pytest.ini_options] log_cli = true +log_cli_level = 'DEBUG' diff --git a/test/test_all.py b/test/test_all.py index e9d2c455..68ac8c52 100644 --- a/test/test_all.py +++ b/test/test_all.py @@ -1,12 +1,12 @@ import copy +import logging import unittest from wikibaseintegrator import WikibaseIntegrator, datatypes, wbi_fastrun from wikibaseintegrator.datatypes import BaseDataType, Item from wikibaseintegrator.entities import ItemEntity from wikibaseintegrator.wbi_config import config as wbi_config -from wikibaseintegrator.wbi_enums import ActionIfExists, WikibaseDatatype -from wikibaseintegrator.wbi_fastrun import get_fastrun_container +from wikibaseintegrator.wbi_enums import WikibaseDatatype wbi_config['USER_AGENT'] = 'WikibaseIntegrator-pytest/1.0 (test_all.py)' @@ -53,66 +53,22 @@ class TestFastRun(unittest.TestCase): """ some basic tests for 
fastrun mode """ + logging.basicConfig(level=logging.DEBUG) def test_fastrun(self): statements = [ - datatypes.ExternalID(value='P40095', prop_nr='P352'), + datatypes.ExternalID(value='A0A023PZB3', prop_nr='P352'), datatypes.ExternalID(value='YER158C', prop_nr='P705') ] frc = wbi_fastrun.FastRunContainer(base_filter=[BaseDataType(prop_nr='P352'), datatypes.Item(prop_nr='P703', value='Q27510868')], base_data_type=datatypes.BaseDataType) - fastrun_result = frc.write_required(data=statements) - - if fastrun_result: - message = 'fastrun failed' - else: - message = 'successful fastrun' + fastrun_result = frc.write_required(claims=statements) # here, fastrun should succeed, if not, test failed - if fastrun_result: + if not fastrun_result: raise ValueError - def test_fastrun_label(self): - # tests fastrun label, description and aliases, and label in another language - frc = get_fastrun_container(base_filter=[datatypes.ExternalID(value='/m/02j71', prop_nr='P646')]) - item = WikibaseIntegrator().item.get('Q2') - - assert item.labels.get(language='en') == "Earth" - descr = item.descriptions.get(language='en') - assert len(descr) > 3 - assert "Planet Earth" in item.aliases.get() - - assert list(frc.get_language_data("Q2", 'en', 'label'))[0] == item.labels.get(language='en') - assert frc.check_language_data("Q2", ['not the Earth'], 'en', 'label') - assert "Planet Earth" in item.aliases.get() - assert "planet" in item.descriptions.get() - - assert item.labels.get('es') == "Tierra" - - item.descriptions.set(value=descr) - item.descriptions.set(value="fghjkl") - assert item.get_json()['descriptions']['en'] == {'language': 'en', 'value': 'fghjkl'} - item.labels.set(value="Earth") - item.labels.set(value="xfgfdsg") - assert item.get_json()['labels']['en'] == {'language': 'en', 'value': 'xfgfdsg'} - item.aliases.set(values=["fake alias"], action_if_exists=ActionIfExists.APPEND_OR_REPLACE) - assert {'language': 'en', 'value': 'fake alias'} in item.get_json()['aliases']['en'] - - # 
something that's empty (for now.., can change, so this just makes sure no exception is thrown) - frc.check_language_data("Q2", ['Ewiase'], 'ak', 'label') - frc.check_language_data("Q2", ['not Ewiase'], 'ak', 'label') - frc.check_language_data("Q2", [''], 'ak', 'description') - frc.check_language_data("Q2", [], 'ak', 'aliases') - frc.check_language_data("Q2", ['sdf', 'sdd'], 'ak', 'aliases') - - item.labels.get(language="ak") - item.descriptions.get(language='ak') - item.aliases.get(language="ak") - item.labels.set(value="label", language="ak") - item.descriptions.set(value="d", language="ak") - item.aliases.set(values=["a"], language="ak", action_if_exists=ActionIfExists.APPEND_OR_REPLACE) - def test_sitelinks(): item = wbi.item.get('Q622901') diff --git a/test/test_entity_item.py b/test/test_entity_item.py index 1c510c6a..fe12e7dd 100644 --- a/test/test_entity_item.py +++ b/test/test_entity_item.py @@ -51,18 +51,18 @@ def test_write(self): def test_write_not_required(self): assert not wbi.item.get('Q582').write_required(base_filter=[BaseDataType(prop_nr='P1791')]) + def test_write_not_required_ref(self): + assert not wbi.item.get('Q582').write_required(base_filter=[BaseDataType(prop_nr='P1464')], use_references=True) + def test_write_required(self): item = wbi.item.get('Q582') item.claims.add(Item(prop_nr='P1791', value='Q42')) assert item.write_required([BaseDataType(prop_nr='P1791')]) - def test_write_not_required_ref(self): - assert not wbi.item.get('Q582').write_required(base_filter=[BaseDataType(prop_nr='P2581')], use_refs=True) - def test_write_required_ref(self): item = wbi.item.get('Q582') - item.claims.get('P2581')[0].references.references.pop() - assert item.write_required(base_filter=[BaseDataType(prop_nr='P2581')], use_refs=True) + item.claims.get('P1464')[0].references.references.pop() + assert item.write_required(base_filter=[BaseDataType(prop_nr='P1464')], use_references=True) def test_long_item_id(self): assert wbi.item.get('Item:Q582').id == 
'Q582' diff --git a/test/test_wbi_core.py b/test/test_wbi_core.py index e9db809f..8022463c 100644 --- a/test/test_wbi_core.py +++ b/test/test_wbi_core.py @@ -243,7 +243,7 @@ def test_new_item_creation(self): MonolingualText(text='xxx', language='fr', prop_nr='P7'), Quantity(amount=-5.04, prop_nr='P8'), Quantity(amount=5.06, upper_bound=9.99, lower_bound=-2.22, unit='Q11573', prop_nr='P8'), - CommonsMedia(value='xxx', prop_nr='P9'), + CommonsMedia(value="Place lazare goujon.jpg", prop_nr='P9'), GlobeCoordinate(latitude=1.2345, longitude=-1.2345, precision=12, prop_nr='P10'), GeoShape(value='Data:xxx.map', prop_nr='P11'), Property(value='P123', prop_nr='P12'), diff --git a/test/test_wbi_fastrun.py b/test/test_wbi_fastrun.py deleted file mode 100644 index 92c379a5..00000000 --- a/test/test_wbi_fastrun.py +++ /dev/null @@ -1,211 +0,0 @@ -from collections import defaultdict -from typing import Any - -from wikibaseintegrator import WikibaseIntegrator, wbi_fastrun -from wikibaseintegrator.datatypes import BaseDataType, ExternalID, Item -from wikibaseintegrator.wbi_config import config as wbi_config -from wikibaseintegrator.wbi_enums import ActionIfExists - -wbi_config['USER_AGENT'] = 'WikibaseIntegrator-pytest/1.0 (test_wbi_fastrun.py)' - -wbi = WikibaseIntegrator() - - -def test_query_data(): - """ - test_fastrun.test_query_data - This hits live wikidata and may change !! 
- - This tests that the fast run container correctly queries data from wikidata and stores it in the appropriate format - without getting references - """ - frc = wbi_fastrun.FastRunContainer(base_filter=[BaseDataType(prop_nr='P699')], base_data_type=BaseDataType) - # get a string value - frc._query_data('P699') - # wikidata-item value - frc._query_data('P828') - # uri value - frc._query_data('P2888') - - # https://www.wikidata.org/wiki/Q10874 - assert 'Q10874' in frc.prop_data - assert 'P699' in frc.prop_data['Q10874'] - # the ID may change, so retrieve it - statement_id = list(frc.prop_data['Q10874']['P699'].keys())[0] - d = frc.prop_data['Q10874']['P699'][statement_id] - # d looks like: {'qual': set(), 'ref': {}, 'v': 'DOID:1432'} - assert all(x in d for x in {'qual', 'ref', 'v'}) - assert frc.prop_data['Q10874']['P699'][statement_id]['v'].startswith('"DOID:') - - # item - assert list(frc.prop_data['Q10874']['P828'].values())[0]['v'] == "Q18228398" - - # uri - v = {x['v'] for x in frc.prop_data['Q10874']['P2888'].values()} - assert all(y.startswith(" 0 - ref_id = list(d['ref'].keys())[0] - ref = d['ref'][ref_id] - assert len(ref) > 1 - - -class FastRunContainerFakeQueryDataEnsembl(wbi_fastrun.FastRunContainer): - def __init__(self, *args: Any, **kwargs: Any): - super().__init__(*args, **kwargs) - self.prop_dt_map = {'P248': 'wikibase-item', 'P594': 'external-id'} - self.prop_data['Q14911732'] = {'P594': { - 'fake statement id': { - 'qual': set(), - 'ref': {'fake ref id': { - ('P248', - 'Q106833387'), - ('P594', - 'ENSG00000123374')}}, - 'unit': '1', - 'v': '"ENSG00000123374"'}}} - self.rev_lookup = defaultdict(set) - self.rev_lookup['"ENSG00000123374"'].add('Q14911732') - - -class FastRunContainerFakeQueryDataEnsemblNoRef(wbi_fastrun.FastRunContainer): - def __init__(self, *args: Any, **kwargs: Any): - super().__init__(*args, **kwargs) - self.prop_dt_map = {'P248': 'wikibase-item', 'P594': 'external-id'} - self.prop_data['Q14911732'] = {'P594': { - 'fake 
statement id': { - 'qual': set(), - 'ref': {}, - 'v': 'ENSG00000123374'}}} - self.rev_lookup = defaultdict(set) - self.rev_lookup['"ENSG00000123374"'].add('Q14911732') - - -def test_fastrun_ref_ensembl(): - # fastrun checks refs - frc = FastRunContainerFakeQueryDataEnsembl(base_filter=[BaseDataType(prop_nr='P594'), Item(prop_nr='P703', value='Q15978631')], base_data_type=BaseDataType, use_refs=True) - - # statement has no ref - statements = [ExternalID(value='ENSG00000123374', prop_nr='P594')] - assert frc.write_required(data=statements) - - # statement has the same ref - statements = [ExternalID(value='ENSG00000123374', prop_nr='P594', references=[[Item("Q106833387", prop_nr="P248"), ExternalID("ENSG00000123374", prop_nr="P594")]])] - assert not frc.write_required(data=statements) - - # new statement has an different stated in - statements = [ExternalID(value='ENSG00000123374', prop_nr='P594', references=[[Item("Q99999999999", prop_nr="P248"), ExternalID("ENSG00000123374", prop_nr="P594", )]])] - assert frc.write_required(data=statements) - - # fastrun don't check references, statement has no reference, - frc = FastRunContainerFakeQueryDataEnsemblNoRef(base_filter=[BaseDataType(prop_nr='P594'), Item(prop_nr='P703', value='Q15978631')], base_data_type=BaseDataType, - use_refs=False) - statements = [ExternalID(value='ENSG00000123374', prop_nr='P594')] - assert not frc.write_required(data=statements) - - # fastrun don't check references, statement has reference, - frc = FastRunContainerFakeQueryDataEnsemblNoRef(base_filter=[BaseDataType(prop_nr='P594'), Item(prop_nr='P703', value='Q15978631')], base_data_type=BaseDataType, - use_refs=False) - statements = [ExternalID(value='ENSG00000123374', prop_nr='P594', references=[[Item("Q123", prop_nr="P31")]])] - assert not frc.write_required(data=statements) - - -class FakeQueryDataAppendProps(wbi_fastrun.FastRunContainer): - # an item with three values for the same property - def __init__(self, *args: Any, **kwargs: Any): - 
super().__init__(*args, **kwargs) - self.prop_dt_map = {'P527': 'wikibase-item', 'P248': 'wikibase-item', 'P594': 'external-id'} - - self.rev_lookup = defaultdict(set) - self.rev_lookup['Q24784025'].add('Q3402672') - self.rev_lookup['Q24743729'].add('Q3402672') - self.rev_lookup['Q24782625'].add('Q3402672') - - self.prop_data['Q3402672'] = {'P527': { - 'Q3402672-11BA231B-857B-498B-AC4F-91D71EE007FD': {'qual': set(), - 'ref': { - '149c9c7ba4e246d9f09ce3ed0cdf7aa721aad5c8': { - ('P248', 'Q3047275'), - }}, - 'v': 'Q24784025'}, - 'Q3402672-15F54AFF-7DCC-4DF6-A32F-73C48619B0B2': {'qual': set(), - 'ref': { - '149c9c7ba4e246d9f09ce3ed0cdf7aa721aad5c8': { - ('P248', 'Q3047275'), - }}, - 'v': 'Q24743729'}, - 'Q3402672-C8F11D55-1B11-44E5-9EAF-637E062825A4': {'qual': set(), - 'ref': { - '149c9c7ba4e246d9f09ce3ed0cdf7aa721aad5c8': { - ('P248', 'Q3047275')}}, - 'v': 'Q24782625'}}} - - -def test_append_props(): - qid = 'Q3402672' - # https://www.wikidata.org/wiki/Q3402672#P527 - - # don't consider refs - frc = FakeQueryDataAppendProps(base_filter=[BaseDataType(prop_nr='P352'), Item(prop_nr='P703', value='Q15978631')], base_data_type=BaseDataType) - # with append - statements = [Item(value='Q24784025', prop_nr='P527')] - assert frc.write_required(data=statements, action_if_exists=ActionIfExists.APPEND_OR_REPLACE, cqid=qid) is False - # with force append - statements = [Item(value='Q24784025', prop_nr='P527')] - assert frc.write_required(data=statements, action_if_exists=ActionIfExists.FORCE_APPEND, cqid=qid) is True - # without append - statements = [Item(value='Q24784025', prop_nr='P527')] - assert frc.write_required(data=statements, cqid=qid) is True - - # if we are in append mode, and the refs are different, we should write - frc = FakeQueryDataAppendProps(base_filter=[BaseDataType(prop_nr='P352'), Item(prop_nr='P703', value='Q15978631')], base_data_type=BaseDataType, use_refs=True) - # with append - statements = [Item(value='Q24784025', prop_nr='P527')] - assert 
frc.write_required(data=statements, cqid=qid, action_if_exists=ActionIfExists.APPEND_OR_REPLACE) is True - # without append - statements = [Item(value='Q24784025', prop_nr='P527')] - assert frc.write_required(data=statements, cqid=qid) is True diff --git a/wikibaseintegrator/datatypes/basedatatype.py b/wikibaseintegrator/datatypes/basedatatype.py index 0db1c3d7..f5b99a92 100644 --- a/wikibaseintegrator/datatypes/basedatatype.py +++ b/wikibaseintegrator/datatypes/basedatatype.py @@ -11,6 +11,7 @@ class BaseDataType(Claim): The base class for all Wikibase data types, they inherit from it """ DTYPE = 'base-data-type' + PTYPE = 'property-data-type' subclasses: list[type[BaseDataType]] = [] sparql_query: str = ''' SELECT * WHERE {{ @@ -28,7 +29,14 @@ def __init__(self, prop_nr: int | str | None = None, **kwargs: Any): super().__init__(**kwargs) - self.mainsnak.property_number = prop_nr or None + if isinstance(prop_nr, str): + pattern = re.compile(r'^([a-z][a-z\d+.-]*):([^][<>\"\x00-\x20\x7F])+$') + matches = pattern.match(str(prop_nr)) + + if matches: + prop_nr = prop_nr.rsplit('/', 1)[-1] + + self.mainsnak.property_number = prop_nr # self.subclasses.append(self) # Allow registration of subclasses of BaseDataType into BaseDataType.subclasses @@ -39,7 +47,7 @@ def __init_subclass__(cls, **kwargs): def set_value(self, value: Any | None = None): pass - def get_sparql_value(self) -> str: + def get_sparql_value(self, **kwargs: Any) -> str | None: return '"' + self.mainsnak.datavalue['value'] + '"' def parse_sparql_value(self, value, type='literal', unit='1') -> bool: @@ -61,3 +69,6 @@ def parse_sparql_value(self, value, type='literal', unit='1') -> bool: raise ValueError return True + + def from_sparql_value(self, sparql_value: dict) -> BaseDataType: # type: ignore + pass diff --git a/wikibaseintegrator/datatypes/commonsmedia.py b/wikibaseintegrator/datatypes/commonsmedia.py index c444437d..18fbec19 100644 --- a/wikibaseintegrator/datatypes/commonsmedia.py +++ 
b/wikibaseintegrator/datatypes/commonsmedia.py @@ -1,17 +1,31 @@ import re import urllib.parse +from typing import Optional -from wikibaseintegrator.datatypes.string import String +from wikibaseintegrator.datatypes.url import URL -class CommonsMedia(String): +class CommonsMedia(URL): """ Implements the Wikibase data type for Wikimedia commons media files """ DTYPE = 'commonsMedia' + PTYPE = 'http://wikiba.se/ontology#CommonsMedia' - def get_sparql_value(self) -> str: - return '<' + self.mainsnak.datavalue['value'] + '>' + def set_value(self, value: Optional[str] = None): + assert isinstance(value, str) or value is None, f"Expected str, found {type(value)} ({value})" + + if value: + pattern = re.compile(r'^.+\..+$') + matches = pattern.match(value) + + if not matches: + raise ValueError(f"Invalid CommonsMedia {value}") + + self.mainsnak.datavalue = { + 'value': value, + 'type': 'string' + } def parse_sparql_value(self, value, type='literal', unit='1') -> bool: pattern = re.compile(r'^?$') diff --git a/wikibaseintegrator/datatypes/externalid.py b/wikibaseintegrator/datatypes/externalid.py index c4838138..6b88ca7e 100644 --- a/wikibaseintegrator/datatypes/externalid.py +++ b/wikibaseintegrator/datatypes/externalid.py @@ -6,3 +6,4 @@ class ExternalID(String): Implements the Wikibase data type 'external-id' """ DTYPE = 'external-id' + PTYPE = 'http://wikiba.se/ontology#ExternalId' diff --git a/wikibaseintegrator/datatypes/form.py b/wikibaseintegrator/datatypes/form.py index f82edf7b..fe93896d 100644 --- a/wikibaseintegrator/datatypes/form.py +++ b/wikibaseintegrator/datatypes/form.py @@ -2,6 +2,8 @@ from typing import Any, Optional from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class Form(BaseDataType): @@ -9,6 +11,7 @@ class Form(BaseDataType): Implements the Wikibase data type 'wikibase-form' """ DTYPE = 'wikibase-form' + PTYPE = 
'http://wikiba.se/ontology#WikibaseForm' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . @@ -55,8 +58,14 @@ def set_value(self, value: Optional[str] = None): 'type': 'wikibase-entityid' } - def get_sparql_value(self) -> str: - return self.mainsnak.datavalue['value']['id'] + # TODO: add from_sparql_value() + + def get_sparql_value(self, **kwargs: Any) -> Optional[str]: + if self.mainsnak.snaktype == WikibaseSnakType.KNOWN_VALUE: + wikibase_url = str(kwargs['wikibase_url'] if 'wikibase_url' in kwargs else config['WIKIBASE_URL']) + return f'<{wikibase_url}/entity/' + self.mainsnak.datavalue['value']['id'] + '>' + + return None def get_lexeme_id(self) -> str: """ diff --git a/wikibaseintegrator/datatypes/geoshape.py b/wikibaseintegrator/datatypes/geoshape.py index 23d403ef..940f0923 100644 --- a/wikibaseintegrator/datatypes/geoshape.py +++ b/wikibaseintegrator/datatypes/geoshape.py @@ -9,6 +9,7 @@ class GeoShape(BaseDataType): Implements the Wikibase data type 'geo-shape' """ DTYPE = 'geo-shape' + PTYPE = 'http://wikiba.se/ontology#GeoShape' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . @@ -53,3 +54,7 @@ def set_value(self, value: Optional[str] = None): 'value': value, 'type': 'string' } + + # TODO: Does GeoShape need a full URL to wikimedia commons? 
+ def get_sparql_value(self, **kwargs: Any) -> str: + return '<' + self.mainsnak.datavalue['value'] + '>' diff --git a/wikibaseintegrator/datatypes/globecoordinate.py b/wikibaseintegrator/datatypes/globecoordinate.py index 8607bcc4..ff462beb 100644 --- a/wikibaseintegrator/datatypes/globecoordinate.py +++ b/wikibaseintegrator/datatypes/globecoordinate.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import re -from typing import Any, Optional +from typing import Any from wikibaseintegrator.datatypes.basedatatype import BaseDataType from wikibaseintegrator.models import Claim from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class GlobeCoordinate(BaseDataType): @@ -11,6 +14,7 @@ class GlobeCoordinate(BaseDataType): Implements the Wikibase data type for globe coordinates """ DTYPE = 'globe-coordinate' + PTYPE = 'http://wikiba.se/ontology#GlobeCoordinate' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -18,7 +22,7 @@ class GlobeCoordinate(BaseDataType): }} ''' - def __init__(self, latitude: Optional[float] = None, longitude: Optional[float] = None, altitude: Optional[float] = None, precision: Optional[float] = None, globe: Optional[str] = None, wikibase_url: Optional[str] = None, + def __init__(self, latitude: float | None = None, longitude: float | None = None, altitude: float | None = None, precision: float | None = None, globe: str | None = None, wikibase_url: str | None = None, **kwargs: Any): """ Constructor, calls the superclass BaseDataType @@ -34,7 +38,7 @@ def __init__(self, latitude: Optional[float] = None, longitude: Optional[float] super().__init__(**kwargs) self.set_value(latitude=latitude, longitude=longitude, altitude=altitude, precision=precision, globe=globe, wikibase_url=wikibase_url) - def set_value(self, latitude: Optional[float] = None, longitude: Optional[float] = None, altitude: Optional[float] = None, precision: Optional[float] = None, globe: Optional[str] = None, wikibase_url: Optional[str] = None): + def set_value(self, latitude: float | None = None, longitude: float | None = None, altitude: float | None = None, precision: float | None = None, globe: str | None = None, wikibase_url: str | None = None): # https://github.com/wikimedia/Wikibase/blob/174450de8fdeabcf97287604dbbf04d07bb5000c/repo/includes/Rdf/Values/GlobeCoordinateRdfBuilder.php#L120 precision = precision or 1 / 3600 globe = globe or str(config['COORDINATE_GLOBE_QID']) @@ -77,8 +81,37 @@ def __eq__(self, other): return super().__eq__(other) - def get_sparql_value(self) -> str: - return '"Point(' + str(self.mainsnak.datavalue['value']['longitude']) + ' ' + str(self.mainsnak.datavalue['value']['latitude']) + ')"' + def from_sparql_value(self, sparql_value: dict) -> GlobeCoordinate: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of datatype, type and value + :return: True if the parsing is 
successful + """ + datatype = sparql_value['datatype'] + type = sparql_value['type'] + value = sparql_value['value'] + + if datatype != 'http://www.opengis.net/ont/geosparql#wktLiteral': + raise ValueError('Wrong SPARQL datatype') + + if type != 'literal': + raise ValueError('Wrong SPARQL type') + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + pattern = re.compile(r'^Point\((.*) (.*)\)$') + matches = pattern.match(value) + if not matches: + raise ValueError('Invalid SPARQL value') + + self.set_value(longitude=float(matches.group(1)), latitude=float(matches.group(2))) + + return self + + def get_sparql_value(self, **kwargs: Any) -> str: + return '"Point(' + str(self.mainsnak.datavalue['value']['longitude']) + ' ' + str(self.mainsnak.datavalue['value']['latitude']) + ')"^^geo:wktLiteral' def parse_sparql_value(self, value, type='literal', unit='1') -> bool: pattern = re.compile(r'^"?Point\((.*) (.*)\)"?(?:\^\^geo:wktLiteral)?$') diff --git a/wikibaseintegrator/datatypes/item.py b/wikibaseintegrator/datatypes/item.py index a35b57e6..3dbf911f 100644 --- a/wikibaseintegrator/datatypes/item.py +++ b/wikibaseintegrator/datatypes/item.py @@ -1,7 +1,11 @@ +from __future__ import annotations + import re -from typing import Any, Optional, Union +from typing import Any from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class Item(BaseDataType): @@ -9,6 +13,7 @@ class Item(BaseDataType): Implements the Wikibase data type 'wikibase-item' with a value being another item ID """ DTYPE = 'wikibase-item' + PTYPE = 'http://wikiba.se/ontology#WikibaseItem' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -16,7 +21,7 @@ class Item(BaseDataType): }} ''' - def __init__(self, value: Optional[Union[str, int]] = None, **kwargs: Any): + def __init__(self, value: str | int | None = None, **kwargs: Any): """ Constructor, calls the superclass BaseDataType @@ -26,7 +31,7 @@ def __init__(self, value: Optional[Union[str, int]] = None, **kwargs: Any): super().__init__(**kwargs) self.set_value(value=value) - def set_value(self, value: Optional[Union[str, int]] = None): + def set_value(self, value: str | int | None = None): assert isinstance(value, (str, int)) or value is None, f'Expected str or int, found {type(value)} ({value})' if value: @@ -48,5 +53,34 @@ def set_value(self, value: Optional[Union[str, int]] = None): 'type': 'wikibase-entityid' } - def get_sparql_value(self) -> str: - return '<{wb_url}/entity/' + self.mainsnak.datavalue['value']['id'] + '>' + def from_sparql_value(self, sparql_value: dict) -> Item: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of type and value + :return: The object itself (self), with the parsed value set + """ + type = sparql_value['type'] + value = sparql_value['value'] + + if type != 'uri': + raise ValueError('Wrong SPARQL type') + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + pattern = re.compile(r'^.+/([PQLM]\d+)$') + matches = pattern.match(value) + if not matches: + raise ValueError(f"Invalid SPARQL value {value}") + + self.set_value(value=str(matches.group(1))) + + return self + + def get_sparql_value(self, **kwargs: Any) -> str | None: + if self.mainsnak.snaktype == WikibaseSnakType.KNOWN_VALUE: + wikibase_url = str(kwargs['wikibase_url'] if 'wikibase_url' in kwargs else config['WIKIBASE_URL']) + return f'<{wikibase_url}/entity/' + self.mainsnak.datavalue['value']['id'] + '>' + + return None diff --git a/wikibaseintegrator/datatypes/lexeme.py
b/wikibaseintegrator/datatypes/lexeme.py index 2ad0c641..5cd26886 100644 --- a/wikibaseintegrator/datatypes/lexeme.py +++ b/wikibaseintegrator/datatypes/lexeme.py @@ -2,6 +2,8 @@ from typing import Any, Optional, Union from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class Lexeme(BaseDataType): @@ -9,6 +11,7 @@ class Lexeme(BaseDataType): Implements the Wikibase data type 'wikibase-lexeme' """ DTYPE = 'wikibase-lexeme' + PTYPE = 'http://wikiba.se/ontology#WikibaseLexeme' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . @@ -48,5 +51,9 @@ def set_value(self, value: Optional[Union[str, int]] = None): 'type': 'wikibase-entityid' } - def get_sparql_value(self) -> str: - return self.mainsnak.datavalue['value']['id'] + def get_sparql_value(self, **kwargs: Any) -> Optional[str]: + if self.mainsnak.snaktype == WikibaseSnakType.KNOWN_VALUE: + wikibase_url = str(kwargs['wikibase_url'] if 'wikibase_url' in kwargs else config['WIKIBASE_URL']) + return f'<{wikibase_url}/entity/' + self.mainsnak.datavalue['value']['id'] + '>' + + return None diff --git a/wikibaseintegrator/datatypes/math.py b/wikibaseintegrator/datatypes/math.py index 7ad3f3cc..29c48217 100644 --- a/wikibaseintegrator/datatypes/math.py +++ b/wikibaseintegrator/datatypes/math.py @@ -1,4 +1,9 @@ +from __future__ import annotations + +from typing import Any + from wikibaseintegrator.datatypes.string import String +from wikibaseintegrator.wbi_enums import WikibaseSnakType class Math(String): @@ -6,3 +11,31 @@ class Math(String): Implements the Wikibase data type 'math' for mathematical formula in TEX format """ DTYPE = 'math' + PTYPE = 'http://wikiba.se/ontology#Math' + + def from_sparql_value(self, sparql_value: dict) -> Math: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of type and 
value + :return: + """ + datatype = sparql_value['datatype'] + type = sparql_value['type'] + value = sparql_value['value'] + + if datatype != 'http://www.w3.org/2001/XMLSchema#dateTime': + raise ValueError('Wrong SPARQL datatype') + + if type != 'literal': + raise ValueError('Wrong SPARQL type') + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + self.set_value(value=value) + + return self + + def get_sparql_value(self, **kwargs: Any) -> str: + return '"' + self.mainsnak.datavalue['value'] + '"^^' diff --git a/wikibaseintegrator/datatypes/monolingualtext.py b/wikibaseintegrator/datatypes/monolingualtext.py index 7b46fb54..f3c41a51 100644 --- a/wikibaseintegrator/datatypes/monolingualtext.py +++ b/wikibaseintegrator/datatypes/monolingualtext.py @@ -1,8 +1,11 @@ +from __future__ import annotations + import re -from typing import Any, Optional +from typing import Any from wikibaseintegrator.datatypes.basedatatype import BaseDataType from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class MonolingualText(BaseDataType): @@ -10,6 +13,8 @@ class MonolingualText(BaseDataType): Implements the Wikibase data type for Monolingual Text strings """ DTYPE = 'monolingualtext' + PTYPE = 'http://wikiba.se/ontology#Monolingualtext' + sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -17,7 +22,7 @@ class MonolingualText(BaseDataType): }} ''' - def __init__(self, text: Optional[str] = None, language: Optional[str] = None, **kwargs: Any): + def __init__(self, text: str | None = None, language: str | None = None, **kwargs: Any): """ Constructor, calls the superclass BaseDataType @@ -28,7 +33,7 @@ def __init__(self, text: Optional[str] = None, language: Optional[str] = None, * super().__init__(**kwargs) self.set_value(text=text, language=language) - def set_value(self, text: Optional[str] = None, language: Optional[str] = None): + def set_value(self, text: str | None = None, language: str | None = None): language = language or str(config['DEFAULT_LANGUAGE']) assert isinstance(text, str) or text is None, f"Expected str, found {type(text)} ({text})" @@ -46,7 +51,28 @@ def set_value(self, text: Optional[str] = None, language: Optional[str] = None): 'type': 'monolingualtext' } - def get_sparql_value(self) -> str: + def from_sparql_value(self, sparql_value: dict) -> MonolingualText: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of datatype, type and value + :return: True if the parsing is successful + """ + xml_lang = sparql_value['xml:lang'] + type = sparql_value['type'] + value = sparql_value['value'] + + if type != 'literal': + raise ValueError(f"Wrong SPARQL type {type}") + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + self.set_value(text=value, language=xml_lang) + + return self + + def get_sparql_value(self, **kwargs: Any) -> str: return '"' + self.mainsnak.datavalue['value']['text'].replace('"', r'\"') + '"@' + self.mainsnak.datavalue['value']['language'] def parse_sparql_value(self, value, type='literal', unit='1') -> bool: diff --git a/wikibaseintegrator/datatypes/musicalnotation.py b/wikibaseintegrator/datatypes/musicalnotation.py index 8e5fcab6..0d9047c9 100644 --- 
a/wikibaseintegrator/datatypes/musicalnotation.py +++ b/wikibaseintegrator/datatypes/musicalnotation.py @@ -8,6 +8,7 @@ class MusicalNotation(String): Implements the Wikibase data type 'musical-notation' """ DTYPE = 'musical-notation' + PTYPE = 'http://wikiba.se/ontology#MusicalNotation' def set_value(self, value: Optional[str] = None): assert isinstance(value, str) or value is None, f"Expected str, found {type(value)} ({value})" diff --git a/wikibaseintegrator/datatypes/property.py b/wikibaseintegrator/datatypes/property.py index d68962fb..ad9f4bd4 100644 --- a/wikibaseintegrator/datatypes/property.py +++ b/wikibaseintegrator/datatypes/property.py @@ -2,6 +2,8 @@ from typing import Any, Optional, Union from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class Property(BaseDataType): @@ -9,6 +11,7 @@ class Property(BaseDataType): Implements the Wikibase data type 'property' """ DTYPE = 'wikibase-property' + PTYPE = 'http://wikiba.se/ontology#Property' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -49,5 +52,9 @@ def set_value(self, value: Optional[Union[str, int]] = None): 'type': 'wikibase-entityid' } - def get_sparql_value(self) -> str: - return self.mainsnak.datavalue['value']['id'] + def get_sparql_value(self, **kwargs: Any) -> Optional[str]: + if self.mainsnak.snaktype == WikibaseSnakType.KNOWN_VALUE: + wikibase_url = str(kwargs['wikibase_url'] if 'wikibase_url' in kwargs else config['WIKIBASE_URL']) + return f'<{wikibase_url}/entity/' + self.mainsnak.datavalue['value']['id'] + '>' + + return None diff --git a/wikibaseintegrator/datatypes/quantity.py b/wikibaseintegrator/datatypes/quantity.py index d1aeb250..558a1ef2 100644 --- a/wikibaseintegrator/datatypes/quantity.py +++ b/wikibaseintegrator/datatypes/quantity.py @@ -1,7 +1,10 @@ -from typing import Any, Optional, Union +from __future__ import annotations + +from typing import Any from wikibaseintegrator.datatypes.basedatatype import BaseDataType from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType from wikibaseintegrator.wbi_helpers import format_amount @@ -10,6 +13,7 @@ class Quantity(BaseDataType): Implements the Wikibase data type for quantities """ DTYPE = 'quantity' + PTYPE = 'http://wikiba.se/ontology#Quantity' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -17,8 +21,8 @@ class Quantity(BaseDataType): }} ''' - def __init__(self, amount: Optional[Union[str, int, float]] = None, upper_bound: Optional[Union[str, int, float]] = None, lower_bound: Optional[Union[str, int, float]] = None, unit: Union[str, int] = '1', - wikibase_url: Optional[str] = None, **kwargs: Any): + def __init__(self, amount: str | int | float | None = None, upper_bound: str | int | float | None = None, lower_bound: str | int | float | None = None, unit: str | int = '1', + wikibase_url: str | None = None, **kwargs: Any): """ Constructor, calls the superclass BaseDataType @@ -33,8 +37,8 @@ def __init__(self, amount: Optional[Union[str, int, float]] = None, upper_bound: super().__init__(**kwargs) self.set_value(amount=amount, upper_bound=upper_bound, lower_bound=lower_bound, unit=unit, wikibase_url=wikibase_url) - def set_value(self, amount: Optional[Union[str, int, float]] = None, upper_bound: Optional[Union[str, int, float]] = None, lower_bound: Optional[Union[str, int, float]] = None, unit: Union[str, int] = '1', - wikibase_url: Optional[str] = None): + def set_value(self, amount: str | int | float | None = None, upper_bound: str | int | float | None = None, lower_bound: str | int | float | None = None, unit: str | int = '1', + wikibase_url: str | None = None): wikibase_url = wikibase_url or str(config['WIKIBASE_URL']) unit = str(unit or '1') @@ -81,7 +85,31 @@ def set_value(self, amount: Optional[Union[str, int, float]] = None, upper_bound if not lower_bound: del self.mainsnak.datavalue['value']['lowerBound'] - def get_sparql_value(self) -> str: + def from_sparql_value(self, sparql_value: dict) -> Quantity: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of datatype, type and value + :return: True if the parsing is successful + """ + datatype = sparql_value['datatype'] + type = sparql_value['type'] + value = sparql_value['value'] + + if datatype != 
'http://www.w3.org/2001/XMLSchema#decimal': + raise ValueError(f"Wrong SPARQL datatype {datatype}") + + if type != 'literal': + raise ValueError(f"Wrong SPARQL type {type}") + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + self.set_value(amount=value) + + return self + + def get_sparql_value(self, **kwargs: Any) -> str: return '"' + format_amount(self.mainsnak.datavalue['value']['amount']) + '"^^xsd:decimal' def parse_sparql_value(self, value, type='literal', unit='1') -> bool: diff --git a/wikibaseintegrator/datatypes/sense.py b/wikibaseintegrator/datatypes/sense.py index 72342760..e468350a 100644 --- a/wikibaseintegrator/datatypes/sense.py +++ b/wikibaseintegrator/datatypes/sense.py @@ -2,6 +2,8 @@ from typing import Any, Optional from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_config import config +from wikibaseintegrator.wbi_enums import WikibaseSnakType class Sense(BaseDataType): @@ -9,6 +11,7 @@ class Sense(BaseDataType): Implements the Wikibase data type 'wikibase-sense' """ DTYPE = 'wikibase-sense' + PTYPE = 'http://wikiba.se/ontology#WikibaseSense' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -44,8 +47,14 @@ def set_value(self, value: Optional[str] = None): 'type': 'wikibase-entityid' } - def get_sparql_value(self) -> str: - return self.mainsnak.datavalue['value']['id'] + # TODO: add from_sparql_value() + + def get_sparql_value(self, **kwargs: Any) -> Optional[str]: + if self.mainsnak.snaktype == WikibaseSnakType.KNOWN_VALUE: + wikibase_url = str(kwargs['wikibase_url'] if 'wikibase_url' in kwargs else config['WIKIBASE_URL']) + return f'<{wikibase_url}/entity/' + self.mainsnak.datavalue['value']['id'] + '>' + + return None def get_lexeme_id(self) -> str: """ diff --git a/wikibaseintegrator/datatypes/string.py b/wikibaseintegrator/datatypes/string.py index 527c36d2..e8971911 100644 --- a/wikibaseintegrator/datatypes/string.py +++ b/wikibaseintegrator/datatypes/string.py @@ -1,16 +1,19 @@ -from typing import Any, Optional +from __future__ import annotations + +from typing import Any from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_enums import WikibaseSnakType class String(BaseDataType): """ Implements the Wikibase data type 'string' """ - DTYPE = 'string' + PTYPE = 'http://wikiba.se/ontology#String' - def __init__(self, value: Optional[str] = None, **kwargs: Any): + def __init__(self, value: str | None = None, **kwargs: Any): """ Constructor, calls the superclass BaseDataType @@ -20,7 +23,7 @@ def __init__(self, value: Optional[str] = None, **kwargs: Any): super().__init__(**kwargs) self.set_value(value=value) - def set_value(self, value: Optional[str] = None): + def set_value(self, value: str | None = None): assert isinstance(value, str) or value is None, f"Expected str, found {type(value)} ({value})" if value and ('\n' in value or '\r' in value): @@ -31,3 +34,23 @@ def set_value(self, value: Optional[str] = None): 'value': value, 'type': 'string' } + + def from_sparql_value(self, sparql_value: dict) -> String: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param 
sparql_value: A SPARQL value composed of type and value + :return: + """ + type = sparql_value['type'] + value = sparql_value['value'] + + if type != 'literal': + raise ValueError(f"Wrong SPARQL type {type}") + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + self.set_value(value=value) + + return self diff --git a/wikibaseintegrator/datatypes/tabulardata.py b/wikibaseintegrator/datatypes/tabulardata.py index 9ce94763..654e57d6 100644 --- a/wikibaseintegrator/datatypes/tabulardata.py +++ b/wikibaseintegrator/datatypes/tabulardata.py @@ -9,6 +9,7 @@ class TabularData(BaseDataType): Implements the Wikibase data type 'tabular-data' """ DTYPE = 'tabular-data' + PTYPE = 'http://wikiba.se/ontology#TabularData' def __init__(self, value: Optional[str] = None, **kwargs: Any): """ @@ -34,3 +35,7 @@ def set_value(self, value: Optional[str] = None): 'value': value, 'type': 'string' } + + # TODO: Does TabularData need a full URL to wikimedia commons? 
+ def get_sparql_value(self, **kwargs: Any) -> str: + return '<' + self.mainsnak.datavalue['value'] + '>' diff --git a/wikibaseintegrator/datatypes/time.py b/wikibaseintegrator/datatypes/time.py index 0fc41e68..ead40d15 100644 --- a/wikibaseintegrator/datatypes/time.py +++ b/wikibaseintegrator/datatypes/time.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datetime import re from functools import total_ordering @@ -5,7 +7,7 @@ from wikibaseintegrator.datatypes.basedatatype import BaseDataType from wikibaseintegrator.wbi_config import config -from wikibaseintegrator.wbi_enums import WikibaseTimePrecision +from wikibaseintegrator.wbi_enums import WikibaseSnakType, WikibaseTimePrecision @total_ordering @@ -14,6 +16,7 @@ class Time(BaseDataType): Implements the Wikibase data type with date and time values """ DTYPE = 'time' + PTYPE = 'http://wikiba.se/ontology#Time' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . @@ -102,8 +105,8 @@ def set_value(self, time: Optional[str] = None, before: int = 0, after: int = 0, 'type': 'time' } - def get_sparql_value(self) -> str: - return self.mainsnak.datavalue['value']['time'] + def get_sparql_value(self, **kwargs: Any) -> str: + return '"' + self.mainsnak.datavalue['value']['time'] + '"^^xsd:dateTime' def get_year(self) -> int: return int(self.mainsnak.datavalue['value']['time'][0:5]) @@ -118,3 +121,27 @@ def __lt__(self, other): return (self.get_year() < other.get_year()) or \ (self.get_year() == other.get_year() and self.get_month() < other.get_month()) or \ (self.get_year() == other.get_year() and self.get_month() == other.get_month() and self.get_day() < other.get_day()) + + def from_sparql_value(self, sparql_value: dict) -> Time: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of type and value + :return: + """ + datatype = sparql_value['datatype'] + type = sparql_value['type'] + value = 
sparql_value['value'] + + if datatype != 'http://www.w3.org/2001/XMLSchema#dateTime': + raise ValueError('Wrong SPARQL datatype') + + if type != 'literal': + raise ValueError('Wrong SPARQL type') + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + self.set_value(time=value) + + return self diff --git a/wikibaseintegrator/datatypes/url.py b/wikibaseintegrator/datatypes/url.py index 88059870..8b49d258 100644 --- a/wikibaseintegrator/datatypes/url.py +++ b/wikibaseintegrator/datatypes/url.py @@ -1,7 +1,10 @@ +from __future__ import annotations + import re -from typing import Any, Optional +from typing import Any from wikibaseintegrator.datatypes.basedatatype import BaseDataType +from wikibaseintegrator.wbi_enums import WikibaseSnakType class URL(BaseDataType): @@ -9,6 +12,7 @@ class URL(BaseDataType): Implements the Wikibase data type for URL strings """ DTYPE = 'url' + PTYPE = 'http://wikiba.se/ontology#Url' sparql_query = ''' SELECT * WHERE {{ ?item_id <{wb_url}/prop/{pid}> ?s . 
@@ -16,7 +20,7 @@ class URL(BaseDataType): }} ''' - def __init__(self, value: Optional[str] = None, **kwargs: Any): + def __init__(self, value: str | None = None, **kwargs: Any): """ Constructor, calls the superclass BaseDataType @@ -26,7 +30,7 @@ def __init__(self, value: Optional[str] = None, **kwargs: Any): super().__init__(**kwargs) self.set_value(value=value) - def set_value(self, value: Optional[str] = None): + def set_value(self, value: str | None = None): assert isinstance(value, str) or value is None, f"Expected str, found {type(value)} ({value})" if value: @@ -41,7 +45,26 @@ def set_value(self, value: Optional[str] = None): 'type': 'string' } - def get_sparql_value(self) -> str: + def from_sparql_value(self, sparql_value: dict) -> URL: + """ + Parse data returned by a SPARQL endpoint and set the value to the object + + :param sparql_value: A SPARQL value composed of type and value + :return: + """ + type = sparql_value['type'] + value = sparql_value['value'] + + if type != 'uri': + raise ValueError(f"Wrong SPARQL type {type}") + + if value.startswith('http://www.wikidata.org/.well-known/genid/'): + self.mainsnak.snaktype = WikibaseSnakType.UNKNOWN_VALUE + else: + self.set_value(value=value) + return self + + def get_sparql_value(self, **kwargs: Any) -> str: return '<' + self.mainsnak.datavalue['value'] + '>' def parse_sparql_value(self, value, type='literal', unit='1') -> bool: diff --git a/wikibaseintegrator/entities/baseentity.py b/wikibaseintegrator/entities/baseentity.py index 95f7027a..4b69e541 100644 --- a/wikibaseintegrator/entities/baseentity.py +++ b/wikibaseintegrator/entities/baseentity.py @@ -104,9 +104,13 @@ def claims(self) -> Claims: return self.__claims @claims.setter - def claims(self, value: Claims): - if not isinstance(value, Claims): + def claims(self, value: Claim | Claims): + if not isinstance(value, Claims) and not isinstance(value, Claim): raise TypeError + + if isinstance(value, Claim): + value = Claims().add(claims=value) + 
self.__claims = value def add_claims(self, claims: Claim | list[Claim] | Claims, action_if_exists: ActionIfExists = ActionIfExists.APPEND_OR_REPLACE) -> BaseEntity: @@ -204,6 +208,21 @@ def clear(self, **kwargs: Any) -> dict[str, Any]: """ return self._write(data={}, clear=True, **kwargs) + def get_claims(self, property: str, login: _Login | None = None, allow_anonymous: bool = True, is_bot: bool | None = None, **kwargs: Any): + params = { + 'action': 'wbgetclaims', + 'entity': self.id, + 'property': property, + 'format': 'json' + } + + login = login or self.api.login + is_bot = is_bot if is_bot is not None else self.api.is_bot + + json_data = mediawiki_api_call_helper(data=params, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs) + self.claims.from_json(json_data['claims']) + return self + def _write(self, data: dict | None = None, summary: str | None = None, login: _Login | None = None, allow_anonymous: bool = False, limit_claims: list[str | int] | None = None, clear: bool = False, as_new: bool = False, is_bot: bool | None = None, fields_to_update: list | None | EntityField = None, **kwargs: Any) -> dict[str, Any]: """ @@ -319,21 +338,19 @@ def delete(self, login: _Login | None = None, allow_anonymous: bool = False, is_ return delete_page(title=None, pageid=self.pageid, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs) - def write_required(self, base_filter: list[BaseDataType | list[BaseDataType]] | None = None, action_if_exists: ActionIfExists = ActionIfExists.REPLACE_ALL, - **kwargs: Any) -> bool: + def write_required(self, base_filter: list[BaseDataType | list[BaseDataType]], action_if_exists: ActionIfExists = ActionIfExists.REPLACE_ALL, **kwargs: Any) -> bool: fastrun_container = wbi_fastrun.get_fastrun_container(base_filter=base_filter, **kwargs) - if base_filter is None: - base_filter = [] - - claims_to_check = [] + pfilter: set = set() for claim in self.claims: if claim.mainsnak.property_number in base_filter: - 
claims_to_check.append(claim) + pfilter.add(claim.mainsnak.property_number) + + property_filter: list[str] = list(pfilter) # TODO: Add check_language_data - return fastrun_container.write_required(data=claims_to_check, cqid=self.id, action_if_exists=action_if_exists) + return fastrun_container.write_required(claims=self.claims, property_filter=property_filter) def get_entity_url(self, wikibase_url: str | None = None) -> str: from wikibaseintegrator.wbi_config import config diff --git a/wikibaseintegrator/entities/item.py b/wikibaseintegrator/entities/item.py index dcfe8d1d..92ec7f06 100644 --- a/wikibaseintegrator/entities/item.py +++ b/wikibaseintegrator/entities/item.py @@ -157,7 +157,7 @@ def from_json(self, json_data: dict[str, Any]) -> ItemEntity: def write(self, **kwargs: Any) -> ItemEntity: """ Write the ItemEntity data to the Wikibase instance and return the ItemEntity object returned by the instance. - extend :func:`~wikibaseintegrator.entities.BaseEntity._write` + This function extend :func:`~wikibaseintegrator.entities.baseentity.BaseEntity._write` :param data: The serialized object that is used as the data source. A newly created entity will be assigned an 'id'. 
:param summary: A summary of the edit diff --git a/wikibaseintegrator/models/claims.py b/wikibaseintegrator/models/claims.py index 95a6b374..1db2bf2c 100644 --- a/wikibaseintegrator/models/claims.py +++ b/wikibaseintegrator/models/claims.py @@ -155,9 +155,17 @@ def __len__(self): class Claim(BaseModel): + """ + extend :func:`wikibaseintegrator.models.basemodel.BaseModel` + + :param qualifiers: + :param id: + :param rank: + :param references: A References object, a list of Claim object or a list of list of Claim object + """ DTYPE = 'claim' - def __init__(self, qualifiers: Qualifiers | None = None, rank: WikibaseRank | None = None, references: References | list[Claim | list[Claim]] | None = None, + def __init__(self, qualifiers: Qualifiers | None = None, id: str | None = None, rank: WikibaseRank | None = None, references: References | list[Claim | list[Claim]] | None = None, snaktype: WikibaseSnakType = WikibaseSnakType.KNOWN_VALUE) -> None: """ @@ -170,7 +178,7 @@ def __init__(self, qualifiers: Qualifiers | None = None, rank: WikibaseRank | No self.type = 'statement' self.qualifiers = qualifiers or Qualifiers() self.qualifiers_order = [] - self.id = None + self.id = id self.rank = rank or WikibaseRank.NORMAL self.removed = False @@ -428,5 +436,5 @@ def ref_equal(oldref: References, newref: References) -> bool: return any(any(ref_equal(oldref, newref) for oldref in oldrefs) for newref in newrefs) @abstractmethod - def get_sparql_value(self) -> str: + def get_sparql_value(self, **kwargs: Any) -> str | None: pass diff --git a/wikibaseintegrator/models/qualifiers.py b/wikibaseintegrator/models/qualifiers.py index aa4fb1f8..f39178de 100644 --- a/wikibaseintegrator/models/qualifiers.py +++ b/wikibaseintegrator/models/qualifiers.py @@ -12,7 +12,7 @@ class Qualifiers(BaseModel): def __init__(self) -> None: - self.qualifiers: dict[str, list[Snak]] = {} + self.qualifiers: dict[str, list[Snak | Claim]] = {} @property def qualifiers(self): diff --git 
a/wikibaseintegrator/models/references.py b/wikibaseintegrator/models/references.py index a566fb84..8e54d466 100644 --- a/wikibaseintegrator/models/references.py +++ b/wikibaseintegrator/models/references.py @@ -146,6 +146,9 @@ def get_json(self) -> dict[str, dict | list]: } return json_data + def __eq__(self, other): + return self.snaks == other.snaks + def __iter__(self): return iter(self.snaks) diff --git a/wikibaseintegrator/models/snaks.py b/wikibaseintegrator/models/snaks.py index 8add41d5..13ba63f1 100644 --- a/wikibaseintegrator/models/snaks.py +++ b/wikibaseintegrator/models/snaks.py @@ -46,6 +46,9 @@ def get_json(self) -> dict[str, list]: json_data[property].append(snak.get_json()) return json_data + def __eq__(self, other): + return self.snaks == other.snaks + def __iter__(self): iterate = [] for snak in self.snaks.values(): diff --git a/wikibaseintegrator/wbi_config.py b/wikibaseintegrator/wbi_config.py index 8b80a18f..4f4f61ce 100644 --- a/wikibaseintegrator/wbi_config.py +++ b/wikibaseintegrator/wbi_config.py @@ -28,5 +28,6 @@ 'SPARQL_ENDPOINT_URL': 'https://query.wikidata.org/sparql', 'WIKIBASE_URL': 'http://www.wikidata.org', 'DEFAULT_LANGUAGE': 'en', - 'DEFAULT_LEXEME_LANGUAGE': 'Q1860' + 'DEFAULT_LEXEME_LANGUAGE': 'Q1860', + 'SPARQL_QUERY_LIMIT': 10000 } diff --git a/wikibaseintegrator/wbi_fastrun.py b/wikibaseintegrator/wbi_fastrun.py index a5221a1b..45d8a6f4 100644 --- a/wikibaseintegrator/wbi_fastrun.py +++ b/wikibaseintegrator/wbi_fastrun.py @@ -1,21 +1,13 @@ from __future__ import annotations -import collections -import copy import logging -from collections import defaultdict -from functools import lru_cache -from itertools import chain -from typing import TYPE_CHECKING +import re from wikibaseintegrator.datatypes import BaseDataType -from wikibaseintegrator.models import Claim +from wikibaseintegrator.models import Claim, Claims, Qualifiers, Reference, References from wikibaseintegrator.wbi_config import config -from 
wikibaseintegrator.wbi_enums import ActionIfExists, WikibaseDatatype -from wikibaseintegrator.wbi_helpers import execute_sparql_query, format_amount - -if TYPE_CHECKING: - from wikibaseintegrator.models import Claims +from wikibaseintegrator.wbi_enums import WikibaseRank +from wikibaseintegrator.wbi_helpers import execute_sparql_query log = logging.getLogger(__name__) @@ -23,641 +15,609 @@ class FastRunContainer: - def __init__(self, base_data_type: type[BaseDataType], mediawiki_api_url: str | None = None, sparql_endpoint_url: str | None = None, wikibase_url: str | None = None, - base_filter: list[BaseDataType | list[BaseDataType]] | None = None, use_refs: bool = False, case_insensitive: bool = False): - self.reconstructed_statements: list[BaseDataType] = [] - self.rev_lookup: defaultdict[str, set[str]] = defaultdict(set) - self.rev_lookup_ci: defaultdict[str, set[str]] = defaultdict(set) - self.prop_data: dict[str, dict] = {} - self.loaded_langs: dict[str, dict] = {} - self.base_filter: list[BaseDataType | list[BaseDataType]] = [] - self.base_filter_string = '' - self.prop_dt_map: dict[str, str] = {} - - self.base_data_type: type[BaseDataType] = base_data_type - self.mediawiki_api_url: str = str(mediawiki_api_url or config['MEDIAWIKI_API_URL']) - self.sparql_endpoint_url: str = str(sparql_endpoint_url or config['SPARQL_ENDPOINT_URL']) - self.wikibase_url: str = str(wikibase_url or config['WIKIBASE_URL']) - self.use_refs: bool = use_refs - self.case_insensitive: bool = case_insensitive - - if base_filter and any(base_filter): - self.base_filter = base_filter + """ + + :param base_filter: The default filter to initialize the dataset. A list made of BaseDataType or list of BaseDataType. + :param base_data_type: The default data type to create objects. + :param use_qualifiers: Use qualifiers during fastrun. Enabled by default. + :param use_references: Use references during fastrun. Disabled by default. + :param use_rank: Use rank during fastrun. Disabled by default. 
+ :param cache: Put data returned by WDQS in cache. Enabled by default. + :param case_insensitive: + :param sparql_endpoint_url: SPARLQ endpoint URL. + :param wikibase_url: Wikibase URL used for the concept URI. + """ + + # TODO: Add support for case_insensitive + + data: dict[str, dict[str, list[dict[str, str]]]] + + def __init__(self, base_filter: list[BaseDataType | list[BaseDataType]], base_data_type: type[BaseDataType] | None = None, use_qualifiers: bool = True, + use_references: bool = False, use_rank: bool = False, cache: bool = True, case_insensitive: bool = False, sparql_endpoint_url: str | None = None, + wikibase_url: str | None = None): + + for k in base_filter: + if not isinstance(k, BaseDataType) and not (isinstance(k, list) and len(k) == 2 and isinstance(k[0], BaseDataType) and isinstance(k[1], BaseDataType)): + raise ValueError("base_filter must be an instance of BaseDataType or a list of instances of BaseDataType") + + self.data: dict[str, dict[str, list[dict[str, str]]]] = {} + + self.base_filter = base_filter + self.base_data_type = base_data_type or BaseDataType + self.sparql_endpoint_url = str(sparql_endpoint_url or config['SPARQL_ENDPOINT_URL']) + self.wikibase_url = str(wikibase_url or config['WIKIBASE_URL']) + self.use_qualifiers = use_qualifiers + self.use_references = use_references + self.use_rank = use_rank + self.cache = cache + self.case_insensitive = case_insensitive + self.properties_type: dict[str, str] = {} + + if self.case_insensitive: + raise ValueError("Case insensitive does not work for the moment.") + + def load_statements(self, claims: list[Claim] | Claims | Claim, cache: bool | None = None, wb_url: str | None = None, limit: int | None = None) -> None: + """ + Load the statements related to the given claims into the internal cache of the current object. + + :param claims: A Claim, Claims or list of Claim + :param wb_url: The first part of the concept URI of entities. + :param limit: The limit to request at one time. 
+ :param cache: Put data returned by WDQS in cache. Enabled by default. + :return: + """ + if isinstance(claims, Claim): + claims = [claims] + elif (not isinstance(claims, list) or not all(isinstance(n, Claim) for n in claims)) and not isinstance(claims, Claims): + raise ValueError("claims must be an instance of Claim or Claims or a list of Claim") + + if cache is None: + cache = self.cache + + wb_url = wb_url or self.wikibase_url + + limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore + + for claim in claims: + prop_nr = claim.mainsnak.property_number + + # Load each property from the Wikibase instance or the cache + if cache and prop_nr in self.data: + logging.debug("Property '%s' found in cache, %s elements", prop_nr, len(self.data[prop_nr])) + continue + + offset = 0 + + # Generate base filter + base_filter_string = '' for k in self.base_filter: if isinstance(k, BaseDataType): + # TODO: Add multiple values for a property (OR-operation) (with the VALUES tag?) if k.mainsnak.datavalue: - self.base_filter_string += '?item <{wb_url}/prop/direct/{prop_nr}> {entity} .\n'.format( - wb_url=self.wikibase_url, prop_nr=k.mainsnak.property_number, entity=k.get_sparql_value().format(wb_url=self.wikibase_url)) - else: - self.base_filter_string += '?item <{wb_url}/prop/direct/{prop_nr}> ?zz{prop_nr} .\n'.format( - wb_url=self.wikibase_url, prop_nr=k.mainsnak.property_number) + base_filter_string += '?entity <{wb_url}/prop/direct/{prop_nr}> {entity} .\n'.format( + wb_url=wb_url, prop_nr=k.mainsnak.property_number, entity=k.get_sparql_value(wikibase_url=wb_url)) + elif sum(map(lambda x, other=k: x.mainsnak.property_number == other.mainsnak.property_number, self.base_filter)) == 1: # type: ignore + base_filter_string += '?entity <{wb_url}/prop/direct/{prop_nr}> ?zz{prop_nr} .\n'.format( + wb_url=wb_url, prop_nr=k.mainsnak.property_number) elif isinstance(k, list) and len(k) == 2 and isinstance(k[0], BaseDataType) and isinstance(k[1], BaseDataType): if 
k[0].mainsnak.datavalue: - self.base_filter_string += '?item <{wb_url}/prop/direct/{prop_nr}>/<{wb_url}/prop/direct/{prop_nr2}>* {entity} .\n'.format( - wb_url=self.wikibase_url, prop_nr=k[0].mainsnak.property_number, prop_nr2=k[1].mainsnak.property_number, - entity=k[0].get_sparql_value().format(wb_url=self.wikibase_url)) + base_filter_string += '?entity <{wb_url}/prop/direct/{prop_nr}>/<{wb_url}/prop/direct/{prop_nr2}>* {entity} .\n'.format( + wb_url=wb_url, prop_nr=k[0].mainsnak.property_number, prop_nr2=k[1].mainsnak.property_number, + entity=k[0].get_sparql_value(wikibase_url=wb_url)) + # TODO: Remove ?zzPYY if another filter have the same property number, the same as above else: - self.base_filter_string += '?item <{wb_url}/prop/direct/{prop_nr1}>/<{wb_url}/prop/direct/{prop_nr2}>* ?zz{prop_nr1}{prop_nr2} .\n'.format( - wb_url=self.wikibase_url, prop_nr1=k[0].mainsnak.property_number, prop_nr2=k[1].mainsnak.property_number) + base_filter_string += '?entity <{wb_url}/prop/direct/{prop_nr1}>/<{wb_url}/prop/direct/{prop_nr2}>* ?zz{prop_nr1}{prop_nr2} .\n'.format( + wb_url=wb_url, prop_nr1=k[0].mainsnak.property_number, prop_nr2=k[1].mainsnak.property_number) else: raise ValueError("base_filter must be an instance of BaseDataType or a list of instances of BaseDataType") - def reconstruct_statements(self, qid: str) -> list[BaseDataType]: - reconstructed_statements: list[BaseDataType] = [] - - if qid not in self.prop_data: - self.reconstructed_statements = reconstructed_statements - return reconstructed_statements - - for prop_nr, dt in self.prop_data[qid].items(): - # get datatypes for qualifier props - q_props = set(chain(*([x[0] for x in d['qual']] for d in dt.values()))) - r_props = set(chain(*(set(chain(*([y[0] for y in x] for x in d['ref'].values()))) for d in dt.values()))) - props = q_props | r_props - for prop in props: - if prop not in self.prop_dt_map: - self.prop_dt_map.update({prop: self.get_prop_datatype(prop)}) - # reconstruct statements from frc 
(including unit, qualifiers, and refs) - for _, d in dt.items(): - qualifiers = [] - for q in d['qual']: - f = [x for x in self.base_data_type.subclasses if x.DTYPE == self.prop_dt_map[q[0]]][0] - # TODO: Add support for more data type (Time, MonolingualText, GlobeCoordinate) - if self.prop_dt_map[q[0]] == 'quantity': - qualifiers.append(f(value=q[1], prop_nr=q[0], unit=q[2])) - else: - qualifiers.append(f(value=q[1], prop_nr=q[0])) - - references = [] - for _, refs in d['ref'].items(): - this_ref = [] - for ref in refs: - f = [x for x in self.base_data_type.subclasses if x.DTYPE == self.prop_dt_map[ref[0]]][0] - this_ref.append(f(value=ref[1], prop_nr=ref[0])) - references.append(this_ref) - - f = [x for x in self.base_data_type.subclasses if x.DTYPE == self.prop_dt_map[prop_nr]][0] - # TODO: Add support for more data type - if self.prop_dt_map[prop_nr] == 'quantity': - datatype = f(prop_nr=prop_nr, qualifiers=qualifiers, references=references, unit=d['unit']) - datatype.parse_sparql_value(value=d['v'], unit=d['unit']) + qualifiers_filter_string = '' + if self.use_qualifiers: + for qualifier in claim.qualifiers: + fake_json = { + 'mainsnak': qualifier.get_json(), + 'type': qualifier.datatype, + 'id': 'Q0', + 'rank': 'normal' + } + f = [x for x in self.base_data_type.subclasses if x.DTYPE == qualifier.datatype][0]().from_json(json_data=fake_json) + qualifiers_filter_string += f'?sid pq:{qualifier.property_number} {f.get_sparql_value()}.\n' + + # We force a refresh of the data, remove the previous results + self.data[prop_nr] = {} + + while True: + if claim.mainsnak.datavalue and not cache: + query = ''' + #Tool: WikibaseIntegrator wbi_fastrun.load_statements + SELECT ?entity ?sid ?value ?property_type WHERE {{ + # Base filter string + {base_filter_string} + ?entity <{wb_url}/prop/{prop_nr}> ?sid. + <{wb_url}/entity/{prop_nr}> wikibase:propertyType ?property_type. + ?sid <{wb_url}/prop/statement/{prop_nr}> ?value. + ?sid <{wb_url}/prop/statement/{prop_nr}> {value}. 
+ {qualifiers_filter_string} + }} + ORDER BY ?sid + OFFSET {offset} + LIMIT {limit} + ''' + + # Format the query + query = query.format(base_filter_string=base_filter_string, wb_url=wb_url, prop_nr=prop_nr, offset=str(offset), limit=str(limit), + value=claim.get_sparql_value(wikibase_url=wb_url), qualifiers_filter_string=qualifiers_filter_string) else: - datatype = f(prop_nr=prop_nr, qualifiers=qualifiers, references=references) - datatype.parse_sparql_value(value=d['v']) - reconstructed_statements.append(datatype) + query = ''' + #Tool: WikibaseIntegrator wbi_fastrun.load_statements + SELECT ?entity ?sid ?value ?property_type WHERE {{ + # Base filter string + {base_filter_string} + ?entity <{wb_url}/prop/{prop_nr}> ?sid. + <{wb_url}/entity/{prop_nr}> wikibase:propertyType ?property_type. + ?sid <{wb_url}/prop/statement/{prop_nr}> ?value. + {qualifiers_filter_string} + }} + ORDER BY ?sid + OFFSET {offset} + LIMIT {limit} + ''' + + # Format the query + # TODO: Add custom query support + query = query.format(base_filter_string=base_filter_string, wb_url=wb_url, prop_nr=prop_nr, offset=str(offset), limit=str(limit), + qualifiers_filter_string=qualifiers_filter_string) + + offset += limit # We increase the offset for the next iteration + results = execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] + + for result in results: + entity = result['entity']['value'] + sid = result['sid']['value'] + # value = result['value']['value'] + property_type = result['property_type']['value'] + + # Use casefold for lower case + if self.case_insensitive: + result['value']['value'] = result['value']['value'].casefold() + + f = [x for x in self.base_data_type.subclasses if x.PTYPE == property_type][0]().from_sparql_value(sparql_value=result['value']) + + sparql_value = f.get_sparql_value() + if sparql_value is not None: + if sparql_value not in self.data[prop_nr]: + self.data[prop_nr][sparql_value] = [] - # this isn't used. 
done for debugging purposes - self.reconstructed_statements = reconstructed_statements - return reconstructed_statements + if prop_nr not in self.properties_type: + self.properties_type[prop_nr] = property_type - def get_items(self, claims: list[Claim] | Claims | Claim, cqid: str | None = None) -> set[str] | None: + self.data[prop_nr][sparql_value].append({'entity': entity, 'sid': sid}) + + if len(results) == 0 or len(results) < limit: + break + + def _load_qualifiers(self, sid: str, limit: int | None = None) -> Qualifiers: """ - Get items ID from a SPARQL endpoint + Load the qualifiers of a statement. - :param claims: A list of claims the entities should have - :param cqid: - :return: a list of entity ID or None - :exception: if there is more than one claim + :param sid: A statement ID. + :param limit: The limit to request at one time. + :return: A Qualifiers object. """ - match_sets = [] + offset = 0 - if isinstance(claims, Claim): - claims = [claims] - elif (not isinstance(claims, list) or not all(isinstance(n, Claim) for n in claims)) and not isinstance(claims, Claims): - raise ValueError("claims must be an instance of Claim or Claims or a list of Claim") + if not isinstance(sid, str): + raise ValueError('sid must be a string') - for claim in claims: - # skip to next if statement has no value or no data type defined, e.g. for deletion objects - if not claim.mainsnak.datavalue and not claim.mainsnak.datatype: - continue + limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore - prop_nr = claim.mainsnak.property_number + # TODO: Add cache - if prop_nr not in self.prop_dt_map: - log.debug("%s not found in fastrun", prop_nr) + # We force a refresh of the data, remove the previous results + qualifiers: Qualifiers = Qualifiers() + while True: + query = f''' + #Tool: WikibaseIntegrator wbi_fastrun._load_qualifiers + SELECT ?property ?value ?property_type WHERE {{ + VALUES ?sid {{ <{sid}> }} + ?sid ?predicate ?value. 
+ ?property wikibase:qualifier ?predicate. + ?property wikibase:propertyType ?property_type. + }} + ORDER BY ?sid + OFFSET {offset} + LIMIT {limit} + ''' - if isinstance(claim, BaseDataType) and type(claim) != BaseDataType: # pylint: disable=unidiomatic-typecheck - self.prop_dt_map.update({prop_nr: claim.DTYPE}) - else: - self.prop_dt_map.update({prop_nr: self.get_prop_datatype(prop_nr)}) - self._query_data(prop_nr=prop_nr, use_units=self.prop_dt_map[prop_nr] == 'quantity') - - # noinspection PyProtectedMember - current_value = claim.get_sparql_value() - - if self.prop_dt_map[prop_nr] == 'wikibase-item': - current_value = claim.mainsnak.datavalue['value']['id'] - - log.debug(current_value) - # if self.case_insensitive: - # log.debug("case insensitive enabled") - # log.debug(self.rev_lookup_ci) - # else: - # log.debug(self.rev_lookup) - - if current_value in self.rev_lookup: - # quick check for if the value has ever been seen before, if not, write required - match_sets.append(set(self.rev_lookup[current_value])) - elif self.case_insensitive and current_value.casefold() in self.rev_lookup_ci: - match_sets.append(set(self.rev_lookup_ci[current_value.casefold()])) - else: - log.debug("no matches for rev lookup for %s", current_value) - - if not match_sets: - return None - - if cqid: - matching_qids = {cqid} - else: - matching_qids = match_sets[0].intersection(*match_sets[1:]) + # Format the query + # query = query.format(wb_url=wb_url, sid=sid, offset=str(offset), limit=str(limit)) + offset += limit # We increase the offset for the next iteration + results = execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] + + for result in results: + property = result['property']['value'] + property_type = result['property_type']['value'] + + if property not in self.properties_type: + self.properties_type[property] = property_type + + # Use casefold for lower case + if self.case_insensitive: + result['value']['value'] = 
result['value']['value'].casefold() + + f = [x for x in self.base_data_type.subclasses if x.PTYPE == property_type][0](prop_nr=property).from_sparql_value(sparql_value=result['value']) + qualifiers.add(f) + + if len(results) == 0 or len(results) < limit: + break - return matching_qids + return qualifiers - def get_item(self, claims: list[Claim] | Claims | Claim, cqid: str | None = None) -> str | None: + def _load_references(self, sid: str, limit: int = 10000) -> References: """ + Load the references of a statement. - :param claims: A list of claims the entity should have - :param cqid: - :return: An entity ID, None if there is more than one. + :param sid: A statement ID. + :param limit: The limit to request at one time. + :return: A References object. """ + offset = 0 - matching_qids: set[str] | None = self.get_items(claims=claims, cqid=cqid) + if not isinstance(sid, str): + raise ValueError('sid must be a string') - if matching_qids is None: - return None + limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore - # check if there are any items that have all of these values - # if not, a write is required no matter what - if not len(matching_qids) == 1: - log.debug("no matches (%s)", len(matching_qids)) - return None + # TODO: Add cache - return matching_qids.pop() + # We force a refresh of the data, remove the previous results + references: References = References() + while True: + query = f''' + #Tool: WikibaseIntegrator wbi_fastrun._load_references + SELECT ?srid ?ref_property ?ref_value ?property_type WHERE {{ + VALUES ?sid {{ <{sid}> }} + + ?sid prov:wasDerivedFrom ?srid. + ?srid ?ref_predicate ?ref_value. + ?ref_property wikibase:reference ?ref_predicate. + ?ref_property wikibase:propertyType ?property_type. 
+ }} + ORDER BY ?srid + OFFSET {offset} + LIMIT {limit} + ''' - def write_required(self, data: list[Claim], action_if_exists: ActionIfExists = ActionIfExists.REPLACE_ALL, cqid: str | None = None) -> bool: - """ - Check if a write is required + # Format the query + # query = query.format(wb_url=wb_url, sid=sid, offset=str(offset), limit=str(limit)) + offset += limit # We increase the offset for the next iteration + results = execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] - :param data: - :param action_if_exists: - :param cqid: - :return: Return True if the write is required - """ - del_props = set() - data_props = set() - append_props = [] - if action_if_exists == ActionIfExists.APPEND_OR_REPLACE: - append_props = [x.mainsnak.property_number for x in data] - - for x in data: - if x.mainsnak.datavalue and x.mainsnak.datatype: - data_props.add(x.mainsnak.property_number) - qid = self.get_item(data, cqid) - - if not qid: - return True + reference = {} - reconstructed_statements = self.reconstruct_statements(qid) - tmp_rs = copy.deepcopy(reconstructed_statements) - - # handle append properties - for p in append_props: - app_data = [x for x in data if x.mainsnak.property_number == p] # new statements - rec_app_data = [x for x in tmp_rs if x.mainsnak.property_number == p] # orig statements - comp = [] - for x in app_data: - for y in rec_app_data: - if x.mainsnak.datavalue == y.mainsnak.datavalue: - if y.equals(x, include_ref=self.use_refs) and action_if_exists != ActionIfExists.FORCE_APPEND: - comp.append(True) - - # comp = [True for x in app_data for y in rec_app_data if x.equals(y, include_ref=self.use_refs)] - if len(comp) != len(app_data): - log.debug("failed append: %s", p) - return True - - tmp_rs = [x for x in tmp_rs if x.mainsnak.property_number not in append_props and x.mainsnak.property_number in data_props] - - for date in data: - # ensure that statements meant for deletion get handled properly - reconst_props = 
{x.mainsnak.property_number for x in tmp_rs} - if not date.mainsnak.datatype and date.mainsnak.property_number in reconst_props: - log.debug("returned from delete prop handling") - return True - - if not date.mainsnak.datavalue or not date.mainsnak.datatype: - # Ignore the deletion statements which are not in the reconstructed statements. - continue + for result in results: + ref_property = result['ref_property']['value'] + srid = result['srid']['value'] + property_type = result['property_type']['value'] - if date.mainsnak.property_number in append_props: - # TODO: check if value already exist and already have the same value - continue + if ref_property not in self.properties_type: + self.properties_type[ref_property] = property_type - if not date.mainsnak.datavalue and not date.mainsnak.datatype: - del_props.add(date.mainsnak.property_number) + # Use casefold for lower case + if self.case_insensitive: + result['value']['value'] = result['value']['value'].casefold() - # this is where the magic happens - # date is a new statement, proposed to be written - # tmp_rs are the reconstructed statements == current state of the item - bool_vec = [] - for x in tmp_rs: - if (x == date or (self.case_insensitive and x.mainsnak.datavalue.casefold() == date.mainsnak.datavalue.casefold())) and x.mainsnak.property_number not in del_props: - bool_vec.append(x.equals(date, include_ref=self.use_refs)) - else: - bool_vec.append(False) - # bool_vec = [x.equals(date, include_ref=self.use_refs, fref=self.ref_comparison_f) and - # x.mainsnak.property_number not in del_props for x in tmp_rs] - - log.debug("bool_vec: %s", bool_vec) - log.debug("-----------------------------------") - for x in tmp_rs: - if x == date and x.mainsnak.property_number not in del_props: - log.debug([x.mainsnak.property_number, x.mainsnak.datavalue, [z.datavalue for z in x.qualifiers]]) - log.debug([date.mainsnak.property_number, date.mainsnak.datavalue, [z.datavalue for z in date.qualifiers]]) - elif 
x.mainsnak.property_number == date.mainsnak.property_number: - log.debug([x.mainsnak.property_number, x.mainsnak.datavalue, [z.datavalue for z in x.qualifiers]]) - log.debug([date.mainsnak.property_number, date.mainsnak.datavalue, [z.datavalue for z in date.qualifiers]]) - - if not any(bool_vec): - log.debug(len(bool_vec)) - log.debug("fast run failed at %s", date.mainsnak.property_number) - return True - - log.debug("fast run success") - tmp_rs.pop(bool_vec.index(True)) - - if len(tmp_rs) > 0: - log.debug("failed because not zero") - for x in tmp_rs: - log.debug([x.mainsnak.property_number, x.mainsnak.datavalue, [z.mainsnak.datavalue for z in x.qualifiers]]) - log.debug("failed because not zero--END") - return True + f = [x for x in self.base_data_type.subclasses if x.PTYPE == property_type][0](prop_nr=ref_property).from_sparql_value(sparql_value=result['ref_value']) - return False + if srid not in reference: + reference[srid] = Reference() - def init_language_data(self, lang: str, lang_data_type: str) -> None: - """ - Initialize language data store + reference[srid].add(f) - :param lang: language code - :param lang_data_type: 'label', 'description' or 'aliases' - :return: None - """ - if lang not in self.loaded_langs: - self.loaded_langs[lang] = {} + # Add each Reference to the References + for _, ref in reference.items(): + references.add(ref) - if lang_data_type not in self.loaded_langs[lang]: - result = self._query_lang(lang=lang, lang_data_type=lang_data_type) - if result is not None: - data = self._process_lang(result=result) - self.loaded_langs[lang].update({lang_data_type: data}) + if len(results) == 0 or len(results) < limit: + break + + return references - def get_language_data(self, qid: str, lang: str, lang_data_type: str) -> list[str]: + def _load_rank(self, sid: str) -> WikibaseRank | None: """ - get language data for specified qid - - :param qid: Wikibase item id - :param lang: language code - :param lang_data_type: 'label', 'description' or 
'aliases' - :return: list of strings - If nothing is found: - If lang_data_type == label: returns [''] - If lang_data_type == description: returns [''] - If lang_data_type == aliases: returns [] + Load the rank of a statement. + + :param sid: A statement ID. + :param limit: The limit to request at one time. + :return: A References object. """ - self.init_language_data(lang, lang_data_type) - current_lang_data = self.loaded_langs[lang][lang_data_type] - all_lang_strings = current_lang_data.get(qid, []) - if not all_lang_strings and lang_data_type in {'label', 'description'}: - all_lang_strings = [''] - return all_lang_strings + if not isinstance(sid, str): + raise ValueError('sid must be a string') - def check_language_data(self, qid: str, lang_data: list, lang: str, lang_data_type: str, action_if_exists: ActionIfExists = ActionIfExists.APPEND_OR_REPLACE) -> bool: - """ - Method to check if certain language data exists as a label, description or aliases - :param qid: Wikibase item id - :param lang_data: list of string values to check - :param lang: language code - :param lang_data_type: What kind of data is it? 'label', 'description' or 'aliases'? - :param action_if_exists: If aliases already exist, APPEND_OR_REPLACE or REPLACE_ALL - :return: boolean - """ - all_lang_strings = {x.strip().casefold() for x in self.get_language_data(qid, lang, lang_data_type)} + # TODO: Add limit? - if action_if_exists == ActionIfExists.REPLACE_ALL: - return collections.Counter(all_lang_strings) != collections.Counter(map(lambda x: x.casefold(), lang_data)) + # TODO: Add cache - for s in lang_data: - if s.strip().casefold() not in all_lang_strings: - log.debug("fastrun failed at: %s, string: %s", lang_data_type, s) - return True + query = f''' + #Tool: WikibaseIntegrator wbi_fastrun._load_rank + SELECT ?rank WHERE {{ + VALUES ?sid {{ <{sid}> }} + ?sid wikibase:rank ?rank. 
+ }} + ''' - return False + results = execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] - def get_all_data(self) -> dict[str, dict]: - return self.prop_data + for result in results: + rank_raw = result['rank']['value'].rsplit('#', 1)[-1] - def format_query_results(self, r: list, prop_nr: str) -> None: - """ - `r` is the results of the sparql query in _query_data and is modified in place - `prop_nr` is needed to get the property datatype to determine how to format the value - - `r` is a list of dicts. The keys are: - sid: statement ID - item: the subject. the item this statement is on - v: the object. The value for this statement - unit: property unit - pq: qualifier property - qval: qualifier value - qunit: qualifier unit - ref: reference ID - pr: reference property - rval: reference value + if rank_raw == 'PreferredRank': + return WikibaseRank.PREFERRED + elif rank_raw == 'NormalRank': + return WikibaseRank.NORMAL + elif rank_raw == 'DeprecatedRank': + return WikibaseRank.DEPRECATED + + return None + + def _get_property_type(self, prop_nr: str | int) -> str: """ - prop_dt = self.get_prop_datatype(prop_nr) - for i in r: - for value in ['item', 'sid', 'pq', 'pr', 'ref', 'unit', 'qunit']: - if value in i: - if i[value]['value'].startswith(self.wikibase_url): - i[value] = i[value]['value'].split('/')[-1] - else: - # TODO: Dirty fix. If we are not on wikidata, we force unitless (Q199) to '1' - if i[value]['value'] == 'http://www.wikidata.org/entity/Q199': - i[value] = '1' - else: - i[value] = i[value]['value'] - - # make sure datetimes are formatted correctly. - # the correct format is '+%Y-%m-%dT%H:%M:%SZ', but is sometimes missing the plus?? 
- # some difference between RDF and xsd:dateTime that I don't understand - for value in ['v', 'qval', 'rval']: - if value in i: - if i[value].get("datatype") == 'http://www.w3.org/2001/XMLSchema#dateTime' and not i[value]['value'][0] in '+-': - # if it is a dateTime and doesn't start with plus or minus, add a plus - i[value]['value'] = '+' + i[value]['value'] - - # these three ({'v', 'qval', 'rval'}) are values that can be any data type - # strip off the URI if they are wikibase-items - if 'v' in i: - if i['v']['type'] == 'uri' and prop_dt == 'wikibase-item': - i['v'] = i['v']['value'].split('/')[-1] - elif i['v']['type'] == 'literal' and prop_dt == 'quantity': - i['v'] = format_amount(i['v']['value']) - elif i['v']['type'] == 'literal' and prop_dt == 'monolingualtext': - f = [x for x in self.base_data_type.subclasses if x.DTYPE == prop_dt][0](prop_nr=prop_nr, text=i['v']['value'], language=i['v']['xml:lang']) - i['v'] = f.get_sparql_value() - else: - f = [x for x in self.base_data_type.subclasses if x.DTYPE == prop_dt][0](prop_nr=prop_nr) - if not f.parse_sparql_value(value=i['v']['value'], type=i['v']['type']): - raise ValueError("Can't parse the value with parse_sparql_value()") - i['v'] = f.get_sparql_value() - - # Note: no-value and some-value don't actually show up in the results here - # see for example: select * where { wd:Q7207 p:P40 ?c . 
?c ?d ?e } - if not isinstance(i['v'], dict): - self.rev_lookup[i['v']].add(i['item']) - if self.case_insensitive: - self.rev_lookup_ci[i['v'].casefold()].add(i['item']) - - # handle qualifier value - if 'qval' in i: - qual_prop_dt = self.get_prop_datatype(prop_nr=i['pq']) - if i['qval']['type'] == 'uri' and qual_prop_dt == 'wikibase-item': - i['qval'] = i['qval']['value'].split('/')[-1] - elif i['qval']['type'] == 'literal' and qual_prop_dt == 'quantity': - i['qval'] = format_amount(i['qval']['value']) - else: - i['qval'] = i['qval']['value'] - - # handle reference value - if 'rval' in i: - ref_prop_dt = self.get_prop_datatype(prop_nr=i['pr']) - if i['rval']['type'] == 'uri' and ref_prop_dt == 'wikibase-item': - i['rval'] = i['rval']['value'].split('/')[-1] - elif i['rval']['type'] == 'literal' and ref_prop_dt == 'quantity': - i['rval'] = format_amount(i['rval']['value']) - else: - i['rval'] = i['rval']['value'] - - def update_frc_from_query(self, r: list, prop_nr: str) -> None: - # r is the output of format_query_results - # this updates the frc from the query (result of _query_data) - for i in r: - qid = i['item'] - if qid not in self.prop_data: - self.prop_data[qid] = {prop_nr: {}} - if prop_nr not in self.prop_data[qid]: - self.prop_data[qid].update({prop_nr: {}}) - if i['sid'] not in self.prop_data[qid][prop_nr]: - self.prop_data[qid][prop_nr].update({i['sid']: {}}) - # update values for this statement (not including ref) - d = {'v': i['v']} - self.prop_data[qid][prop_nr][i['sid']].update(d) - - if 'qual' not in self.prop_data[qid][prop_nr][i['sid']]: - self.prop_data[qid][prop_nr][i['sid']]['qual'] = set() - if 'pq' in i and 'qval' in i: - if 'qunit' in i: - self.prop_data[qid][prop_nr][i['sid']]['qual'].add((i['pq'], i['qval'], i['qunit'])) - else: - self.prop_data[qid][prop_nr][i['sid']]['qual'].add((i['pq'], i['qval'], '1')) + Obtain the property type of the given property by looking at the SPARQL endpoint. 
- if 'ref' not in self.prop_data[qid][prop_nr][i['sid']]: - self.prop_data[qid][prop_nr][i['sid']]['ref'] = {} - if 'ref' in i: - if i['ref'] not in self.prop_data[qid][prop_nr][i['sid']]['ref']: - self.prop_data[qid][prop_nr][i['sid']]['ref'][i['ref']] = set() - self.prop_data[qid][prop_nr][i['sid']]['ref'][i['ref']].add((i['pr'], i['rval'])) + :param prop_nr: The property number. + :return: The SPARQL version of the property type. + """ + if isinstance(prop_nr, int): + prop_nr = 'P' + str(prop_nr) + elif prop_nr is not None: + pattern = re.compile(r'^P?([0-9]+)$') + matches = pattern.match(prop_nr) - if 'unit' not in self.prop_data[qid][prop_nr][i['sid']]: - self.prop_data[qid][prop_nr][i['sid']]['unit'] = '1' - if 'unit' in i: - self.prop_data[qid][prop_nr][i['sid']]['unit'] = i['unit'] + if not matches: + raise ValueError('Invalid prop_nr, format must be "P[0-9]+"') - def _query_data(self, prop_nr: str, use_units: bool = False, page_size: int = 10000) -> None: - page_count = 0 + prop_nr = 'P' + str(matches.group(1)) - while True: - # Query header - query = ''' - #Tool: WikibaseIntegrator wbi_fastrun._query_data - SELECT ?sid ?item ?v ?unit ?pq ?qval ?qunit ?ref ?pr ?rval - WHERE - {{ - ''' + query = f'''#Tool: WikibaseIntegrator wbi_fastrun._get_property_type + SELECT ?property_type WHERE {{ wd:{prop_nr} wikibase:propertyType ?property_type. }}''' - # Base filter - query += ''' - {base_filter} + results = execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'][0]['property_type']['value'] - ?item <{wb_url}/prop/{prop_nr}> ?sid . - ''' + return results - # Amount and unit - if use_units: - query += ''' - {{ - <{wb_url}/entity/{prop_nr}> wikibase:propertyType ?property_type . - FILTER (?property_type != wikibase:Quantity) - ?sid <{wb_url}/prop/statement/{prop_nr}> ?v . 
- }} - # Get amount and unit for the statement - UNION - {{ - ?sid <{wb_url}/prop/statement/value/{prop_nr}> [wikibase:quantityAmount ?v; wikibase:quantityUnit ?unit] . - }} - ''' - else: - query += ''' - <{wb_url}/entity/{prop_nr}> wikibase:propertyType ?property_type . - ?sid <{wb_url}/prop/statement/{prop_nr}> ?v . - ''' - - # Qualifiers - # Amount and unit - if use_units: - query += ''' - # Get qualifiers - OPTIONAL - {{ - {{ - # Get simple values for qualifiers which are not of type quantity - ?sid ?propQualifier ?qval . - ?pq wikibase:qualifier ?propQualifier . - ?pq wikibase:propertyType ?qualifer_property_type . - FILTER (?qualifer_property_type != wikibase:Quantity) - }} - UNION - {{ - # Get amount and unit for qualifiers of type quantity - ?sid ?pqv [wikibase:quantityAmount ?qval; wikibase:quantityUnit ?qunit] . - ?pq wikibase:qualifierValue ?pqv . - }} - }} - ''' - else: - query += ''' - # Get qualifiers - OPTIONAL - {{ - # Get simple values for qualifiers - ?sid ?propQualifier ?qval . - ?pq wikibase:qualifier ?propQualifier . - ?pq wikibase:propertyType ?qualifer_property_type . - }} - ''' - - # References - if self.use_refs: - query += ''' - # get references - OPTIONAL {{ - ?sid prov:wasDerivedFrom ?ref . - ?ref ?pr ?rval . - [] wikibase:reference ?pr - }} - ''' - # Query footer - query += ''' - }} ORDER BY ?sid OFFSET {offset} LIMIT {page_size} - ''' + def get_entities(self, claims: list[Claim] | Claims | Claim, cache: bool | None = None, query_limit: int | None = None) -> list[str]: + """ + Return a list of entities who correspond to the specified claims. - # Format the query - query = query.format(wb_url=self.wikibase_url, base_filter=self.base_filter_string, prop_nr=prop_nr, offset=str(page_count * page_size), page_size=str(page_size)) + :param claims: A list of claims to query the SPARQL endpoint. + :param cache: Put data returned by WDQS in cache. Enabled by default. 
+ :param query_limit: Limit the amount of results from the SPARQL server + :return: A list of entity ID. + """ + if isinstance(claims, Claim): + claims = [claims] + elif (not isinstance(claims, list) or not all(isinstance(n, Claim) for n in claims)) and not isinstance(claims, Claims): + raise ValueError("claims must be an instance of Claim or Claims or a list of Claim") - results = execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] - self.format_query_results(results, prop_nr) - self.update_frc_from_query(results, prop_nr) - page_count += 1 + self.load_statements(claims=claims, cache=cache, limit=query_limit) + + result = [] + for base_filter in self.base_filter: + sub_result = set() + if isinstance(base_filter, BaseDataType): # TODO: Manage case where filter is a list of BaseDataType + if not base_filter.mainsnak.datavalue: + for claim in claims: + if base_filter.mainsnak.property_number == claim.mainsnak.property_number: + # Add the returned entities to the result list + if claim.get_sparql_value() in self.data[claim.mainsnak.property_number]: + for rez in self.data[claim.mainsnak.property_number][claim.get_sparql_value()]: # type: ignore + sub_result.add(rez['entity'].rsplit('/', 1)[-1]) + else: + if base_filter.mainsnak.property_number in self.data: + if base_filter.get_sparql_value() in self.data[base_filter.mainsnak.property_number]: + for rez in self.data[base_filter.mainsnak.property_number][base_filter.get_sparql_value()]: # type: ignore + sub_result.add(rez['entity'].rsplit('/', 1)[-1]) + else: + continue + result.append(sub_result) - if len(results) == 0 or len(results) < page_size: - break + if result: + if len(result) > 1: + return list(set(result[0]).intersection(*result[1:])) + return list(result[0]) + else: + return [] - def _query_lang(self, lang: str, lang_data_type: str) -> list[dict[str, dict]] | None: + def write_required(self, claims: list[Claim] | Claims | Claim, entity_filter: list[str] | str | None = 
None, property_filter: list[str] | str | None = None, + use_qualifiers: bool | None = None, use_references: bool | None = None, use_rank: bool | None = None, cache: bool | None = None, + query_limit: int | None = None) -> bool: """ - :param lang: - :param lang_data_type: + :param claims: + :param entity_filter: Allows you to filter the entities checked. This can be a single entity or a list of entities. + :param property_filter: Allows you to limit the difference comparison to a list of properties + :param use_qualifiers: Use qualifiers during fastrun. Enabled by default. + :param use_references: Use references during fastrun. Disabled by default. + :param use_rank: Use rank during fastrun. Disabled by default. + :param cache: Put data returned by WDQS in cache. Enabled by default. + :param query_limit: Limit the amount of results from the SPARQL server + :return: a boolean True if a write is required. False otherwise. """ - lang_data_type_dict = { - 'label': 'rdfs:label', - 'description': 'schema:description', - 'aliases': 'skos:altLabel' - } + if isinstance(claims, Claim): + claims = [claims] + elif (not isinstance(claims, list) or not all(isinstance(n, Claim) for n in claims)) and not isinstance(claims, Claims): + raise ValueError("claims must be an instance of Claim or Claims or a list of Claim") - query = f''' - #Tool: WikibaseIntegrator wbi_fastrun._query_lang - SELECT ?item ?label WHERE {{ - {self.base_filter_string} + if len(claims) == 0: + raise ValueError("claims must have at least one claim") - OPTIONAL {{ - ?item {lang_data_type_dict[lang_data_type]} ?label FILTER (lang(?label) = "{lang}") . 
- }} - }} - ''' + if entity_filter is not None and isinstance(entity_filter, str): + entity_filter = [entity_filter] - log.debug(query) - - return execute_sparql_query(query=query, endpoint=self.sparql_endpoint_url)['results']['bindings'] - - @staticmethod - def _process_lang(result: list) -> defaultdict[str, set]: - data = defaultdict(set) - for r in result: - qid = r['item']['value'].split("/")[-1] - if 'label' in r: - data[qid].add(r['label']['value']) - return data - - @lru_cache(maxsize=100000) - def get_prop_datatype(self, prop_nr: str) -> str | None: # pylint: disable=no-self-use - from wikibaseintegrator import WikibaseIntegrator - wbi = WikibaseIntegrator() - property = wbi.property.get(prop_nr) - datatype = property.datatype - if isinstance(datatype, WikibaseDatatype): - return datatype.value - return datatype - - def clear(self) -> None: - """ - convenience function to empty this fastrun container - """ - self.prop_dt_map = {} - self.prop_data = {} - self.rev_lookup = defaultdict(set) - self.rev_lookup_ci = defaultdict(set) + if property_filter is not None and isinstance(property_filter, str): + property_filter = [property_filter] - def __repr__(self) -> str: - """A mixin implementing a simple __repr__.""" - return "<{klass} @{id:x} {attrs}>".format( # pylint: disable=consider-using-f-string - klass=self.__class__.__name__, - id=id(self) & 0xFFFFFF, - attrs="\r\n\t ".join(f"{k}={v!r}" for k, v in self.__dict__.items()), - ) + # Generate a property_filter if None is given + if property_filter is None: + property_filter = [claim.mainsnak.property_number for claim in claims] + if use_qualifiers is None: + use_qualifiers = self.use_qualifiers + if use_references is None: + use_references = self.use_references + if use_rank is None: + use_rank = self.use_rank -def get_fastrun_container(base_filter: list[BaseDataType | list[BaseDataType]] | None = None, use_refs: bool = False, case_insensitive: bool = False) -> FastRunContainer: + def contains(in_list, 
lambda_filter): + for x in in_list: + if lambda_filter(x): + return True + return False + + # Get all the potential statements + statements_to_check: dict[str, list[str]] = {} + for claim in claims: + if claim.mainsnak.property_number in property_filter: + self.load_statements(claims=claim, cache=cache, limit=query_limit) + if claim.mainsnak.property_number in self.data: + if not contains(self.data[claim.mainsnak.property_number], (lambda x, c=claim: x == c.get_sparql_value())): + # Checks if a property with this value does not exist, return True if none exist + logging.debug("Value '%s' does not exist for property '%s'", claim.get_sparql_value(), claim.mainsnak.property_number) + return True + # TODO: Doesn't work in the value already exists in another entity + + sparql_value = claim.get_sparql_value() + if sparql_value: + for statement in self.data[claim.mainsnak.property_number][sparql_value]: + if claim.mainsnak.property_number not in statements_to_check: + statements_to_check[claim.mainsnak.property_number] = [] + statements_to_check[claim.mainsnak.property_number].append(statement['entity']) + + # Generate an intersection between all the statements by property, based on the entity + # Generate only the list of entities + list_entities: list[list[str]] = [] + for _, statements in statements_to_check.items(): + # entities = [statement['entity'] for statement in statements_to_check[property]] + list_entities.append(list(set(statements))) + + # Return the intersection between all the list + common_entities: list = list_entities.pop() + for entities in list_entities: + common_entities = list(set(common_entities).intersection(entities)) + + # If there is none common entities, return True because we need a write + if not common_entities: + logging.debug("There is no common entities") + return True + + # If the property is already found, load it completely to compare deeply + for claim in claims: + # Check if the property is in the filter + if 
claim.mainsnak.property_number in property_filter: + sparql_value = claim.get_sparql_value() + # If the value exist in the cache + if sparql_value and claim.mainsnak.property_number in self.data and sparql_value in self.data[claim.mainsnak.property_number]: + entity_cache = [statement['entity'].rsplit('/', 1)[-1] for statement in self.data[claim.mainsnak.property_number][sparql_value]] + if entity_filter: + common_cache_filter = [value for value in entity_cache if value in entity_filter] + else: + common_cache_filter = entity_cache + # If there is common entities between the cache and the entity_filter + if common_cache_filter: + for statement in self.data[claim.mainsnak.property_number][sparql_value]: + if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter: + continue + + if statement['entity'] in common_entities: + if use_qualifiers: + qualifiers = self._load_qualifiers(statement['sid'], limit=100) + + if len(qualifiers) != len(claim.qualifiers): + logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers)) + return True + + for qualifier in qualifiers: + if qualifier not in claim.qualifiers: + logging.debug("Difference between two qualifiers") + return True + + if use_references: + references = self._load_references(statement['sid'], limit=100) + + if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references): + logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references)) + return True + + for reference in references: + if reference not in claim.references: + logging.debug("Difference between two references") + return True + + if use_rank: + rank = self._load_rank(statement['sid']) + + if claim.rank != rank: + logging.debug("Difference with the rank") + return True + else: + logging.debug("No common entities between cache and entity_filter") + return True + # Enable this if the value doesn't 
exist ? + else: + logging.debug("Value doesn't already exist in an entity") + return True + + return False + + +def get_fastrun_container(base_filter: list[BaseDataType | list[BaseDataType]], use_qualifiers: bool = True, use_references: bool = False, use_rank: bool = False, + cache: bool = True, case_insensitive: bool = False) -> FastRunContainer: + """ + Return a FastRunContainer object, create a new one if it doesn't already exist. + + :param base_filter: The default filter to initialize the dataset. A list made of BaseDataType or list of BaseDataType. + :param use_qualifiers: Use qualifiers during fastrun. Enabled by default. + :param use_references: Use references during fastrun. Disabled by default. + :param use_rank: Use rank during fastrun. Disabled by default. + :param cache: Put data returned by WDQS in cache. Enabled by default. + :param case_insensitive: + :return: a FastRunContainer object + """ if base_filter is None: base_filter = [] # We search if we already have a FastRunContainer with the same parameters to reuse it - fastrun_container = _search_fastrun_store(base_filter=base_filter, use_refs=use_refs, case_insensitive=case_insensitive) + fastrun_container = _search_fastrun_store(base_filter=base_filter, use_qualifiers=use_qualifiers, use_references=use_references, use_rank=use_rank, + case_insensitive=case_insensitive, cache=cache) return fastrun_container -def _search_fastrun_store(base_filter: list[BaseDataType | list[BaseDataType]] | None = None, use_refs: bool = False, case_insensitive: bool = False) -> FastRunContainer: +def _search_fastrun_store(base_filter: list[BaseDataType | list[BaseDataType]], use_qualifiers: bool = True, use_references: bool = False, use_rank: bool = False, + cache: bool = True, case_insensitive: bool = False) -> FastRunContainer: + """ + Search for an existing FastRunContainer with the same parameters or create a new one if it doesn't exist. + + :param base_filter: The default filter to initialize the dataset. 
A list made of BaseDataType or list of BaseDataType. + :param use_qualifiers: Use qualifiers during fastrun. Enabled by default. + :param use_references: Use references during fastrun. Disabled by default. + :param use_rank: Use rank during fastrun. Disabled by default. + :param cache: Put data returned by WDQS in cache. Enabled by default. + :param case_insensitive: + :return: a FastRunContainer object + """ for fastrun in fastrun_store: - if (fastrun.base_filter == base_filter) and (fastrun.use_refs == use_refs) and (fastrun.case_insensitive == case_insensitive) and ( - fastrun.sparql_endpoint_url == config['SPARQL_ENDPOINT_URL']): + if (fastrun.base_filter == base_filter) and (fastrun.use_qualifiers == use_qualifiers) and (fastrun.use_references == use_references) and ( + fastrun.use_rank == use_rank) and (fastrun.case_insensitive == case_insensitive) and (fastrun.sparql_endpoint_url == config['SPARQL_ENDPOINT_URL']): + fastrun.cache = cache return fastrun # In case nothing was found in the fastrun_store log.info("Create a new FastRunContainer") - fastrun_container = FastRunContainer(base_data_type=BaseDataType, base_filter=base_filter, use_refs=use_refs, case_insensitive=case_insensitive) + fastrun_container = FastRunContainer(base_data_type=BaseDataType, base_filter=base_filter, use_qualifiers=use_qualifiers, use_references=use_references, use_rank=use_rank, + cache=cache, case_insensitive=case_insensitive) fastrun_store.append(fastrun_container) return fastrun_container