From 8fc19e59a93c88acee2317e7601669bff8ad5a9c Mon Sep 17 00:00:00 2001 From: Olivier Date: Sat, 11 Apr 2026 15:23:35 +0200 Subject: [PATCH 01/22] create __init__.py and add filter for warning 'UserWarning: Field name "schema" in "Resource" shadows an attribute in parent "BaseModel"' --- coordo-py/coordo/__init__.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 coordo-py/coordo/__init__.py diff --git a/coordo-py/coordo/__init__.py b/coordo-py/coordo/__init__.py new file mode 100644 index 0000000..a6cf95c --- /dev/null +++ b/coordo-py/coordo/__init__.py @@ -0,0 +1,19 @@ +import warnings + +# ignore warnings due to shadowing of Pydantic's "schema" field in "Resource" +REGEX_TO_IGNORE = ( + 'Field name "schema" in "Resource" shadows an attribute in parent "(Base)?Model"' +) +warnings.filterwarnings( + "ignore", + category=UserWarning, + module="dplib", + message=REGEX_TO_IGNORE, +) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + module="coordo", + message=REGEX_TO_IGNORE, +) From f004f9202ec4b8781d53ad73a593511f9bbdf889 Mon Sep 17 00:00:00 2001 From: Olivier Date: Sun, 12 Apr 2026 19:07:32 +0200 Subject: [PATCH 02/22] implement functionality to overwrite existing file --- coordo-py/coordo/__init__.py | 14 ++++++ coordo-py/coordo/cli.py | 9 ++-- coordo-py/coordo/datapackage/package.py | 56 +++++++++++++++++---- coordo-py/coordo/datapackage/resource.py | 6 +++ coordo-py/coordo/loaders/kobotoolbox.py | 62 +++++++++++++++++++----- 5 files changed, 122 insertions(+), 25 deletions(-) diff --git a/coordo-py/coordo/__init__.py b/coordo-py/coordo/__init__.py index a6cf95c..8926ded 100644 --- a/coordo-py/coordo/__init__.py +++ b/coordo-py/coordo/__init__.py @@ -1,4 +1,7 @@ import warnings +from enum import Enum +from typing import Annotated +import typer # ignore warnings due to shadowing of Pydantic's "schema" field in "Resource" REGEX_TO_IGNORE = ( @@ -17,3 +20,14 @@ module="coordo", message=REGEX_TO_IGNORE, ) + + +class 
LoadingStrategy(str, Enum): + raise_error = "raise_error" + overwrite = "overwrite" + merge = "merge" + +StrategyType = Annotated[ + LoadingStrategy, + typer.Option(help="Strategy to use in case of already existing resource") +] \ No newline at end of file diff --git a/coordo-py/coordo/cli.py b/coordo-py/coordo/cli.py index 2ff3bc5..bd02b86 100644 --- a/coordo-py/coordo/cli.py +++ b/coordo-py/coordo/cli.py @@ -7,7 +7,7 @@ import typer from dplib.models.schema.foreignKey import ForeignKey, ForeignKeyReference -from coordo import loaders +from coordo import loaders, LoadingStrategy, StrategyType from coordo.datapackage import DataPackage from coordo.sql.builder import build_query @@ -78,14 +78,16 @@ def static_files(filename): load = typer.Typer() + @load.command() def kobotoolbox( xlsform: Path, xlsdata: Path, package: Path = typer.Option(help="Path to the package directory"), + strategy: StrategyType = LoadingStrategy.raise_error, ): dp = DataPackage.from_path(package) - loaders.kobotoolbox.load(dp, xlsform, xlsdata) + loaders.kobotoolbox.load(dp, xlsform, xlsdata, strategy) dp.save() @@ -93,10 +95,11 @@ def kobotoolbox( def file( path: Path, package: Path = typer.Option(".", help="Path to the package directory"), + strategy: StrategyType = LoadingStrategy.raise_error, ): dp = DataPackage.from_path(package) try: - loaders.file.load(dp, path) + loaders.file.load(dp, path, strategy) except ValueError as e: raise typer.BadParameter( f"{e} Add --overwrite if you wish to continue.", param_hint="path" diff --git a/coordo-py/coordo/datapackage/package.py b/coordo-py/coordo/datapackage/package.py index b405d28..5638280 100644 --- a/coordo-py/coordo/datapackage/package.py +++ b/coordo-py/coordo/datapackage/package.py @@ -22,6 +22,7 @@ from dplib.plugins.sql.models import SqlSchema from pygeofilter.ast import AstType +from coordo import LoadingStrategy from coordo.sql.builder import build_query, compile_query from coordo.sql.helpers import load_conn @@ -91,26 +92,61 @@ def 
save(self): ) def remove_resource(self, name: str) -> None: + """ + Remove a resource from the package: + - for all other resources in the current datapackage, remove any foreign keys pointing to this resource. + - remove the file associated with the resource. + Args: + name (str): the name of the resource to remove + """ + print(f"Removing resource {name} from DataPackage {self.name}") resource = self.get_resource(name=name) + # looping over all resources in the current datapackage, other than `name` for res in self.resources: if res.name == name: continue - sm = safe(res, "schema") - if sm.foreignKeys: - for fk in sm.foreignKeys: - assert fk.reference.resource != name, ( - f"Can't remove the resource {name} : {res.name} have a foreign key pointing to this resource." - ) + + # getting the schema of the resource + res_schema = safe(res, "schema") + + # removing all foreign keys pointing to `name`, if any + # we do it in two steps: first collect all keys to remove, then remove them + # so that we don't modify the list while iterating over it + if res_schema.foreignKeys: + foreign_keys_to_remove = [] + for fk in res_schema.foreignKeys: + if fk.reference.resource == name: + foreign_keys_to_remove.append(fk) + for fk in foreign_keys_to_remove: + res.remove_foreignkey(fk) + if resource.path: path = handle_path(resource.path) Path(self._basepath / path).unlink() self.resources = [res for res in self.resources if res.name != name] - def add_resource(self, resource: Resource) -> None: + def add_resource(self, resource: Resource, strategy: LoadingStrategy) -> None: + """ + Add a resource to the DataPackage. + + Args: + resource (Resource): The resource to add. + strategy (LoadingStrategy): The strategy to use when a resource with the same name already exists. 
+ """ + print(f"Adding resource {resource.name} to DataPackage {self.name} with strategy={strategy.name}") if any(res.name == resource.name for res in self.resources): - raise ValueError( - f"A resource named {resource.name} already exists in package {self.name}." - ) + if strategy == LoadingStrategy.overwrite: + self.remove_resource(resource.name) + elif strategy == LoadingStrategy.merge: + pass + elif strategy == LoadingStrategy.raise_error: + raise ValueError( + f"A resource named {resource.name} already exists in package {self.name}." + ) + else: + raise ValueError( + f"Unknown strategy {strategy} for resource {resource.name}." + ) resource._package = self self.resources.append(resource) diff --git a/coordo-py/coordo/datapackage/resource.py b/coordo-py/coordo/datapackage/resource.py index 67a3c32..d0162e3 100644 --- a/coordo-py/coordo/datapackage/resource.py +++ b/coordo-py/coordo/datapackage/resource.py @@ -68,6 +68,12 @@ def add_foreignkey(self, fk: ForeignKey) -> None: f in field_names ), f"Resource {parent_resource.name} has no field named {f}" self.schema.foreignKeys.append(fk) + + def remove_foreignkey(self, fk: ForeignKey) -> None: + if fk not in self.schema.foreignKeys: + raise ValueError(f"Foreign key {fk} not found in resource {self.name}") + self.schema.foreignKeys.remove(fk) + @model_validator(mode="after") def check_data_or_path(self) -> Self: diff --git a/coordo-py/coordo/loaders/kobotoolbox.py b/coordo-py/coordo/loaders/kobotoolbox.py index 5898ab0..d992862 100644 --- a/coordo-py/coordo/loaders/kobotoolbox.py +++ b/coordo-py/coordo/loaders/kobotoolbox.py @@ -14,6 +14,7 @@ from pyxform.xls2json import parse_file_to_json from shapely.geometry import Point +from coordo import LoadingStrategy from coordo.datapackage import ( DataPackage, Field, @@ -134,11 +135,20 @@ def coords_to_point(coords): return Point(lon, lat, alt) -def load(package: DataPackage, xlsform: Path, xlsdata: Path): +def load(dp: DataPackage, xlsform: Path, xlsdata: Path, strategy: 
LoadingStrategy): + """ + Loads a datapackage from an XLS form and XLS data file. + The xlsform is parsed with the pyxform.xls2json.parse_file_to_json function + while the xlsdata is parsed with pandas read_excel or read_csv functions. + """ + # parse form from xlsform + print(f"Parsing form from {xlsform}") form = parse_file_to_json(str(xlsform)) name = cast(str, form["id_string"].lower()) main_resource = _create_resource(name) - _parse_form(package, form, main_resource) + _parse_form(dp, form, main_resource, strategy) + + print(f"Parsing data from {xlsdata}") if xlsdata.suffix == ".xlsx": sheets_dict = pd.read_excel(xlsdata, sheet_name=None) elif xlsdata.suffix == ".csv": @@ -153,9 +163,10 @@ def load(package: DataPackage, xlsform: Path, xlsdata: Path): } else: raise ValueError(f"Unsupported file format: {xlsdata}") + for i, (sheet_name, sheet) in enumerate(sheets_dict.items()): table_name = main_resource.name if i == 0 else sheet_name.lower() - resource = next(r for r in package.resources if r.name == table_name) + resource = next(r for r in dp.resources if r.name == table_name) schema = safe(resource, "schema") sheet = ( sheet.rename( @@ -185,7 +196,7 @@ def load(package: DataPackage, xlsform: Path, xlsdata: Path): sheet = sheet[fields] sheet = sheet.replace({np.nan: None}) - path = Path(package._basepath, table_name + ".parquet") + path = Path(dp._basepath, table_name + ".parquet") geo_cols = [f.name for f in schema.fields if f.type == "geojson"] if geo_cols: gdf = gpd.GeoDataFrame(sheet, geometry=geo_cols[0], crs="EPSG:4326") @@ -200,7 +211,7 @@ def load(package: DataPackage, xlsform: Path, xlsdata: Path): sheet.to_parquet(path, index=False) -def _create_resource(name) -> Resource: +def _create_resource(name: str) -> Resource: return Resource( name=name, path=name + ".parquet", @@ -211,20 +222,47 @@ def _create_resource(name) -> Resource: ) -def _parse_form(pkg: DataPackage, form, resource: Resource): - _parse_questions(pkg, form["children"], resource) - 
pkg.add_resource(resource) +def _parse_form(pkg: DataPackage, form: dict[str, Any], resource: Resource, strategy: LoadingStrategy): + _parse_questions(pkg, form["children"], resource, strategy) + # print(resource) + pkg.add_resource(resource, strategy) -def _parse_questions(pkg, questions: List[Dict[str, Any]], resource: Resource): +def _parse_questions( + pkg: DataPackage, questions: List[Dict[str, Any]], resource: Resource, strategy: LoadingStrategy +): + """ + Parses questions (list of dictionaries) and adds them to the resource's schema. + Example of structure of questions: + [ + { + 'type': 'integer', + 'name': '', + 'label': '