-
Notifications
You must be signed in to change notification settings - Fork 0
Add possibility to overwrite data in datapackage #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8fc19e5
f004f92
25f5b33
93f3e2b
87ebb76
e4c70ad
4c52975
964bb15
f7698f4
c1d8911
46ae7bc
5649e71
e7cee9b
5c37c8f
a0cfd11
c954c33
eb5d839
cf6370d
cca0854
ff1d50b
bbd0a76
ad0967b
7a912d9
5faedca
2ce1407
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,3 +34,6 @@ jobs: | |
|
|
||
| - name: Run tests | ||
| run: uv run test.py | ||
|
|
||
| - name: Run pytest | ||
| run: uv run pytest | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,4 +2,4 @@ uv.lock | |
| .DS_Store | ||
| catalog/ | ||
| data/ | ||
| .vscode/ | ||
| .vscode/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| # Copyright COORDONNÉES 2025, 2026 | ||
| # SPDX-License-Identifier: MPL-2.0 | ||
|
|
||
| import warnings | ||
|
|
||
|
|
||
| # ignore warnings due to shadowing of Pydantic's "schema" field in "Resource" | ||
| REGEX_TO_IGNORE = ( | ||
| 'Field name "schema" in "Resource" shadows an attribute in parent "(Base)?Model"' | ||
| ) | ||
| warnings.filterwarnings( | ||
| "ignore", | ||
| category=UserWarning, | ||
| module="dplib", | ||
| message=REGEX_TO_IGNORE, | ||
| ) | ||
|
|
||
| warnings.filterwarnings( | ||
| "ignore", | ||
| category=UserWarning, | ||
| module="coordo", | ||
| message=REGEX_TO_IGNORE, | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,13 @@ def handle_path(path: str | list[str]) -> str: | |
| return path | ||
|
|
||
|
|
||
| def check_resource_fields_match(res1: Resource, res2: Resource) -> None: | ||
| if not res1.has_same_schema_as(res2): | ||
| raise ValueError( | ||
| f"Schemas do not match for resources {res1.name!r} and {res2.name!r}" | ||
| ) | ||
|
|
||
|
|
||
| class DataPackage(pydantic.BaseModel): | ||
| id: Optional[str] = None | ||
| name: str = pydantic.Field(pattern=r"^[a-z0-9._-]+$") | ||
|
|
@@ -91,39 +98,64 @@ def save(self): | |
| ) | ||
|
|
||
| def remove_resource(self, name: str) -> None: | ||
| """ | ||
| Remove a resource from the package. | ||
| For all other resources in the current datapackage, check if they have a foreign key pointing to this resource | ||
| and raise error it if so. | ||
| Args: | ||
| name (str): the name of the resource to remove | ||
| """ | ||
| print(f"Removing resource {name!r} from DataPackage {self.name!r}") | ||
| resource = self.get_resource(name=name) | ||
| # looping over all resources in the current datapackage, other than <resource> | ||
| for res in self.resources: | ||
| if res.name == name: | ||
| continue | ||
| sm = safe(res, "schema") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should stay. I'm not really fine with the fact that suppressing a resource is automatically deleting the fks, i prefer we raise, tell the user to delete the fks and try again. It's much more explicit and less error-prone
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hum ok 👍 But then, when adding data in "overwrite" strategy, what should we do?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO coordo should try to avoid doing side effects, if there is foreign keys then it's up to the user (the user being either a person or another project using coordo, data4trees for example) to decide how to manage it |
||
| if sm.foreignKeys: | ||
| for fk in sm.foreignKeys: | ||
| assert fk.reference.resource != name, ( | ||
| f"Can't remove the resource {name} : {res.name} have a foreign key pointing to this resource." | ||
| ) | ||
| res_schema = safe(res, "schema") | ||
| if res_schema.foreignKeys: | ||
| for fk in res_schema.foreignKeys: | ||
| if fk.reference.resource == name: | ||
| # build a string containing the list of foreign key field pairs | ||
| fk_part_names_str = "\n".join(res.get_fk_names(fk)) | ||
| raise ValueError( | ||
| f"Can't remove the resource {name!r} : {res.name!r} has a foreign key pointing to this resource. " | ||
| f"Please remove the following foreign keys beforehand:\n{fk_part_names_str}" | ||
| ) | ||
| # remove the file associated with the resource | ||
| if resource.path: | ||
| path = handle_path(resource.path) | ||
| Path(self._basepath / path).unlink() | ||
| # update resources list | ||
| self.resources = [res for res in self.resources if res.name != name] | ||
|
|
||
| def add_resource(self, resource: Resource) -> None: | ||
| if any(res.name == resource.name for res in self.resources): | ||
| print(f"Adding resource {resource.name!r} to package {self.name!r}") | ||
| if self.resource_exists(resource.name): | ||
| raise ValueError( | ||
| f"A resource named {resource.name} already exists in package {self.name}." | ||
| f"A resource named {resource.name!r} already exists in package {self.name!r}. " | ||
| "Please remove the existing resource before adding a new one." | ||
| ) | ||
| resource._package = self | ||
| self.resources.append(resource) | ||
| else: | ||
| resource._package = self | ||
| self.resources.append(resource) | ||
|
|
||
| def update_resource(self, resource: Resource) -> None: | ||
| pass | ||
|
|
||
| def get_resource(self, name: str) -> Resource: | ||
| resource = next(res for res in self.resources if res.name == name) | ||
| assert resource is not None, f"Resource {name} not found." | ||
| return resource | ||
| found_resources = [res for res in self.resources if res.name == name] | ||
| assert len(found_resources) <= 1, f"Multiple resources named {name!r} found." | ||
| assert len(found_resources) > 0, f"Resource {name!r} not found." | ||
| return found_resources[0] | ||
|
|
||
| def write_resource(self, resource_name: str, it: Iterable[dict]): | ||
| pass | ||
| # resource = self.get_resource(name=resource_name) | ||
| # schema = resource.get_schema() | ||
|
|
||
| def resource_exists(self, name: str) -> bool: | ||
| return any(res.name == name for res in self.resources) | ||
|
|
||
| def prepare_db(self) -> tuple[duckdb.DuckDBPyConnection, sa.MetaData]: | ||
| conn = load_conn() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ | |
|
|
||
| import duckdb | ||
| import pydantic | ||
| from dplib.models import Contributor, Dialect, ForeignKey, License, Schema, Source | ||
| from dplib.models import Contributor, Dialect, ForeignKey, ForeignKeyReference, License, Schema, Source | ||
| from pydantic import model_validator | ||
|
|
||
| from .db_helpers import prepare_path | ||
|
|
@@ -50,7 +50,21 @@ def load_table(self, conn: duckdb.DuckDBPyConnection): | |
| query = f'CREATE VIEW "{self.name}" AS SELECT * FROM {prepare_path(self.package._basepath / self.path)}' | ||
| conn.execute(query) | ||
|
|
||
| def add_foreignkey(self, fk: ForeignKey) -> None: | ||
| def add_foreignkey(self, fields: list[str], foreign_fields: list[str], foreign_resource: str) -> None: | ||
| # TODO: remove this check when addition of multiple fields at once is supported | ||
| if len(fields) > 1 or len(foreign_fields) > 1: | ||
| raise ValueError("Adding a foreign key with multiple fields is not supported yet.") | ||
|
|
||
| fk = ForeignKey( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Composite foreign keys are not yet supported by coordo so we should limit to only one field pointing to one another field. Also we can't have multiple keys pointing to the same external resource (because the parser wouldn't know which one to use when auto-joining)
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that the cli does not support that yet, but perhaps we could already implement something more general? It does not hurt anyway
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I understand that you want to support something more general but IMO it's better to block it and then add it later (and also update the parser at the same time) than to let users do something that is not supported by other parts of coordo
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Even if this is not exposed from the cli, we could for instance use this method from D4T backend , so it is exposed as a lib component. Therefore I rather agree with Mathias, let's not expose something we don't really support yet.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I made the modification :) |
||
| fields=fields, | ||
| reference=ForeignKeyReference( | ||
| fields=foreign_fields, | ||
| resource=None if self.name == foreign_resource else foreign_resource, | ||
| ) | ||
| ) | ||
| fk_part_names_str = " & ".join(self.get_fk_names(fk)) | ||
| print(f"Adding foreign key {fk_part_names_str}") | ||
|
|
||
| if not self._package: | ||
| raise ValueError("You can't add a foreign key to an orphan resource.") | ||
| field_names = [f.name for f in self.schema.fields] | ||
|
|
@@ -64,15 +78,43 @@ def add_foreignkey(self, fk: ForeignKey) -> None: | |
| ) | ||
| field_names = [f.name for f in parent_resource.schema.fields] | ||
| for f in fk.reference.fields: | ||
| assert ( | ||
| f in field_names | ||
| ), f"Resource {parent_resource.name} has no field named {f}" | ||
| assert f in field_names, ( | ||
| f"Resource {parent_resource.name} has no field named {f}" | ||
| ) | ||
| if fk in self.schema.foreignKeys: | ||
| raise ValueError(f"Foreign key {fk_part_names_str} already exists in resource {self.name}") | ||
| self.schema.foreignKeys.append(fk) | ||
|
|
||
| def remove_foreignkey(self, fields: list[str], foreign_fields: list[str], foreign_resource: str) -> None: | ||
| fk = ForeignKey( | ||
| fields=fields, | ||
| reference=ForeignKeyReference( | ||
| fields=foreign_fields, | ||
| resource=None if self.name == foreign_resource else foreign_resource, | ||
| ) | ||
| ) | ||
| fk_part_names_str = " & ".join(self.get_fk_names(fk)) | ||
| print(f"Removing foreign key {fk_part_names_str}") | ||
| if fk not in self.schema.foreignKeys: | ||
| raise ValueError(f"Foreign key {fk_part_names_str} not found in resource {self.name}") | ||
| self.schema.foreignKeys.remove(fk) | ||
|
|
||
| @model_validator(mode="after") | ||
| def check_data_or_path(self) -> Self: | ||
| provided = [self.data, self.path] | ||
| count = len([f for f in provided if f is not None]) | ||
| if count != 1: | ||
| raise ValueError("Exactly one of 'data' or 'path' must be provided.") | ||
| return self | ||
|
|
||
| def has_same_schema_as(self, other: Self) -> bool: | ||
| for attr in vars(self.schema): | ||
| if getattr(self.schema, attr) != getattr(other.schema, attr): | ||
| return False | ||
| return True | ||
|
|
||
| def get_fk_names(self, fk: ForeignKey) -> list[str]: | ||
| return [ | ||
| f"'{self.name}.{field}' -> '{fk.reference.resource}.{reference_field}'" | ||
| for field, reference_field in zip(fk.fields, fk.reference.fields) | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,9 @@ | ||
| # Copyright COORDONNÉES 2025, 2026 | ||
| # SPDX-License-Identifier: MPL-2.0 | ||
|
|
||
| from . import file, kobotoolbox | ||
| from .loader import ResourceAction | ||
| from .kobotoolbox_loader import KoboToolboxLoader | ||
| from .file_loader import FileLoader | ||
|
|
||
| __all__ = ["kobotoolbox", "file"] | ||
|
|
||
| __all__ = ["ResourceAction", "KoboToolboxLoader", "FileLoader"] |
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| # Copyright COORDONNÉES 2025, 2026 | ||
| # SPDX-License-Identifier: MPL-2.0 | ||
|
|
||
| import shutil | ||
| from pathlib import Path | ||
|
|
||
| from coordo.loaders.loader import Loader, ResourceAction | ||
| from ..datapackage import Field, Resource, Schema | ||
| from ..datapackage.db_helpers import prepare_path, to_dp_type | ||
|
|
||
|
|
||
| class FileLoader(Loader): | ||
| def __init__(self, package: Path, path: Path, action: ResourceAction): | ||
| super().__init__(package, action) | ||
| self.path = path | ||
|
|
||
| def extract(self): | ||
| query = f"SELECT * FROM {prepare_path(self.path)}" | ||
| conn, _ = self.dp.prepare_db() | ||
| rel = conn.sql(query) | ||
| schema = Schema() | ||
| for name, type in zip(rel.columns, rel.types): | ||
| schema.add_field(Field(name=name, **to_dp_type(type))) | ||
| conn.close() | ||
| # creating resurce for file | ||
| resource = Resource( | ||
| name=self.path.stem, | ||
| path=self.path.name, | ||
| schema=schema, | ||
| ) | ||
| self.resources = [resource] | ||
|
|
||
| def transform(self): | ||
| pass | ||
|
|
||
| def load(self): | ||
| shutil.copy(self.path, self.dp._basepath / self.path.name) |
Uh oh!
There was an error while loading. Please reload this page.