Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8fc19e5
create __init__.py and add filter for warning 'UserWarning: Field nam…
OlivierCoen Apr 11, 2026
f004f92
implement functionnality to overwrite existing file
OlivierCoen Apr 12, 2026
25f5b33
add parameter strategy to choose overwrite behavior + implement overw…
OlivierCoen Apr 13, 2026
93f3e2b
Update README.md
OlivierCoen Apr 13, 2026
87ebb76
add copyright
OlivierCoen Apr 13, 2026
e4c70ad
apply ruff
OlivierCoen Apr 13, 2026
4c52975
factorise loaders
OlivierCoen Apr 14, 2026
964bb15
pass linters
OlivierCoen Apr 14, 2026
f7698f4
address first comments for review
OlivierCoen Apr 16, 2026
c1d8911
simplify resouce handling by datapackage
OlivierCoen Apr 16, 2026
46ae7bc
add list of individual foreign keys to remove in error message
OlivierCoen Apr 18, 2026
5649e71
update addition of foreign key
OlivierCoen Apr 18, 2026
e7cee9b
add error when foreign key exists
OlivierCoen Apr 18, 2026
5c37c8f
add cli for removing foreign key
OlivierCoen Apr 18, 2026
a0cfd11
Merge branch 'main' into feat/add-additional-data
OlivierCoen Apr 18, 2026
c954c33
Update resource.py
OlivierCoen Apr 19, 2026
eb5d839
Merge branch 'main' into feat/add-additional-data
OlivierCoen Apr 20, 2026
cf6370d
add anonymised test data
OlivierCoen Apr 18, 2026
cca0854
change location of files
OlivierCoen Apr 19, 2026
ff1d50b
add test function
OlivierCoen Apr 21, 2026
bbd0a76
factorise code in test_cli.py
OlivierCoen Apr 21, 2026
ad0967b
update uv environment
OlivierCoen Apr 21, 2026
7a912d9
add pytest step in CI
OlivierCoen Apr 21, 2026
5faedca
add copyright
OlivierCoen Apr 21, 2026
2ce1407
Merge pull request #79 from dataforgoodfr/feat/add-test-data
OlivierCoen Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/coordo-py.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ jobs:

- name: Run tests
run: uv run test.py

- name: Run pytest
run: uv run pytest
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ uv.lock
.DS_Store
catalog/
data/
.vscode/
.vscode/
15 changes: 15 additions & 0 deletions coordo-py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,18 @@ FROM parents LEFT OUTER JOIN children ON parents.id = children.parent_id)
SELECT anon_1.avg_1
FROM anon_1
```


## Development

### Data types

#### KoboToolbox

For surveys, Kobotoolbox uses the standard XLSForm format.
Briefly, each `xlsx` file contains 3 sheets:
- `survey`
- `choices`
- `settings`

All information can be found at https://support.kobotoolbox.org/edit_forms_excel.html
23 changes: 23 additions & 0 deletions coordo-py/coordo/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright COORDONNÉES 2025, 2026
# SPDX-License-Identifier: MPL-2.0

import warnings


# ignore warnings due to shadowing of Pydantic's "schema" field in "Resource"
REGEX_TO_IGNORE = (
'Field name "schema" in "Resource" shadows an attribute in parent "(Base)?Model"'
)
warnings.filterwarnings(
"ignore",
category=UserWarning,
module="dplib",
message=REGEX_TO_IGNORE,
)

warnings.filterwarnings(
"ignore",
category=UserWarning,
module="coordo",
message=REGEX_TO_IGNORE,
)
46 changes: 26 additions & 20 deletions coordo-py/coordo/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from pathlib import Path

import typer
from dplib.models.schema.foreignKey import ForeignKey, ForeignKeyReference

from coordo import loaders
from coordo.loaders import ResourceAction, KoboToolboxLoader, FileLoader
from coordo.datapackage import DataPackage
from coordo.sql.builder import build_query

Expand Down Expand Up @@ -83,25 +82,18 @@ def kobotoolbox(
xlsform: Path,
xlsdata: Path,
package: Path = typer.Option(help="Path to the package directory"),
action: ResourceAction = typer.Option(help="Action to perform on resource"),
):
dp = DataPackage.from_path(package)
loaders.kobotoolbox.load(dp, xlsform, xlsdata)
dp.save()
KoboToolboxLoader(package, xlsform, xlsdata, action).etl()


@load.command()
def file(
path: Path,
package: Path = typer.Option(".", help="Path to the package directory"),
action: ResourceAction = typer.Option(help="Action to perform on resource"),
):
dp = DataPackage.from_path(package)
try:
loaders.file.load(dp, path)
except ValueError as e:
raise typer.BadParameter(
f"{e} Add --overwrite if you wish to continue.", param_hint="path"
)
dp.save()
FileLoader(package, path, action).etl()


app.add_typer(load, name="load")
Expand All @@ -119,13 +111,27 @@ def add_foreignkey(
dp.get_resource(
resource,
).add_foreignkey(
ForeignKey(
fields=[field],
reference=ForeignKeyReference(
fields=[foreign_field],
resource=None if resource == foreign_resource else foreign_resource,
),
),
fields=[field],
foreign_fields=[foreign_field],
foreign_resource=foreign_resource,
)
dp.save()

@app.command()
def remove_foreignkey(
from_: str,
to: str,
package: Path = typer.Option(".", help="Path to the package directory"),
):
dp = DataPackage.from_path(package)
resource, field = from_.split(".")
foreign_resource, foreign_field = to.split(".")
dp.get_resource(
resource,
).remove_foreignkey(
fields=[field],
foreign_fields=[foreign_field],
foreign_resource=foreign_resource,
)
dp.save()

Expand Down
58 changes: 45 additions & 13 deletions coordo-py/coordo/datapackage/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ def handle_path(path: str | list[str]) -> str:
return path


def check_resource_fields_match(res1: Resource, res2: Resource) -> None:
Comment thread
Cabanon marked this conversation as resolved.
if not res1.has_same_schema_as(res2):
raise ValueError(
f"Schemas do not match for resources {res1.name!r} and {res2.name!r}"
)


class DataPackage(pydantic.BaseModel):
id: Optional[str] = None
name: str = pydantic.Field(pattern=r"^[a-z0-9._-]+$")
Expand Down Expand Up @@ -91,39 +98,64 @@ def save(self):
)

def remove_resource(self, name: str) -> None:
"""
Remove a resource from the package.
For all other resources in the current datapackage, check if they have a foreign key pointing to this resource
and raise error it if so.
Args:
name (str): the name of the resource to remove
"""
print(f"Removing resource {name!r} from DataPackage {self.name!r}")
resource = self.get_resource(name=name)
# looping over all resources in the current datapackage, other than <resource>
for res in self.resources:
if res.name == name:
continue
sm = safe(res, "schema")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should stay. I'm not really fine with the fact that suppressing a resource is automatically deleting the fks, i prefer we raise, tell the user to delete the fks and try again. It's much more explicit and less error-prone

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hum ok 👍 But then, when adding data in "overwrite" strategy, what should we do?
Should we just check that both schemas match, overwrite data if it does, and raise error if it does not (with a message saying that we should force removing foreign keys beforehand if we really want to overwrite)?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO coordo should try to avoid doing side effects, if there is foreign keys then it's up to the user (the user being either a person or another project using coordo, data4trees for example) to decide how to manage it

if sm.foreignKeys:
for fk in sm.foreignKeys:
assert fk.reference.resource != name, (
f"Can't remove the resource {name} : {res.name} have a foreign key pointing to this resource."
)
res_schema = safe(res, "schema")
if res_schema.foreignKeys:
for fk in res_schema.foreignKeys:
if fk.reference.resource == name:
# build a string containing the list of foreign key field pairs
fk_part_names_str = "\n".join(res.get_fk_names(fk))
raise ValueError(
f"Can't remove the resource {name!r} : {res.name!r} has a foreign key pointing to this resource. "
f"Please remove the following foreign keys beforehand:\n{fk_part_names_str}"
)
# remove the file associated with the resource
if resource.path:
path = handle_path(resource.path)
Path(self._basepath / path).unlink()
# update resources list
self.resources = [res for res in self.resources if res.name != name]

def add_resource(self, resource: Resource) -> None:
if any(res.name == resource.name for res in self.resources):
print(f"Adding resource {resource.name!r} to package {self.name!r}")
if self.resource_exists(resource.name):
raise ValueError(
f"A resource named {resource.name} already exists in package {self.name}."
f"A resource named {resource.name!r} already exists in package {self.name!r}. "
"Please remove the existing resource before adding a new one."
)
resource._package = self
self.resources.append(resource)
else:
resource._package = self
self.resources.append(resource)

def update_resource(self, resource: Resource) -> None:
pass

def get_resource(self, name: str) -> Resource:
resource = next(res for res in self.resources if res.name == name)
assert resource is not None, f"Resource {name} not found."
return resource
found_resources = [res for res in self.resources if res.name == name]
assert len(found_resources) <= 1, f"Multiple resources named {name!r} found."
assert len(found_resources) > 0, f"Resource {name!r} not found."
return found_resources[0]

def write_resource(self, resource_name: str, it: Iterable[dict]):
pass
# resource = self.get_resource(name=resource_name)
# schema = resource.get_schema()

def resource_exists(self, name: str) -> bool:
return any(res.name == name for res in self.resources)

def prepare_db(self) -> tuple[duckdb.DuckDBPyConnection, sa.MetaData]:
conn = load_conn()

Expand Down
52 changes: 47 additions & 5 deletions coordo-py/coordo/datapackage/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import duckdb
import pydantic
from dplib.models import Contributor, Dialect, ForeignKey, License, Schema, Source
from dplib.models import Contributor, Dialect, ForeignKey, ForeignKeyReference, License, Schema, Source
from pydantic import model_validator

from .db_helpers import prepare_path
Expand Down Expand Up @@ -50,7 +50,21 @@ def load_table(self, conn: duckdb.DuckDBPyConnection):
query = f'CREATE VIEW "{self.name}" AS SELECT * FROM {prepare_path(self.package._basepath / self.path)}'
conn.execute(query)

def add_foreignkey(self, fk: ForeignKey) -> None:
def add_foreignkey(self, fields: list[str], foreign_fields: list[str], foreign_resource: str) -> None:
# TODO: remove this check when addition of multiple fields at once is supported
if len(fields) > 1 or len(foreign_fields) > 1:
raise ValueError("Adding a foreign key with multiple fields is not supported yet.")

fk = ForeignKey(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Composite foreign keys are not yet supported by coordo so we should limit to only one field pointing to one another field. Also we can't have multiple keys pointing to the same external resource (because the parser wouldn't know which one to use when auto-joining)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand that the cli does not support that yet, but perhaps we could already implement something more general? It does not hurt anyway

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I understand that you want to support something more general but IMO it's better to block it and then add it later (and also update the parser at the same time) than to let users do something that is not supported by other parts of coordo

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if this is not exposed from the cli, we could for instance use this method from D4T backend , so it is exposed as a lib component. Therefore I rather agree with Mathias, let's not expose something we don't really support yet.
Maybe we can keep this current signature, but make an assert on the lists so if their size is > 1 , we raise "Composite forign keys are not supported yet". WDYT ?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the modification :)

fields=fields,
reference=ForeignKeyReference(
fields=foreign_fields,
resource=None if self.name == foreign_resource else foreign_resource,
)
)
fk_part_names_str = " & ".join(self.get_fk_names(fk))
print(f"Adding foreign key {fk_part_names_str}")

if not self._package:
raise ValueError("You can't add a foreign key to an orphan resource.")
field_names = [f.name for f in self.schema.fields]
Expand All @@ -64,15 +78,43 @@ def add_foreignkey(self, fk: ForeignKey) -> None:
)
field_names = [f.name for f in parent_resource.schema.fields]
for f in fk.reference.fields:
assert (
f in field_names
), f"Resource {parent_resource.name} has no field named {f}"
assert f in field_names, (
f"Resource {parent_resource.name} has no field named {f}"
)
if fk in self.schema.foreignKeys:
raise ValueError(f"Foreign key {fk_part_names_str} already exists in resource {self.name}")
self.schema.foreignKeys.append(fk)

def remove_foreignkey(self, fields: list[str], foreign_fields: list[str], foreign_resource: str) -> None:
fk = ForeignKey(
fields=fields,
reference=ForeignKeyReference(
fields=foreign_fields,
resource=None if self.name == foreign_resource else foreign_resource,
)
)
fk_part_names_str = " & ".join(self.get_fk_names(fk))
print(f"Removing foreign key {fk_part_names_str}")
if fk not in self.schema.foreignKeys:
raise ValueError(f"Foreign key {fk_part_names_str} not found in resource {self.name}")
self.schema.foreignKeys.remove(fk)

@model_validator(mode="after")
def check_data_or_path(self) -> Self:
provided = [self.data, self.path]
count = len([f for f in provided if f is not None])
if count != 1:
raise ValueError("Exactly one of 'data' or 'path' must be provided.")
return self

def has_same_schema_as(self, other: Self) -> bool:
for attr in vars(self.schema):
if getattr(self.schema, attr) != getattr(other.schema, attr):
return False
return True

def get_fk_names(self, fk: ForeignKey) -> list[str]:
return [
f"'{self.name}.{field}' -> '{fk.reference.resource}.{reference_field}'"
for field, reference_field in zip(fk.fields, fk.reference.fields)
]
7 changes: 5 additions & 2 deletions coordo-py/coordo/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Copyright COORDONNÉES 2025, 2026
# SPDX-License-Identifier: MPL-2.0

from . import file, kobotoolbox
from .loader import ResourceAction
from .kobotoolbox_loader import KoboToolboxLoader
from .file_loader import FileLoader

__all__ = ["kobotoolbox", "file"]

__all__ = ["ResourceAction", "KoboToolboxLoader", "FileLoader"]
26 changes: 0 additions & 26 deletions coordo-py/coordo/loaders/file.py

This file was deleted.

37 changes: 37 additions & 0 deletions coordo-py/coordo/loaders/file_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright COORDONNÉES 2025, 2026
# SPDX-License-Identifier: MPL-2.0

import shutil
from pathlib import Path

from coordo.loaders.loader import Loader, ResourceAction
from ..datapackage import Field, Resource, Schema
from ..datapackage.db_helpers import prepare_path, to_dp_type


class FileLoader(Loader):
def __init__(self, package: Path, path: Path, action: ResourceAction):
super().__init__(package, action)
self.path = path

def extract(self):
query = f"SELECT * FROM {prepare_path(self.path)}"
conn, _ = self.dp.prepare_db()
rel = conn.sql(query)
schema = Schema()
for name, type in zip(rel.columns, rel.types):
schema.add_field(Field(name=name, **to_dp_type(type)))
conn.close()
# creating resurce for file
resource = Resource(
name=self.path.stem,
path=self.path.name,
schema=schema,
)
self.resources = [resource]

def transform(self):
pass

def load(self):
shutil.copy(self.path, self.dp._basepath / self.path.name)
Loading
Loading