Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8fc19e5
create __init__.py and add filter for warning 'UserWarning: Field nam…
OlivierCoen Apr 11, 2026
f004f92
implement functionnality to overwrite existing file
OlivierCoen Apr 12, 2026
25f5b33
add parameter strategy to choose overwrite behavior + implement overw…
OlivierCoen Apr 13, 2026
93f3e2b
Update README.md
OlivierCoen Apr 13, 2026
87ebb76
add copyright
OlivierCoen Apr 13, 2026
e4c70ad
apply ruff
OlivierCoen Apr 13, 2026
4c52975
factorise loaders
OlivierCoen Apr 14, 2026
964bb15
pass linters
OlivierCoen Apr 14, 2026
f7698f4
address first comments for review
OlivierCoen Apr 16, 2026
c1d8911
simplify resouce handling by datapackage
OlivierCoen Apr 16, 2026
46ae7bc
add list of individual foreign keys to remove in error message
OlivierCoen Apr 18, 2026
5649e71
update addition of foreign key
OlivierCoen Apr 18, 2026
e7cee9b
add error when foreign key exists
OlivierCoen Apr 18, 2026
5c37c8f
add cli for removing foreign key
OlivierCoen Apr 18, 2026
a0cfd11
Merge branch 'main' into feat/add-additional-data
OlivierCoen Apr 18, 2026
c954c33
Update resource.py
OlivierCoen Apr 19, 2026
eb5d839
Merge branch 'main' into feat/add-additional-data
OlivierCoen Apr 20, 2026
cf6370d
add anonymised test data
OlivierCoen Apr 18, 2026
cca0854
change location of files
OlivierCoen Apr 19, 2026
ff1d50b
add test function
OlivierCoen Apr 21, 2026
bbd0a76
factorise code in test_cli.py
OlivierCoen Apr 21, 2026
ad0967b
update uv environment
OlivierCoen Apr 21, 2026
7a912d9
add pytest step in CI
OlivierCoen Apr 21, 2026
5faedca
add copyright
OlivierCoen Apr 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/coordo-py.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ jobs:

- name: Run tests
run: uv run test.py

- name: Run pytest
run: uv run pytest
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ uv.lock
.DS_Store
catalog/
data/
.vscode/
.vscode/
15 changes: 15 additions & 0 deletions coordo-py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,18 @@ FROM parents LEFT OUTER JOIN children ON parents.id = children.parent_id)
SELECT anon_1.avg_1
FROM anon_1
```


## Development

### Data types

#### KoboToolbox

For surveys, Kobotoolbox uses the standard XLSForm format.
Briefly, each `xlsx` file contains 3 sheets:
- `survey`
- `choices`
- `settings`

All information can be found at https://support.kobotoolbox.org/edit_forms_excel.html
23 changes: 23 additions & 0 deletions coordo-py/coordo/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright COORDONNÉES 2025, 2026
# SPDX-License-Identifier: MPL-2.0

import warnings


# ignore warnings due to shadowing of Pydantic's "schema" field in "Resource"
REGEX_TO_IGNORE = (
'Field name "schema" in "Resource" shadows an attribute in parent "(Base)?Model"'
)
warnings.filterwarnings(
"ignore",
category=UserWarning,
module="dplib",
message=REGEX_TO_IGNORE,
)

warnings.filterwarnings(
"ignore",
category=UserWarning,
module="coordo",
message=REGEX_TO_IGNORE,
)
46 changes: 26 additions & 20 deletions coordo-py/coordo/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from pathlib import Path

import typer
from dplib.models.schema.foreignKey import ForeignKey, ForeignKeyReference

from coordo import loaders
from coordo.loaders import ResourceAction, KoboToolboxLoader, FileLoader
from coordo.datapackage import DataPackage
from coordo.sql.builder import build_query

Expand Down Expand Up @@ -83,25 +82,18 @@ def kobotoolbox(
xlsform: Path,
xlsdata: Path,
package: Path = typer.Option(help="Path to the package directory"),
action: ResourceAction = typer.Option(help="Action to perform on resource"),
):
dp = DataPackage.from_path(package)
loaders.kobotoolbox.load(dp, xlsform, xlsdata)
dp.save()
KoboToolboxLoader(package, xlsform, xlsdata, action).etl()


@load.command()
def file(
path: Path,
package: Path = typer.Option(".", help="Path to the package directory"),
action: ResourceAction = typer.Option(help="Action to perform on resource"),
):
dp = DataPackage.from_path(package)
try:
loaders.file.load(dp, path)
except ValueError as e:
raise typer.BadParameter(
f"{e} Add --overwrite if you wish to continue.", param_hint="path"
)
dp.save()
FileLoader(package, path, action).etl()


app.add_typer(load, name="load")
Expand All @@ -119,13 +111,27 @@ def add_foreignkey(
dp.get_resource(
resource,
).add_foreignkey(
ForeignKey(
fields=[field],
reference=ForeignKeyReference(
fields=[foreign_field],
resource=None if resource == foreign_resource else foreign_resource,
),
),
fields=[field],
foreign_fields=[foreign_field],
foreign_resource=foreign_resource,
)
dp.save()

@app.command()
def remove_foreignkey(
from_: str,
to: str,
package: Path = typer.Option(".", help="Path to the package directory"),
):
dp = DataPackage.from_path(package)
resource, field = from_.split(".")
foreign_resource, foreign_field = to.split(".")
dp.get_resource(
resource,
).remove_foreignkey(
fields=[field],
foreign_fields=[foreign_field],
foreign_resource=foreign_resource,
)
dp.save()

Expand Down
58 changes: 45 additions & 13 deletions coordo-py/coordo/datapackage/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ def handle_path(path: str | list[str]) -> str:
return path


def check_resource_fields_match(res1: Resource, res2: Resource) -> None:
if not res1.has_same_schema_as(res2):
raise ValueError(
f"Schemas do not match for resources {res1.name!r} and {res2.name!r}"
)


class DataPackage(pydantic.BaseModel):
id: Optional[str] = None
name: str = pydantic.Field(pattern=r"^[a-z0-9._-]+$")
Expand Down Expand Up @@ -91,39 +98,64 @@ def save(self):
)

def remove_resource(self, name: str) -> None:
"""
Remove a resource from the package.
For all other resources in the current datapackage, check if they have a foreign key pointing to this resource
and raise error it if so.
Args:
name (str): the name of the resource to remove
"""
print(f"Removing resource {name!r} from DataPackage {self.name!r}")
resource = self.get_resource(name=name)
# looping over all resources in the current datapackage, other than <resource>
for res in self.resources:
if res.name == name:
continue
sm = safe(res, "schema")
if sm.foreignKeys:
for fk in sm.foreignKeys:
assert fk.reference.resource != name, (
f"Can't remove the resource {name} : {res.name} have a foreign key pointing to this resource."
)
res_schema = safe(res, "schema")
if res_schema.foreignKeys:
for fk in res_schema.foreignKeys:
if fk.reference.resource == name:
# build a string containing the list of foreign key field pairs
fk_part_names_str = "\n".join(res.get_fk_names(fk))
raise ValueError(
f"Can't remove the resource {name!r} : {res.name!r} has a foreign key pointing to this resource. "
f"Please remove the following foreign keys beforehand:\n{fk_part_names_str}"
)
# remove the file associated with the resource
if resource.path:
path = handle_path(resource.path)
Path(self._basepath / path).unlink()
# update resources list
self.resources = [res for res in self.resources if res.name != name]

def add_resource(self, resource: Resource) -> None:
if any(res.name == resource.name for res in self.resources):
print(f"Adding resource {resource.name!r} to package {self.name!r}")
if self.resource_exists(resource.name):
raise ValueError(
f"A resource named {resource.name} already exists in package {self.name}."
f"A resource named {resource.name!r} already exists in package {self.name!r}. "
"Please remove the existing resource before adding a new one."
)
resource._package = self
self.resources.append(resource)
else:
resource._package = self
self.resources.append(resource)

def update_resource(self, resource: Resource) -> None:
pass

def get_resource(self, name: str) -> Resource:
resource = next(res for res in self.resources if res.name == name)
assert resource is not None, f"Resource {name} not found."
return resource
found_resources = [res for res in self.resources if res.name == name]
assert len(found_resources) <= 1, f"Multiple resources named {name!r} found."
assert len(found_resources) > 0, f"Resource {name!r} not found."
return found_resources[0]

def write_resource(self, resource_name: str, it: Iterable[dict]):
pass
# resource = self.get_resource(name=resource_name)
# schema = resource.get_schema()

def resource_exists(self, name: str) -> bool:
return any(res.name == name for res in self.resources)

def prepare_db(self) -> tuple[duckdb.DuckDBPyConnection, sa.MetaData]:
conn = load_conn()

Expand Down
52 changes: 47 additions & 5 deletions coordo-py/coordo/datapackage/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import duckdb
import pydantic
from dplib.models import Contributor, Dialect, ForeignKey, License, Schema, Source
from dplib.models import Contributor, Dialect, ForeignKey, ForeignKeyReference, License, Schema, Source
from pydantic import model_validator

from .db_helpers import prepare_path
Expand Down Expand Up @@ -50,7 +50,21 @@ def load_table(self, conn: duckdb.DuckDBPyConnection):
query = f'CREATE VIEW "{self.name}" AS SELECT * FROM {prepare_path(self.package._basepath / self.path)}'
conn.execute(query)

def add_foreignkey(self, fk: ForeignKey) -> None:
def add_foreignkey(self, fields: list[str], foreign_fields: list[str], foreign_resource: str) -> None:
# TODO: remove this check when addition of multiple fields at once is supported
if len(fields) > 1 or len(foreign_fields) > 1:
raise ValueError("Adding a foreign key with multiple fields is not supported yet.")

fk = ForeignKey(
fields=fields,
reference=ForeignKeyReference(
fields=foreign_fields,
resource=None if self.name == foreign_resource else foreign_resource,
)
)
fk_part_names_str = " & ".join(self.get_fk_names(fk))
print(f"Adding foreign key {fk_part_names_str}")

if not self._package:
raise ValueError("You can't add a foreign key to an orphan resource.")
field_names = [f.name for f in self.schema.fields]
Expand All @@ -64,15 +78,43 @@ def add_foreignkey(self, fk: ForeignKey) -> None:
)
field_names = [f.name for f in parent_resource.schema.fields]
for f in fk.reference.fields:
assert (
f in field_names
), f"Resource {parent_resource.name} has no field named {f}"
assert f in field_names, (
f"Resource {parent_resource.name} has no field named {f}"
)
if fk in self.schema.foreignKeys:
raise ValueError(f"Foreign key {fk_part_names_str} already exists in resource {self.name}")
self.schema.foreignKeys.append(fk)

def remove_foreignkey(self, fields: list[str], foreign_fields: list[str], foreign_resource: str) -> None:
fk = ForeignKey(
fields=fields,
reference=ForeignKeyReference(
fields=foreign_fields,
resource=None if self.name == foreign_resource else foreign_resource,
)
)
fk_part_names_str = " & ".join(self.get_fk_names(fk))
print(f"Removing foreign key {fk_part_names_str}")
if fk not in self.schema.foreignKeys:
raise ValueError(f"Foreign key {fk_part_names_str} not found in resource {self.name}")
self.schema.foreignKeys.remove(fk)

@model_validator(mode="after")
def check_data_or_path(self) -> Self:
provided = [self.data, self.path]
count = len([f for f in provided if f is not None])
if count != 1:
raise ValueError("Exactly one of 'data' or 'path' must be provided.")
return self

def has_same_schema_as(self, other: Self) -> bool:
for attr in vars(self.schema):
if getattr(self.schema, attr) != getattr(other.schema, attr):
return False
return True

def get_fk_names(self, fk: ForeignKey) -> list[str]:
return [
f"'{self.name}.{field}' -> '{fk.reference.resource}.{reference_field}'"
for field, reference_field in zip(fk.fields, fk.reference.fields)
]
7 changes: 5 additions & 2 deletions coordo-py/coordo/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Copyright COORDONNÉES 2025, 2026
# SPDX-License-Identifier: MPL-2.0

from . import file, kobotoolbox
from .loader import ResourceAction
from .kobotoolbox_loader import KoboToolboxLoader
from .file_loader import FileLoader

__all__ = ["kobotoolbox", "file"]

__all__ = ["ResourceAction", "KoboToolboxLoader", "FileLoader"]
26 changes: 0 additions & 26 deletions coordo-py/coordo/loaders/file.py

This file was deleted.

37 changes: 37 additions & 0 deletions coordo-py/coordo/loaders/file_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright COORDONNÉES 2025, 2026
# SPDX-License-Identifier: MPL-2.0

import shutil
from pathlib import Path

from coordo.loaders.loader import Loader, ResourceAction
from ..datapackage import Field, Resource, Schema
from ..datapackage.db_helpers import prepare_path, to_dp_type


class FileLoader(Loader):
def __init__(self, package: Path, path: Path, action: ResourceAction):
super().__init__(package, action)
self.path = path

def extract(self):
query = f"SELECT * FROM {prepare_path(self.path)}"
conn, _ = self.dp.prepare_db()
rel = conn.sql(query)
schema = Schema()
for name, type in zip(rel.columns, rel.types):
schema.add_field(Field(name=name, **to_dp_type(type)))
conn.close()
# creating resurce for file
resource = Resource(
name=self.path.stem,
path=self.path.name,
schema=schema,
)
self.resources = [resource]

def transform(self):
pass

def load(self):
shutil.copy(self.path, self.dp._basepath / self.path.name)
Loading
Loading