diff --git a/.gitignore b/.gitignore index 63b1c25..003c2d8 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ __pycache__/ # docs /docs/generated/ /docs/_build/ + +# lockfiles (library: not committed) +/uv.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 582e6bb..48fe5f7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,3 +49,4 @@ repos: - sphinx - sphinx-autodoc-typehints - sphinxcontrib-katex + - types-PyYAML diff --git a/CHANGELOG.md b/CHANGELOG.md index e8048ef..917ed06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,13 @@ and this project adheres to [Semantic Versioning][]. ### Added - A Sphinx extension to take care of documentation. This moves docstring processing from import time to documentation building time. +- A reusable `datasets` subpackage (behind the `datasets` extra): typed `DatasetEntry`/ + `FileEntry` + `parse_registry` (YAML), a thin pooch-based `fetch` (SHA-256 verification, + retries, archive processors), and a pluggable `type -> loader` registry + (`register_loader`) so packages can share dataset-download infrastructure. Ships built-in + `anndata` and `spatialdata` loaders (the latter behind the `spatialdata` extra); other + types are consumer-registered. +- `anndata` is now a core dependency. ### Changed diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index 7b4a0cf..ab448ca 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -7,55 +7,49 @@ .. autoclass:: {{ objname }} {% block attributes %} -{% if attributes %} +{% for item in attributes %} +{% if loop.length != 1 %} +{% if loop.first %} Attributes table ~~~~~~~~~~~~~~~~ .. 
autosummary:: -{% for item in attributes %} +{% endif %} ~{{ name }}.{{ item }} -{%- endfor %} {% endif %} +{%- endfor %} {% endblock %} {% block methods %} -{% if methods %} +{% for item in all_methods if item == '__call__' or not item.startswith('__') %} +{% if loop.length != 1 %} +{% if loop.first %} Methods table ~~~~~~~~~~~~~ .. autosummary:: -{% for item in methods %} - {%- if item != '__init__' %} +{% endif %} ~{{ name }}.{{ item }} - {%- endif -%} -{%- endfor %} {% endif %} +{%- endfor %} {% endblock %} {% block attributes_documentation %} -{% if attributes %} +{% for item in attributes %} +{% if loop.first %} Attributes ~~~~~~~~~~ - -{% for item in attributes %} - +{% endif %} .. autoattribute:: {{ [objname, item] | join(".") }} {%- endfor %} - -{% endif %} {% endblock %} {% block methods_documentation %} -{% if methods %} +{% for item in all_methods if item == '__call__' or not item.startswith('__') %} +{% if loop.first %} Methods ~~~~~~~ - -{% for item in methods %} -{%- if item != '__init__' %} - +{% endif %} .. automethod:: {{ [objname, item] | join(".") }} -{%- endif -%} {%- endfor %} - -{% endif %} {% endblock %} diff --git a/docs/api.md b/docs/api.md index 5717c4b..c23078c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1,10 +1,11 @@ # API ```{eval-rst} -.. currentmodule:: scverse_misc +.. module:: scverse_misc .. toctree:: ``` +(extensions)= ## Extensions ```{eval-rst} @@ -13,7 +14,9 @@ make_register_namespace_decorator ``` + Types used by the former: + ```{eval-rst} .. autosummary:: :toctree: generated @@ -23,7 +26,9 @@ Types used by the former: *Examples:* {ref}`example-extension-namespaces` +(deprecations)= ## Deprecations + ```{eval-rst} .. 
autosummary:: :toctree: generated @@ -35,6 +40,7 @@ Types used by the former: *Examples:* {ref}`example-deprecating-a-function`, {ref}`example-deprecating-a-function-argument`, {ref}`example-settings-class` +(settings)= ## Settings ```{eval-rst} @@ -43,9 +49,28 @@ Types used by the former: api/settings -+---------------------------+----------------------------------+ -| :class:`Settings` () | Base class for package settings. | -+---------------------------+----------------------------------+ +.. autosummary:: + :signatures: short + + Settings ``` *Examples:* {ref}`example-settings-class` + +(datasets)= +## Datasets (`scverse_misc.datasets`) + +```{eval-rst} +.. automodule:: scverse_misc.datasets +.. autosummary:: + :toctree: generated + + DatasetEntry + FileEntry + parse_registry + fetch + register_loader + available_loaders + Loader + DownloadCB +``` diff --git a/docs/conf.py b/docs/conf.py index 99d609f..22717f4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,6 +13,7 @@ from sphinxcontrib import katex + HERE = Path(__file__).parent sys.path.insert(0, str(HERE / "extensions")) sys.path.insert(0, str(HERE / "sphinx_ext_examples")) @@ -104,6 +105,7 @@ "scipy": ("https://docs.scipy.org/doc/scipy", None), "pandas": ("https://pandas.pydata.org/docs/", None), "scanpy": ("https://scanpy.readthedocs.io/en/stable/", None), + "pooch": ("https://www.fatiando.org/pooch/latest/", None), "pydantic": ("https://pydantic.dev/docs/validation/", None), } @@ -137,5 +139,5 @@ nitpick_ignore: list[tuple[str, str]] = [ # If building the documentation fails because of a missing link that is outside your control, # you can add an exception to this list. 
- # ("py:class", "igraph.Graph"), + ("py:class", "scverse_misc._deprecated.CallableWithDeprecatedArg"), ] diff --git a/pyproject.toml b/pyproject.toml index 424561f..0cf50eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,11 +22,14 @@ classifiers = [ ] dynamic = [ "version" ] dependencies = [ + "anndata", # for debug logging (referenced from the issue template) "session-info2", "typing-extensions; python_version<'3.13'", ] +optional-dependencies.datasets = [ "pooch", "pyyaml", "tqdm" ] optional-dependencies.settings = [ "pydantic-settings", "python-dotenv" ] +optional-dependencies.spatialdata = [ "spatialdata" ] optional-dependencies.sphinx = [ "pydocstring-rs>=0.1.13", "sphinx>=9" ] # https://docs.pypi.org/project_metadata/#project-urls urls.Documentation = "https://scverse-misc.readthedocs.io/" @@ -38,13 +41,20 @@ dev = [ "pre-commit", "twine>=4.0.2", ] -test = [ "coverage>=7.10", "numpy", "pytest", "scverse-misc[settings,sphinx]", "sphinx", "sphinx-autodoc-typehints" ] +test = [ + "coverage>=7.10", + "numpy", + "pytest", + "scverse-misc[datasets,settings,sphinx]", + "sphinx", + "sphinx-autodoc-typehints" +] doc = [ "ipykernel", "ipython", "myst-nb>=1.1", "pandas", - "scverse-misc[settings,sphinx]", + "scverse-misc[datasets,settings,sphinx]", "sphinx>=8.1", "sphinx-autodoc-typehints", "sphinx-book-theme>=1", diff --git a/src/scverse_misc/datasets/__init__.py b/src/scverse_misc/datasets/__init__.py new file mode 100644 index 0000000..f22cfb7 --- /dev/null +++ b/src/scverse_misc/datasets/__init__.py @@ -0,0 +1,25 @@ +"""Reusable, declarative dataset download for scverse packages. + +Parse a YAML registry into typed :class:`DatasetEntry` objects, then download and load +one with :func:`fetch`. Dataset ``type`` strings are dispatched against a pluggable loader +registry (:func:`register_loader`); ``anndata`` and ``spatialdata`` loaders ship built in. 
+ +Requires the ``datasets`` extra (``pip install scverse-misc[datasets]``); the built-in +``spatialdata`` loader additionally needs the ``spatialdata`` extra. +""" + +from __future__ import annotations + +from ._fetcher import DownloadCB, Loader, available_loaders, fetch, register_loader +from ._registry import DatasetEntry, FileEntry, parse_registry + +__all__ = [ + "FileEntry", + "DatasetEntry", + "parse_registry", + "fetch", + "register_loader", + "available_loaders", + "Loader", + "DownloadCB", +] diff --git a/src/scverse_misc/datasets/_fetcher.py b/src/scverse_misc/datasets/_fetcher.py new file mode 100644 index 0000000..87ee0fb --- /dev/null +++ b/src/scverse_misc/datasets/_fetcher.py @@ -0,0 +1,132 @@ +"""Download + load a dataset: a thin ``fetch`` over pooch + a pluggable ``type -> loader`` registry. + +A loader is a callable ``(entry, target_dir, download, **kwargs) -> object`` where ``download`` +is ``(FileEntry, dest=None, processor=None) -> path`` (pooch under the hood: hashing, caching, +retries, and archive processors). ``anndata`` and ``spatialdata`` loaders ship built in. +""" + +from __future__ import annotations + +from collections.abc import Callable +from pathlib import Path +from typing import TYPE_CHECKING, Any, Protocol, cast, overload + +if TYPE_CHECKING: + from ._registry import DatasetEntry, FileEntry + + if TYPE_CHECKING: # sphinx tries to import the above TYPE_CHECKING block + from anndata import AnnData + from pooch.typing import Processor + from spatialdata import SpatialData + else: + from typing import TypeAliasType + + # TypeAliasType.__module__ is readonly, so we have to be a bit creative. 
+ Processor = eval('A("Processor", object)', dict(__name__="pooch.typing", A=TypeAliasType)) + + +__all__ = ["register_loader", "available_loaders", "fetch", "Loader", "DownloadCB"] + + +class Loader[T](Protocol): + """Function that can be registered with :func:`register_loader`.""" + + def __call__(self, entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> T: + """Call `download` (see :class:`DownloadCB`) and load ``entry``. + + Args: + entry: Dataset to load. + target: Cache directory for this dataset type (:func:`fetch` passes `cache_dir / entry.type`). + download: Callback that downloads one of the files of ``entry``. + kwargs: Extra keyword arguments that :func:`fetch` forwards to the loader. + """ + + +class DownloadCB(Protocol): + """Callback passed as `download` to a :class:`Loader`.""" + + def __call__(self, file: FileEntry, /, *, dest: Path | None = None, processor: Processor | None = None) -> str: + """Download ``file`` if necessary and return its local path. + + Args: + file: File to download. + dest: Optional target directory, defaults to :func:`fetch`’s `cache_dir / entry.type`. + processor: Optional archive processor. + """ + + +_LOADERS: dict[str, Loader[object]] = {} + + +@overload +def register_loader[T](type_name: str) -> Callable[[Loader[T]], Loader[T]]: ... +@overload +def register_loader[T](type_name: str, loader: Loader[T]) -> Loader[T]: ... 
+def register_loader[T](type_name: str, loader: Loader[T] | None = None) -> Callable[[Loader[T]], Loader[T]] | Loader[T]: + """Register a :class:`Loader` for a dataset ``type`` (decorator or direct call).""" + + def deco(fn: Loader[T]) -> Loader[T]: + _LOADERS[type_name] = fn + return fn + + return deco if loader is None else deco(loader) + + +def available_loaders() -> list[str]: + """Return the names of all registered loader types.""" + return sorted(_LOADERS) + + +def fetch[T]( + entry: DatasetEntry, cache_dir: str | Path, *, base_url: str | None = None, retries: int = 3, **kwargs: object +) -> T: # type: ignore[type-var] + """Download (if needed) and load ``entry``, dispatching to the loader registered for ``entry.type``. + + Files are cached under ``cache_dir / entry.type``. ``kwargs`` are passed to the loader. + """ + target = Path(cache_dir) / entry.type + + def download(file: FileEntry, /, dest: Path | None = None, processor: Processor | None = None) -> str: + import pooch + + out = dest or target + out.mkdir(parents=True, exist_ok=True) + pup = pooch.create( + path=str(out), + base_url="", + registry={file.name: f"sha256:{file.sha256}" if file.sha256 else None}, + urls={file.name: file.resolve_url(base_url)}, + retry_if_failed=retries, + ) + return pup.fetch(file.name, processor=processor, progressbar=True) + + if entry.type not in _LOADERS: + raise KeyError(f"No loader registered for type {entry.type!r}. 
Available: {available_loaders()}") + return cast("Loader[T]", _LOADERS[entry.type])(entry, target, download, **kwargs) + + +@register_loader("anndata") +def _load_anndata(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> AnnData: + """Built-in loader: download a single ``.h5ad`` and read it with :func:`anndata.read_h5ad`.""" + import anndata + + return anndata.read_h5ad(download(entry.file(suffix=".h5ad")), **cast("dict[str, Any]", kwargs)) + + +@register_loader("spatialdata") +def _load_spatialdata(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> SpatialData: + """Built-in loader: download a ``.zip``, unzip it (via pooch) and read the single ``.zarr`` store inside. + + Extracts into a per-dataset directory so the ``.zarr`` can be found by glob (its name need not match + the registry key) without colliding with other spatialdata datasets cached under the same ``target``. + Needs the ``spatialdata`` extra. + """ + import pooch + import spatialdata as sd + + dest = target / entry.name + download(entry.file(suffix=".zip"), dest=dest, processor=pooch.Unzip(extract_dir=".")) + zarrs = sorted(dest.glob("*.zarr")) + if len(zarrs) != 1: + raise RuntimeError(f"Expected exactly one .zarr extracted under {dest}, found {len(zarrs)}: {zarrs}.") + return sd.read_zarr(zarrs[0], **cast("dict[str, Any]", kwargs)) diff --git a/src/scverse_misc/datasets/_registry.py b/src/scverse_misc/datasets/_registry.py new file mode 100644 index 0000000..f256f3e --- /dev/null +++ b/src/scverse_misc/datasets/_registry.py @@ -0,0 +1,109 @@ +"""Typed dataset entries + a YAML parser. 
Plain data — no registry/fetcher machinery.""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass, field, fields +from typing import TYPE_CHECKING, Any + +import yaml + +if TYPE_CHECKING: + from collections.abc import Mapping + from os import PathLike + +__all__ = ["FileEntry", "DatasetEntry", "parse_registry"] + + +@dataclass(frozen=True, slots=True) +class FileEntry: + """A single downloadable file belonging to a dataset. + + Parameters + ---------- + name + File name as it should appear on disk (e.g. ``"cells.zip"``). + url + Full download URL (e.g. a Zenodo file URL). Takes precedence over ``s3_key``. + s3_key + Key relative to the registry's ``base_url``. Used when ``url`` is unset. + sha256 + Expected SHA-256 hash. If set, downloads are verified against it. + """ + + name: str + url: str | None = None + s3_key: str | None = None + sha256: str | None = None + + def resolve_url(self, base_url: str | None = None) -> str: + """Resolve the download URL: the explicit ``url`` if set, else ``base_url/s3_key``.""" + if self.url: + return self.url + if base_url and self.s3_key: + return f"{base_url.rstrip('/')}/{self.s3_key}" + raise ValueError(f"FileEntry {self.name!r} has neither `url` nor `s3_key` (with a registry `base_url`).") + + +@dataclass(frozen=True, slots=True) +class DatasetEntry: + """A named dataset made up of one or more files. + + ``metadata`` holds everything in the YAML row other than ``type`` and ``files`` + (e.g. ``shape``, ``library_id``, ``doc_header``); the core does not interpret it. + """ + + name: str + type: str + files: tuple[FileEntry, ...] + metadata: Mapping[str, Any] = field(default_factory=dict) + + def file(self, *, name: str | None = None, suffix: str | None = None) -> FileEntry: + """Return the file matching ``name`` (exact) or ``suffix`` (endswith). 
Raises unless exactly one matches.""" + if name is not None and suffix is None: + matches = [f for f in self.files if f.name == name] + crit = f"name={name!r}" + elif suffix is not None and name is None: + matches = [f for f in self.files if f.name.endswith(suffix)] + crit = f"suffix={suffix!r}" + else: + raise ValueError("Pass exactly one of `name` or `suffix`.") + if len(matches) != 1: + raise ValueError(f"Expected exactly one file with {crit} in {self.name!r}, found {len(matches)}.") + return matches[0] + + +_FILE_FIELDS = frozenset(f.name for f in fields(FileEntry)) + + +def _file_entry(fd: Mapping[str, Any], dataset: str) -> FileEntry: + """Build a :class:`FileEntry`, warning on (and dropping) keys it doesn't recognise. + + Unknown keys are tolerated so per-file extras (e.g. ``description``) don't crash the + parse, but a warning surfaces likely typos. + """ + if unknown := fd.keys() - _FILE_FIELDS: + warnings.warn(f"Ignoring unknown file keys {sorted(unknown)} in dataset {dataset!r}.", stacklevel=3) + return FileEntry(**{k: v for k, v in fd.items() if k in _FILE_FIELDS}) + + +def parse_registry(path: PathLike[str] | str) -> tuple[str | None, dict[str, DatasetEntry]]: + """Parse a YAML registry into ``(base_url, {name: DatasetEntry})``. + + The YAML has a top-level ``base_url`` (or ``s3_base_url``) and a ``datasets`` mapping of + ``name -> {type, files: [{name, url?/s3_key?, sha256?}], ...}``. Any keys other than ``type`` + and ``files`` are collected into the entry's ``metadata``. 
+ """ + with open(path, encoding="utf-8") as f: + config = yaml.safe_load(f) or {} + base_url = config.get("base_url") or config.get("s3_base_url") + datasets = { + name: DatasetEntry( + name=name, + type=row["type"], + files=tuple(_file_entry(fd, name) for fd in row.get("files") or []), + metadata={k: v for k, v in row.items() if k not in ("type", "files")}, + ) + for name, row in (config.get("datasets") or {}).items() + } + return base_url, datasets diff --git a/stubs/anndata.pyi b/stubs/anndata.pyi new file mode 100644 index 0000000..e9e2013 --- /dev/null +++ b/stubs/anndata.pyi @@ -0,0 +1,6 @@ +import os +from typing import Any + +class AnnData: ... + +def read_h5ad(path: str | os.PathLike[str], **kwargs: Any) -> AnnData: ... # noqa: ANN401 diff --git a/stubs/pooch/__init__.pyi b/stubs/pooch/__init__.pyi new file mode 100644 index 0000000..b729956 --- /dev/null +++ b/stubs/pooch/__init__.pyi @@ -0,0 +1,37 @@ +from .typing import Downloader, PathInputType, PathType, Processor + +def create( + path: PathInputType, + base_url: str, + version: str | None = None, + version_dev: str = "master", + env: str | None = None, + registry: dict[str, str | None] | None = None, + urls: dict[str, str] | None = None, + retry_if_failed: int = 0, + allow_updates: bool | str = True, +) -> Pooch: ... + +class Pooch: + def __init__( + self, + path: PathType, + base_url: str, + registry: dict[str, str | None] | None = None, + urls: dict[str, str] | None = None, + retry_if_failed: int = 0, + allow_updates: bool = True, + ) -> None: ... + def fetch( + self, + fname: str, + processor: Processor | None = None, + downloader: Downloader | None = None, + progressbar: bool = False, + ) -> str: ... + +class Unzip: + def __init__(self, extract_dir: str | None = None) -> None: ... + def __call__(self, fname: str, action: str | None, pooch: Pooch | None) -> object: ... 
+ +_u: Processor = Unzip() # type assertion diff --git a/stubs/pooch/typing.pyi b/stubs/pooch/typing.pyi new file mode 100644 index 0000000..fdfac0b --- /dev/null +++ b/stubs/pooch/typing.pyi @@ -0,0 +1,20 @@ +import os +from collections.abc import Callable +from typing import Literal, Protocol + +from . import Pooch + +type Action = Literal["download", "fetch", "update"] +type PathType = str | os.PathLike[str] +type PathInputType = PathType | list[PathType] | tuple[PathType, ...] +type Processor = Callable[[str, Action, Pooch | None], object] + +class Downloader(Protocol): + def __call__( # noqa: E704 + self, + fname: str, + action: PathType | None, + pooch: Pooch | None, + *, + check_only: bool | None = None, + ) -> object: ... diff --git a/stubs/spatialdata.pyi b/stubs/spatialdata.pyi new file mode 100644 index 0000000..3d023c7 --- /dev/null +++ b/stubs/spatialdata.pyi @@ -0,0 +1,6 @@ +import os +from typing import Any + +class SpatialData: ... + +def read_zarr(path: str | os.PathLike[str], **kwargs: Any) -> SpatialData: ... 
# noqa: ANN401 diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 0000000..6f5da2b --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import sys +import types +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +from scverse_misc.datasets import ( + DatasetEntry, + FileEntry, + _fetcher, + available_loaders, + fetch, + parse_registry, + register_loader, +) + +if TYPE_CHECKING: + from scverse_misc.datasets import DownloadCB + + +_YAML = """\ +base_url: https://example.org/data/ +datasets: + toy: + type: dummy + shape: [10, 3] + files: + - name: toy.h5ad + s3_key: toy.h5ad + sha256: abc123 + remote: + type: dummy + files: + - name: remote.zip + url: https://zenodo.org/records/1/files/remote.zip +""" + + +@pytest.fixture +def registry(tmp_path: Path) -> dict[str, DatasetEntry]: + p = tmp_path / "datasets.yaml" + p.write_text(_YAML) + base_url, datasets = parse_registry(p) + assert base_url == "https://example.org/data/" + return datasets + + +def test_parse_registry(registry: dict[str, DatasetEntry]) -> None: + assert set(registry) == {"toy", "remote"} + toy = registry["toy"] + assert toy.type == "dummy" + assert toy.metadata["shape"] == [10, 3] # non-type/files keys land in metadata + assert toy.file(suffix=".h5ad").sha256 == "abc123" + + +def test_parse_registry_warns_on_extra_file_keys(tmp_path: Path) -> None: + p = tmp_path / "datasets.yaml" + p.write_text( + "datasets:\n" + " d:\n" + " type: dummy\n" + " files:\n" + " - name: x.h5ad\n" + " url: https://z/x.h5ad\n" + " description: an unknown-to-FileEntry key\n" + ) + # unknown keys are dropped (not fatal) but warned about so typos surface + with pytest.warns(UserWarning, match="unknown file keys.*description"): + _, datasets = parse_registry(p) + assert datasets["d"].file(name="x.h5ad").url == "https://z/x.h5ad" + + +def test_resolve_url() -> None: + # explicit url takes precedence over s3_key + assert 
FileEntry(name="x.zip", url="https://z/x.zip", s3_key="x.zip").resolve_url("https://b/") == "https://z/x.zip" + # s3_key resolves against base_url + assert FileEntry(name="x", s3_key="k").resolve_url("https://b") == "https://b/k" + # neither resolvable -> error + with pytest.raises(ValueError, match="neither"): + FileEntry(name="x", s3_key="k").resolve_url(None) + + +def test_file_selection_is_unambiguous(registry: dict[str, DatasetEntry]) -> None: + assert registry["toy"].file(name="toy.h5ad").s3_key == "toy.h5ad" # exact name match + with pytest.raises(ValueError, match="exactly one"): + registry["toy"].file(name="nope.h5ad") + with pytest.raises(ValueError, match="exactly one"): + registry["toy"].file(suffix=".missing") + with pytest.raises(ValueError, match="exactly one of"): + registry["toy"].file() + + +def test_builtin_loaders_are_shipped() -> None: + assert {"anndata", "spatialdata"} <= set(available_loaders()) + + +def test_register_and_dispatch(registry: dict[str, DatasetEntry], tmp_path: Path) -> None: + seen: dict[str, object] = {} + + @register_loader("dummy") + def _load(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kw: object) -> str: + seen.update(kw) + return entry.name + + try: + # dummy loader does no download, so no network / pooch needed + assert fetch(registry["toy"], tmp_path, base_url="https://b", foo=1) == "toy" + assert seen == {"foo": 1} + finally: + _fetcher._LOADERS.pop("dummy", None) + + +def test_unknown_loader(registry: dict[str, DatasetEntry], tmp_path: Path) -> None: + # "toy" is type "dummy" but no dummy loader registered here + with pytest.raises(KeyError, match="No loader registered"): + fetch(registry["toy"], tmp_path) + + +def test_download_drives_pooch( + registry: dict[str, DatasetEntry], tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """The `download` closure wires FileEntry -> pooch.create/fetch without touching the network.""" + calls: dict[str, object] = {} + + class FakePup: + def 
fetch(self, name: str, *, processor: object, progressbar: bool) -> str: + calls["fetched"] = name + return f"/cache/{name}" + + def fake_create(**kw: object) -> FakePup: + calls.update(kw) + return FakePup() + + import pooch + + monkeypatch.setattr(pooch, "create", fake_create) + + @register_loader("dummy") + def _load(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kw: object) -> str: + return download(entry.file(suffix=".h5ad")) + + try: + assert fetch(registry["toy"], tmp_path, base_url="https://b") == "/cache/toy.h5ad" + finally: + _fetcher._LOADERS.pop("dummy", None) + + assert calls["urls"] == {"toy.h5ad": "https://b/toy.h5ad"} + assert calls["registry"] == {"toy.h5ad": "sha256:abc123"} + assert calls["fetched"] == "toy.h5ad" + + +# old anndata versions use the old arguments +@pytest.mark.filterwarnings( + r"ignore:The (decorator_name|docstring_style|exported_object_name)( class)? argument is deprecated:DeprecationWarning" +) +def test_load_anndata_reads_h5ad(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + import anndata + + monkeypatch.setattr(anndata, "read_h5ad", lambda path, **kw: ("adata", path, kw)) + entry = DatasetEntry(name="toy", type="anndata", files=(FileEntry(name="toy.h5ad", url="https://z/toy.h5ad"),)) + result: object = _fetcher._load_anndata(entry, tmp_path, lambda f, **kw: "/cache/toy.h5ad", backed="r") + assert result == ("adata", "/cache/toy.h5ad", {"backed": "r"}) + + +def test_load_spatialdata_reads_zarr(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + fake_sd = types.ModuleType("spatialdata") + fake_sd.read_zarr = lambda path, **kw: ("sdata", path) # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "spatialdata", fake_sd) + entry = DatasetEntry( + name="cells", type="spatialdata", files=(FileEntry(name="cells.zip", url="https://z/cells.zip"),) + ) + + # download extracted nothing -> loud failure (0 zarrs found) + with pytest.raises(RuntimeError, match="Expected exactly one"): + 
_fetcher._load_spatialdata(entry, tmp_path, lambda f, **kw: str(kw["dest"])) + + # the extracted .zarr need not be named after the registry key; glob finds the single one + def extract(file: FileEntry, **kw: object) -> str: + dest = kw["dest"] + assert isinstance(dest, Path) + (dest / "whatever.zarr").mkdir(parents=True) + return str(dest) + + result: object = _fetcher._load_spatialdata(entry, tmp_path, extract) + assert result == ("sdata", tmp_path / "cells" / "whatever.zarr")