Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3ecdc56
feat(datasets): reusable dataset registry + downloader
timtreis Jun 15, 2026
3ad1929
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 15, 2026
ea6a13d
fix(datasets): satisfy strict mypy in pre-commit
timtreis Jun 15, 2026
74d5217
feat(datasets): allow FetchContext.download into a custom dest dir
timtreis Jun 15, 2026
592b180
feat(datasets): ship a built-in spatialdata loader; make anndata core
timtreis Jun 15, 2026
86e2527
test(datasets): positional-only ctx in the dummy loader for mypy
timtreis Jun 15, 2026
57c3e9d
refactor(datasets): sit thinly on pooch (create+fetch, Unzip processo…
timtreis Jun 15, 2026
c1b5d15
refactor(datasets): drop dead url-list + download_all (simplify)
timtreis Jun 15, 2026
2763d7c
refactor(datasets): drop machinery classes, keep typed data model
timtreis Jun 15, 2026
ddc2e50
Merge branch 'main' into pr/timtreis/40
flying-sheep Jun 17, 2026
412c074
Update .gitignore
timtreis Jun 17, 2026
80e33af
fix types
flying-sheep Jun 17, 2026
f1f87db
Merge branch 'main' into pr/timtreis/40
flying-sheep Jun 17, 2026
026d096
fix types
flying-sheep Jun 17, 2026
291f85d
docstrings
flying-sheep Jun 17, 2026
51b12ac
fix
flying-sheep Jun 17, 2026
8e5a623
fix docs
flying-sheep Jun 17, 2026
07a1742
fix(datasets): address review — coverage + 3 correctness fixes
timtreis Jun 18, 2026
79886df
Merge branch 'main' into pr/timtreis/40
flying-sheep Jun 18, 2026
392ba3c
fix tests
flying-sheep Jun 18, 2026
dec25bb
fix pre tests
flying-sheep Jun 18, 2026
826abfb
Merge branch 'main' into feat/datasets
timtreis Jun 19, 2026
4c30bff
Merge branch 'main' into pr/timtreis/40
flying-sheep Jun 19, 2026
a6c2eb6
Update docs/api.md
timtreis Jun 19, 2026
af3cf52
fix(datasets): warn on unknown file keys; drop dead test capture
timtreis Jun 19, 2026
9fa4c8d
module
flying-sheep Jun 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,6 @@ __pycache__/
# docs
/docs/generated/
/docs/_build/

# lockfiles (library: not committed)
/uv.lock
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,4 @@ repos:
- sphinx
- sphinx-autodoc-typehints
- sphinxcontrib-katex
- types-PyYAML
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ and this project adheres to [Semantic Versioning][].
### Added

- A Sphinx extension to take care of documentation. This moves docstring processing from import time to documentation building time.
- A reusable `datasets` subpackage (behind the `datasets` extra): typed `DatasetEntry`/
`FileEntry` + `parse_registry` (YAML), a thin pooch-based `fetch` (SHA-256 verification,
retries, archive processors), and a pluggable `type -> loader` registry
(`register_loader`) so packages can share dataset-download infrastructure. Ships built-in
`anndata` and `spatialdata` loaders (the latter behind the `spatialdata` extra); other
types are consumer-registered.
- `anndata` is now a core dependency.

### Changed

Expand Down
38 changes: 16 additions & 22 deletions docs/_templates/autosummary/class.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,55 +7,49 @@
.. autoclass:: {{ objname }}

{% block attributes %}
{% if attributes %}
{% for item in attributes %}
{% if loop.length != 1 %}
{% if loop.first %}
Attributes table
~~~~~~~~~~~~~~~~

.. autosummary::
{% for item in attributes %}
{% endif %}
~{{ name }}.{{ item }}
{%- endfor %}
{% endif %}
{%- endfor %}
{% endblock %}

{% block methods %}
{% if methods %}
{% for item in all_methods if item == '__call__' or not item.startswith('__') %}
{% if loop.length != 1 %}
{% if loop.first %}
Methods table
~~~~~~~~~~~~~

.. autosummary::
{% for item in methods %}
{%- if item != '__init__' %}
{% endif %}
~{{ name }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
{%- endfor %}
{% endblock %}

{% block attributes_documentation %}
{% if attributes %}
{% for item in attributes %}
{% if loop.first %}
Attributes
~~~~~~~~~~

{% for item in attributes %}

{% endif %}
.. autoattribute:: {{ [objname, item] | join(".") }}
{%- endfor %}

{% endif %}
{% endblock %}

{% block methods_documentation %}
{% if methods %}
{% for item in all_methods if item == '__call__' or not item.startswith('__') %}
{% if loop.first %}
Methods
~~~~~~~

{% for item in methods %}
{%- if item != '__init__' %}

{% endif %}
.. automethod:: {{ [objname, item] | join(".") }}
{%- endif -%}
{%- endfor %}

{% endif %}
{% endblock %}
33 changes: 29 additions & 4 deletions docs/api.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# API

```{eval-rst}
.. currentmodule:: scverse_misc
.. module:: scverse_misc
.. toctree::
```

(extensions)=
## Extensions

```{eval-rst}
Expand All @@ -13,7 +14,9 @@

make_register_namespace_decorator
```

Types used by the former:

```{eval-rst}
.. autosummary::
:toctree: generated
Expand All @@ -23,7 +26,9 @@ Types used by the former:

*Examples:* {ref}`example-extension-namespaces`

(deprecations)=
## Deprecations

```{eval-rst}
.. autosummary::
:toctree: generated
Expand All @@ -35,6 +40,7 @@ Types used by the former:

*Examples:* {ref}`example-deprecating-a-function`, {ref}`example-deprecating-a-function-argument`, {ref}`example-settings-class`

(settings)=
## Settings

```{eval-rst}
Expand All @@ -43,9 +49,28 @@ Types used by the former:

api/settings

+---------------------------+----------------------------------+
| :class:`Settings` () | Base class for package settings. |
+---------------------------+----------------------------------+
.. autosummary::
:signatures: short

Settings
```

*Examples:* {ref}`example-settings-class`

(datasets)=
## Datasets (`scverse_misc.datasets`)

```{eval-rst}
.. automodule:: scverse_misc.datasets
.. autosummary::
:toctree: generated

DatasetEntry
FileEntry
parse_registry
fetch
register_loader
available_loaders
Loader
DownloadCB
```
4 changes: 3 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from sphinxcontrib import katex


HERE = Path(__file__).parent
sys.path.insert(0, str(HERE / "extensions"))
sys.path.insert(0, str(HERE / "sphinx_ext_examples"))
Expand Down Expand Up @@ -104,6 +105,7 @@
"scipy": ("https://docs.scipy.org/doc/scipy", None),
"pandas": ("https://pandas.pydata.org/docs/", None),
"scanpy": ("https://scanpy.readthedocs.io/en/stable/", None),
"pooch": ("https://www.fatiando.org/pooch/latest/", None),
"pydantic": ("https://pydantic.dev/docs/validation/", None),
}

Expand Down Expand Up @@ -137,5 +139,5 @@
nitpick_ignore: list[tuple[str, str]] = [
# If building the documentation fails because of a missing link that is outside your control,
# you can add an exception to this list.
# ("py:class", "igraph.Graph"),
("py:class", "scverse_misc._deprecated.CallableWithDeprecatedArg"),
]
14 changes: 12 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@ classifiers = [
]
dynamic = [ "version" ]
dependencies = [
"anndata",
# for debug logging (referenced from the issue template)
"session-info2",
"typing-extensions; python_version<'3.13'",
]
optional-dependencies.datasets = [ "pooch", "pyyaml", "tqdm" ]
optional-dependencies.settings = [ "pydantic-settings", "python-dotenv" ]
optional-dependencies.spatialdata = [ "spatialdata" ]
optional-dependencies.sphinx = [ "pydocstring-rs>=0.1.13", "sphinx>=9" ]
# https://docs.pypi.org/project_metadata/#project-urls
urls.Documentation = "https://scverse-misc.readthedocs.io/"
Expand All @@ -38,13 +41,20 @@ dev = [
"pre-commit",
"twine>=4.0.2",
]
test = [ "coverage>=7.10", "numpy", "pytest", "scverse-misc[settings,sphinx]", "sphinx", "sphinx-autodoc-typehints" ]
test = [
"coverage>=7.10",
"numpy",
"pytest",
"scverse-misc[datasets,settings,sphinx]",
"sphinx",
"sphinx-autodoc-typehints"
]
doc = [
"ipykernel",
"ipython",
"myst-nb>=1.1",
"pandas",
"scverse-misc[settings,sphinx]",
"scverse-misc[datasets,settings,sphinx]",
"sphinx>=8.1",
"sphinx-autodoc-typehints",
"sphinx-book-theme>=1",
Expand Down
25 changes: 25 additions & 0 deletions src/scverse_misc/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Reusable, declarative dataset download for scverse packages.

Parse a YAML registry into typed :class:`DatasetEntry` objects, then download and load
one with :func:`fetch`. Dataset ``type`` strings are dispatched against a pluggable loader
registry (:func:`register_loader`); ``anndata`` and ``spatialdata`` loaders ship built in.

Requires the ``datasets`` extra (``pip install scverse-misc[datasets]``); the built-in
``spatialdata`` loader additionally needs the ``spatialdata`` extra.
"""

from __future__ import annotations

from ._fetcher import DownloadCB, Loader, available_loaders, fetch, register_loader
from ._registry import DatasetEntry, FileEntry, parse_registry

__all__ = [
"FileEntry",
"DatasetEntry",
"parse_registry",
"fetch",
"register_loader",
"available_loaders",
"Loader",
"DownloadCB",
]
132 changes: 132 additions & 0 deletions src/scverse_misc/datasets/_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Download + load a dataset: a thin ``fetch`` over pooch + a pluggable ``type -> loader`` registry.

A loader is a callable ``(entry, target_dir, download, **kwargs) -> object`` where ``download``
is ``(FileEntry, dest=None, processor=None) -> path`` (pooch under the hood: hashing, caching,
retries, and archive processors). ``anndata`` and ``spatialdata`` loaders ship built in.
"""

from __future__ import annotations

from collections.abc import Callable
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol, cast, overload

if TYPE_CHECKING:
from ._registry import DatasetEntry, FileEntry

if TYPE_CHECKING: # sphinx tries to import the above TYPE_CHECKING block
from anndata import AnnData
from pooch.typing import Processor
from spatialdata import SpatialData
else:
from typing import TypeAliasType

# TypeAliasType.__module__ is readonly, so we have to be a bit creative.
Processor = eval('A("Processor", object)', globals=dict(__name__="pooch.typing", A=TypeAliasType))


__all__ = ["register_loader", "available_loaders", "fetch", "Loader", "DownloadCB"]


class Loader[T](Protocol):
"""Function that can be annotated by :func:`register_loader`."""

def __call__(self, entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> T:
"""Call `download` (see :class:`DownloadCB`) and load ``entry``.

Args:
entry: File to download.
target: Loaded when it exists, otherwise it will be created.
download: Called when `target` doesn’t exist.
kwargs: Passed to `download`.
"""


class DownloadCB(Protocol):
    """Signature of the `download` callback handed to every :class:`Loader`."""

    def __call__(
        self,
        file: FileEntry,
        /,
        *,
        dest: Path | None = None,
        processor: Processor | None = None,
    ) -> str:
        """Fetch ``file`` unless it is already available locally.

        Args:
            file: The file entry to download.
            dest: Directory to download into. When omitted, :func:`fetch`’s
                ``cache_dir / entry.type`` is used.
            processor: Archive processor to apply to the downloaded file, if any.

        Returns:
            The local path, as a string.
        """


_LOADERS: dict[str, Loader[object]] = {}


@overload
def register_loader[T](type_name: str) -> Callable[[Loader[T]], Loader[T]]: ...
@overload
def register_loader[T](type_name: str, loader: Loader[T]) -> Loader[T]: ...
def register_loader[T](type_name: str, loader: Loader[T] | None = None) -> Callable[[Loader[T]], Loader[T]] | Loader[T]:
"""Register a :class:`Loader` for a dataset ``type`` (decorator or direct call)."""

def deco(fn: Loader[T]) -> Loader[T]:
_LOADERS[type_name] = fn
return fn

return deco if loader is None else deco(loader)


def available_loaders() -> list[str]:
    """List every dataset ``type`` that currently has a loader, in sorted order."""
    names = list(_LOADERS)
    names.sort()
    return names


def fetch[T](
entry: DatasetEntry, cache_dir: str | Path, *, base_url: str | None = None, retries: int = 3, **kwargs: object
) -> T: # type: ignore[type-var]
"""Download (if needed) and load ``entry``, dispatching to the loader registered for ``entry.type``.

Files are cached under ``cache_dir / entry.type``. ``kwargs`` are passed to the loader.
"""
target = Path(cache_dir) / entry.type

def download(file: FileEntry, /, dest: Path | None = None, processor: Processor | None = None) -> str:
import pooch

out = dest or target
out.mkdir(parents=True, exist_ok=True)
pup = pooch.create(
path=str(out),
base_url="",
registry={file.name: f"sha256:{file.sha256}" if file.sha256 else None},
urls={file.name: file.resolve_url(base_url)},
retry_if_failed=retries,
)
return pup.fetch(file.name, processor=processor, progressbar=True)

if entry.type not in _LOADERS:
raise KeyError(f"No loader registered for type {entry.type!r}. Available: {available_loaders()}")
return cast("Loader[T]", _LOADERS[entry.type])(entry, target, download, **kwargs)


@register_loader("anndata")
def _load_anndata(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> AnnData:
    """Built-in ``anndata`` loader.

    Downloads the entry's single ``.h5ad`` file and opens it with :func:`anndata.read_h5ad`,
    forwarding ``kwargs`` to the reader. ``target`` is not used directly; the file lands in
    the default location chosen by ``download``.
    """
    import anndata

    h5ad_entry = entry.file(suffix=".h5ad")
    local_path = download(h5ad_entry)
    reader_kwargs = cast("dict[str, Any]", kwargs)
    return anndata.read_h5ad(local_path, **reader_kwargs)


@register_loader("spatialdata")
def _load_spatialdata(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> SpatialData:
    """Built-in ``spatialdata`` loader: fetch a ``.zip``, unpack it with pooch, read the ``.zarr`` inside.

    The archive is unpacked into its own ``target / entry.name`` directory. That way the store
    can be located with a glob (its name does not have to match the registry key) and other
    spatialdata datasets cached under the same ``target`` cannot get in the way.
    Requires the ``spatialdata`` extra.

    Raises:
        RuntimeError: If the per-dataset directory does not contain exactly one ``.zarr``.
    """
    import pooch
    import spatialdata as sd

    dest = target / entry.name
    archive = entry.file(suffix=".zip")
    # ``extract_dir="."`` makes pooch unpack right into ``dest`` instead of a sub-folder.
    unzip = pooch.Unzip(extract_dir=".")
    download(archive, dest=dest, processor=unzip)

    zarrs = sorted(dest.glob("*.zarr"))
    if len(zarrs) == 1:
        (store,) = zarrs
        return sd.read_zarr(store, **cast("dict[str, Any]", kwargs))
    raise RuntimeError(f"Expected exactly one .zarr extracted under {dest}, found {len(zarrs)}: {zarrs}.")
Loading