scverse · flying-sheep · Jun 19, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -24,3 +24,6 @@ __pycache__/
 # docs
 /docs/generated/
 /docs/_build/
+
+# lockfiles (library: not committed)
+/uv.lock
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -49,3 +49,4 @@ repos:
       - sphinx
       - sphinx-autodoc-typehints
       - sphinxcontrib-katex
+      - types-PyYAML
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,13 @@ and this project adheres to [Semantic Versioning][].
 ### Added
 
 - A Sphinx extension to take care of documentation. This moves docstring processing from import time to documentation building time.
+- A reusable `datasets` subpackage (behind the `datasets` extra): typed `DatasetEntry`/
+  `FileEntry` + `parse_registry` (YAML), a thin pooch-based `fetch` (SHA-256 verification,
+  retries, archive processors), and a pluggable `type -> loader` registry
+  (`register_loader`) so packages can share dataset-download infrastructure. Ships built-in
+  `anndata` and `spatialdata` loaders (the latter behind the `spatialdata` extra); other
+  types are consumer-registered.
+- `anndata` is now a core dependency.
 
 ### Changed
 

diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst
@@ -7,55 +7,49 @@
 .. autoclass:: {{ objname }}
 
 {% block attributes %}
-{% if attributes %}
+{% for item in attributes %}
+{% if loop.length != 1 %}
+{% if loop.first %}
 Attributes table
 ~~~~~~~~~~~~~~~~
 
 .. autosummary::
-{% for item in attributes %}
+{% endif %}
     ~{{ name }}.{{ item }}
-{%- endfor %}
 {% endif %}
+{%- endfor %}
 {% endblock %}
 
 {% block methods %}
-{% if methods %}
+{% for item in all_methods if item == '__call__' or not item.startswith('__') %}
+{% if loop.length != 1 %}
+{% if loop.first %}
 Methods table
 ~~~~~~~~~~~~~
 
 .. autosummary::
-{% for item in methods %}
-    {%- if item != '__init__' %}
+{% endif %}
     ~{{ name }}.{{ item }}
-    {%- endif -%}
-{%- endfor %}
 {% endif %}
+{%- endfor %}
 {% endblock %}
 
 {% block attributes_documentation %}
-{% if attributes %}
+{% for item in attributes %}
+{% if loop.first %}
 Attributes
 ~~~~~~~~~~
-
-{% for item in attributes %}
-
+{% endif %}
 .. autoattribute:: {{ [objname, item] | join(".") }}
 {%- endfor %}
-
-{% endif %}
 {% endblock %}
 
 {% block methods_documentation %}
-{% if methods %}
+{% for item in all_methods if item == '__call__' or not item.startswith('__') %}
+{% if loop.first %}
 Methods
 ~~~~~~~
-
-{% for item in methods %}
-{%- if item != '__init__' %}
-
+{% endif %}
 .. automethod:: {{ [objname, item] | join(".") }}
-{%- endif -%}
 {%- endfor %}
-
-{% endif %}
 {% endblock %}
diff --git a/docs/api.md b/docs/api.md
@@ -1,10 +1,11 @@
 # API
 
 ```{eval-rst}
-.. currentmodule:: scverse_misc
+.. module:: scverse_misc
 .. toctree::
 ```
 
+(extensions)=
 ## Extensions
 
 ```{eval-rst}
@@ -13,7 +14,9 @@
 
     make_register_namespace_decorator
 ```
+
 Types used by the former:
+
 ```{eval-rst}
 .. autosummary::
     :toctree: generated
@@ -23,7 +26,9 @@ Types used by the former:
 
 *Examples:* {ref}`example-extension-namespaces`
 
+(deprecations)=
 ## Deprecations
+
 ```{eval-rst}
 .. autosummary::
    :toctree: generated
@@ -35,6 +40,7 @@ Types used by the former:
 
 *Examples:* {ref}`example-deprecating-a-function`, {ref}`example-deprecating-a-function-argument`, {ref}`example-settings-class`
 
+(settings)=
 ## Settings
 
 ```{eval-rst}
@@ -43,9 +49,28 @@ Types used by the former:
 
    api/settings
 
-+---------------------------+----------------------------------+
-| :class:`Settings` ()      | Base class for package settings. |
-+---------------------------+----------------------------------+
+.. autosummary::
+   :signatures: short
+
+   Settings
 ```
 
 *Examples:* {ref}`example-settings-class`
+
+(datasets)=
+## Datasets (`scverse_misc.datasets`)
+
+```{eval-rst}
+.. automodule:: scverse_misc.datasets
+.. autosummary::
+    :toctree: generated
+
+    DatasetEntry
+    FileEntry
+    parse_registry
+    fetch
+    register_loader
+    available_loaders
+    Loader
+    DownloadCB
+```
diff --git a/docs/conf.py b/docs/conf.py
@@ -13,6 +13,7 @@
 
 from sphinxcontrib import katex
 
+
 HERE = Path(__file__).parent
 sys.path.insert(0, str(HERE / "extensions"))
 sys.path.insert(0, str(HERE / "sphinx_ext_examples"))
@@ -104,6 +105,7 @@
     "scipy": ("https://docs.scipy.org/doc/scipy", None),
     "pandas": ("https://pandas.pydata.org/docs/", None),
     "scanpy": ("https://scanpy.readthedocs.io/en/stable/", None),
+    "pooch": ("https://www.fatiando.org/pooch/latest/", None),
     "pydantic": ("https://pydantic.dev/docs/validation/", None),
 }
 
@@ -137,5 +139,5 @@
 nitpick_ignore: list[tuple[str, str]] = [
     # If building the documentation fails because of a missing link that is outside your control,
     # you can add an exception to this list.
-    #     ("py:class", "igraph.Graph"),
+    ("py:class", "scverse_misc._deprecated.CallableWithDeprecatedArg"),
 ]
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,11 +22,14 @@ classifiers = [
 ]
 dynamic = [ "version" ]
 dependencies = [
+  "anndata",
   # for debug logging (referenced from the issue template)
   "session-info2",
   "typing-extensions; python_version<'3.13'",
 ]
+optional-dependencies.datasets = [ "pooch", "pyyaml", "tqdm" ]
 optional-dependencies.settings = [ "pydantic-settings", "python-dotenv" ]
+optional-dependencies.spatialdata = [ "spatialdata" ]
 optional-dependencies.sphinx = [ "pydocstring-rs>=0.1.13", "sphinx>=9" ]
 # https://docs.pypi.org/project_metadata/#project-urls
 urls.Documentation = "https://scverse-misc.readthedocs.io/"
@@ -38,13 +41,20 @@ dev = [
   "pre-commit",
   "twine>=4.0.2",
 ]
-test = [ "coverage>=7.10", "numpy", "pytest", "scverse-misc[settings,sphinx]", "sphinx", "sphinx-autodoc-typehints" ]
+test = [
+  "coverage>=7.10",
+  "numpy",
+  "pytest",
+  "scverse-misc[datasets,settings,sphinx]",
+  "sphinx",
+  "sphinx-autodoc-typehints"
+]
 doc = [
   "ipykernel",
   "ipython",
   "myst-nb>=1.1",
   "pandas",
-  "scverse-misc[settings,sphinx]",
+  "scverse-misc[datasets,settings,sphinx]",
   "sphinx>=8.1",
   "sphinx-autodoc-typehints",
   "sphinx-book-theme>=1",

diff --git a/src/scverse_misc/datasets/__init__.py b/src/scverse_misc/datasets/__init__.py
@@ -0,0 +1,25 @@
+"""Reusable, declarative dataset download for scverse packages.
+
+Parse a YAML registry into typed :class:`DatasetEntry` objects, then download and load
+one with :func:`fetch`. Dataset ``type`` strings are dispatched against a pluggable loader
+registry (:func:`register_loader`); ``anndata`` and ``spatialdata`` loaders ship built in.
+
+Requires the ``datasets`` extra (``pip install scverse-misc[datasets]``); the built-in
+``spatialdata`` loader additionally needs the ``spatialdata`` extra.
+"""
+
+from __future__ import annotations
+
+from ._fetcher import DownloadCB, Loader, available_loaders, fetch, register_loader
+from ._registry import DatasetEntry, FileEntry, parse_registry
+
+__all__ = [
+    "FileEntry",
+    "DatasetEntry",
+    "parse_registry",
+    "fetch",
+    "register_loader",
+    "available_loaders",
+    "Loader",
+    "DownloadCB",
+]
diff --git a/src/scverse_misc/datasets/_fetcher.py b/src/scverse_misc/datasets/_fetcher.py
@@ -0,0 +1,132 @@
+"""Download + load a dataset: a thin ``fetch`` over pooch + a pluggable ``type -> loader`` registry.
+
+A loader is a callable ``(entry, target_dir, download, **kwargs) -> object`` where ``download``
+is ``(FileEntry, dest=None, processor=None) -> path`` (pooch under the hood: hashing, caching,
+retries, and archive processors). ``anndata`` and ``spatialdata`` loaders ship built in.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Protocol, cast, overload
+
+if TYPE_CHECKING:
+    from ._registry import DatasetEntry, FileEntry
+
+    if TYPE_CHECKING:  # sphinx tries to import the above TYPE_CHECKING block
+        from anndata import AnnData
+        from pooch.typing import Processor
+        from spatialdata import SpatialData
+    else:
+        from typing import TypeAliasType
+
+        # TypeAliasType.__module__ is readonly, so we have to be a bit creative.
+        Processor = eval('A("Processor", object)', globals=dict(__name__="pooch.typing", A=TypeAliasType))
+
+
+__all__ = ["register_loader", "available_loaders", "fetch", "Loader", "DownloadCB"]
+
+
+class Loader[T](Protocol):
+    """Function that can be annotated by :func:`register_loader`."""
+
+    def __call__(self, entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> T:
+        """Call `download` (see :class:`DownloadCB`) and load ``entry``.
+
+        Args:
+            entry: File to download.
+            target: Loaded when it exists, otherwise it will be created.
+            download: Called when `target` doesn’t exist.
+            kwargs: Passed to `download`.
+        """
+
+
+class DownloadCB(Protocol):
+    """Callback passed as `download` to a :class:`Loader`."""
+
+    def __call__(self, file: FileEntry, /, *, dest: Path | None = None, processor: Processor | None = None) -> str:
+        """Download ``file`` if necessary.
+
+        Args:
+            file: File to download.
+            dest: Optional target directory, defaults to :func:`fetch`’s `cache_dir / entry.type`.
+            processor: Optional archive processor.
+        """
+
+
+_LOADERS: dict[str, Loader[object]] = {}
+
+
+@overload
+def register_loader[T](type_name: str) -> Callable[[Loader[T]], Loader[T]]: ...
+@overload
+def register_loader[T](type_name: str, loader: Loader[T]) -> Loader[T]: ...
+def register_loader[T](type_name: str, loader: Loader[T] | None = None) -> Callable[[Loader[T]], Loader[T]] | Loader[T]:
+    """Register a :class:`Loader` for a dataset ``type`` (decorator or direct call)."""
+
+    def deco(fn: Loader[T]) -> Loader[T]:
+        _LOADERS[type_name] = fn
+        return fn
+
+    return deco if loader is None else deco(loader)
+
+
+def available_loaders() -> list[str]:
+    """Return the names of all registered loader types."""
+    return sorted(_LOADERS)
+
+
+def fetch[T](
+    entry: DatasetEntry, cache_dir: str | Path, *, base_url: str | None = None, retries: int = 3, **kwargs: object
+) -> T:  # type: ignore[type-var]
+    """Download (if needed) and load ``entry``, dispatching to the loader registered for ``entry.type``.
+
+    Files are cached under ``cache_dir / entry.type``. ``kwargs`` are passed to the loader.
+    """
+    target = Path(cache_dir) / entry.type
+
+    def download(file: FileEntry, /, dest: Path | None = None, processor: Processor | None = None) -> str:
+        import pooch
+
+        out = dest or target
+        out.mkdir(parents=True, exist_ok=True)
+        pup = pooch.create(
+            path=str(out),
+            base_url="",
+            registry={file.name: f"sha256:{file.sha256}" if file.sha256 else None},
+            urls={file.name: file.resolve_url(base_url)},
+            retry_if_failed=retries,
+        )
+        return pup.fetch(file.name, processor=processor, progressbar=True)
+
+    if entry.type not in _LOADERS:
+        raise KeyError(f"No loader registered for type {entry.type!r}. Available: {available_loaders()}")
+    return cast("Loader[T]", _LOADERS[entry.type])(entry, target, download, **kwargs)
+
+
+@register_loader("anndata")
+def _load_anndata(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> AnnData:
+    """Built-in loader: download a single ``.h5ad`` and read it with :func:`anndata.read_h5ad`."""
+    import anndata
+
+    return anndata.read_h5ad(download(entry.file(suffix=".h5ad")), **cast("dict[str, Any]", kwargs))
+
+
+@register_loader("spatialdata")
+def _load_spatialdata(entry: DatasetEntry, target: Path, download: DownloadCB, /, **kwargs: object) -> SpatialData:
+    """Built-in loader: download a ``.zip``, unzip it (via pooch) and read the single ``.zarr`` store inside.
+
+    Extracts into a per-dataset directory so the ``.zarr`` can be found by glob (its name need not match
+    the registry key) without colliding with other spatialdata datasets cached under the same ``target``.
+    Needs the ``spatialdata`` extra.
+    """
+    import pooch
+    import spatialdata as sd
+
+    dest = target / entry.name
+    download(entry.file(suffix=".zip"), dest=dest, processor=pooch.Unzip(extract_dir="."))
+    zarrs = sorted(dest.glob("*.zarr"))
+    if len(zarrs) != 1:
+        raise RuntimeError(f"Expected exactly one .zarr extracted under {dest}, found {len(zarrs)}: {zarrs}.")
+    return sd.read_zarr(zarrs[0], **cast("dict[str, Any]", kwargs))