diff --git a/.github/workflows/rtd-preview.yml b/.github/workflows/rtd-preview.yml
new file mode 100644
index 0000000..a437edc
--- /dev/null
+++ b/.github/workflows/rtd-preview.yml
@@ -0,0 +1,30 @@
+name: Read the Docs PR preview
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - synchronize
+      - reopened
+    # Only fire when something that could change the docs build is touched.
+    paths:
+      - "docs/**"
+      - ".readthedocs.yaml"
+      - "conda_envs/environment-docs.yaml"
+      - "ptgp/**"
+      - "pyproject.toml"
+
+permissions:
+  pull-requests: write
+
+jobs:
+  documentation-links:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: readthedocs/actions/preview@v1
+        with:
+          # Project slug as configured on Read the Docs. Verify this matches
+          # the slug shown in the RTD project URL
+          # (https://readthedocs.org/projects/<slug>/) once the project is
+          # imported there.
+          project-slug: "ptgp"
diff --git a/.gitignore b/.gitignore
index 04a8f1f..7ea9f34 100644
--- a/.gitignore
+++ b/.gitignore
@@ -188,3 +188,4 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 .idea/
+/docs/source/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 406faa9..787bcad 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,3 +28,12 @@ repos:
     - id: no-print-statements
       files: ^ptgp/
       exclude: ^ptgp/_cli\.py$
+
+- repo: local
+  hooks:
+    - id: check-kernel-gallery
+      name: check covariance gallery covers every public kernel
+      entry: python scripts/check_kernel_gallery.py
+      language: system
+      pass_filenames: false
+      files: ^(ptgp/kernels/|docs/sphinxext/generate_kernel_gallery\.py$)
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..080396c
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,18 @@
+version: 2
+
+sphinx:
+  configuration: docs/source/conf.py
+  fail_on_warning: false
+
+conda:
+  environment: conda_envs/environment-docs.yaml
+
+python:
+  install:
+    - method: pip
+      path: .
+
+build:
+  os: "ubuntu-22.04"
+  tools:
+    python: "mambaforge-latest"
diff --git a/conda_envs/environment-docs.yaml b/conda_envs/environment-docs.yaml
new file mode 100644
index 0000000..551bf8e
--- /dev/null
+++ b/conda_envs/environment-docs.yaml
@@ -0,0 +1,36 @@
+name: ptgp-docs
+channels:
+    - conda-forge
+dependencies:
+    - python>=3.12
+    # Runtime deps: autodoc imports ptgp, so the full runtime stack has to
+    # be in scope.
+    - jax
+    - jaxlib
+    - optax
+    - numpy
+    - scipy
+    - libblas=*=*mkl
+    - mkl-service
+    - pymc
+    - matplotlib
+    # Docs build deps.
+    - ipython
+    - jupyter
+    - sphinx>=7
+    - pydata-sphinx-theme
+    - myst-nb
+    - numpydoc
+    - sphinx-copybutton
+    - sphinx-design
+    - sphinx-codeautolink
+    - sphinx-sitemap
+    - sphinx-notfound-page
+    - sphinx-autobuild
+    - jupyter-sphinx
+    - sphinxcontrib-bibtex
+    - pip
+    - pip:
+        - git+https://github.com/pymc-devs/pytensor@main
+        # ptgp itself so autodoc resolves the current source tree.
+        - -e ..
diff --git a/conda_envs/environment.yaml b/conda_envs/environment.yaml
index 1ead8fa..b11d3d2 100644
--- a/conda_envs/environment.yaml
+++ b/conda_envs/environment.yaml
@@ -22,6 +22,18 @@ dependencies:
     - pytest
     - mypy
     - polars
-    - pymc
+    # Docs tooling — kept in sync with conda_envs/environment-docs.yaml so
+    # `make html` works in the dev env without an extra install step.
+    - sphinx>=7
+    - pydata-sphinx-theme
+    - myst-nb
+    - numpydoc
+    - sphinx-copybutton
+    - sphinx-design
+    - sphinx-codeautolink
+    - sphinx-sitemap
+    - sphinx-notfound-page
+    - jupyter-sphinx
+    - sphinxcontrib-bibtex
     - arviz
     - pip
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..b4ccf7a
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,25 @@
+# Sphinx build output
+build/
+jupyter_execute/
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Autosummary-generated stubs (one .rst per object, regenerated on build)
+source/api/generated/
+source/api/**/generated/
+source/api/**/classmethods/
+
+# Notebook gallery artifacts written by docs/sphinxext/generate_gallery.py
+source/examples/gallery.rst
+source/examples/examples/
+source/examples/introductory/
+source/examples/advanced/
+source/examples/case_study/
+source/_thumbnails/
+
+# Covariance gallery artifacts written by
+# docs/sphinxext/generate_kernel_gallery.py
+source/kernels/gallery.rst
+source/kernels/img/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..661d5ee
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,36 @@
+# Minimal Makefile for Sphinx documentation.
+# Mirrors the standard sphinx-quickstart output.
+
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+.PHONY: help clean html show livehtml linkcheck Makefile
+
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+clean:
+	rm -rf "$(BUILDDIR)"
+	rm -rf "$(SOURCEDIR)"/api/generated
+	rm -rf "$(SOURCEDIR)"/api/*/generated
+	rm -rf "$(SOURCEDIR)"/_thumbnails
+	rm -f  "$(SOURCEDIR)"/examples/gallery.rst
+	rm -rf "$(SOURCEDIR)"/kernels/img
+	rm -f  "$(SOURCEDIR)"/kernels/gallery.rst
+
+html:
+	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+show: html
+	@open "$(BUILDDIR)/html/index.html"
+
+livehtml:
+	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)
+
+linkcheck:
+	@$(SPHINXBUILD) -M linkcheck "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..49fa0ba
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,29 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Install Sphinx, then re-run.
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/_static/.gitkeep b/docs/source/_static/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst
new file mode 100644
index 0000000..f1f165f
--- /dev/null
+++ b/docs/source/_templates/autosummary/class.rst
@@ -0,0 +1,34 @@
+{{ fullname | escape | underline}}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+
+   {% block methods %}
+   {% if methods %}
+
+   .. rubric:: Methods
+
+   .. autosummary::
+      :toctree: classmethods
+
+   {% for item in methods %}
+   {%- if item not in inherited_members %}
+      {{ objname }}.{{ item }}
+   {% endif %}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: Attributes
+
+   .. autosummary::
+   {% for item in attributes %}
+   {%- if item not in inherited_members %}
+      ~{{ name }}.{{ item }}
+   {% endif %}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
diff --git a/docs/source/api.rst b/docs/source/api.rst
new file mode 100644
index 0000000..1023326
--- /dev/null
+++ b/docs/source/api.rst
@@ -0,0 +1,20 @@
+.. _api:
+
+API Reference
+=============
+
+.. toctree::
+    :maxdepth: 1
+    :titlesonly:
+
+    api/gp
+    api/kernels
+    api/likelihoods
+    api/mean
+    api/inducing
+    api/inducing_fourier
+    api/objectives
+    api/conditionals
+    api/kl
+    api/optim
+    api/utils
diff --git a/docs/source/api/conditionals.rst b/docs/source/api/conditionals.rst
new file mode 100644
index 0000000..6d56bad
--- /dev/null
+++ b/docs/source/api/conditionals.rst
@@ -0,0 +1,13 @@
+Conditionals
+============
+
+.. currentmodule:: ptgp.conditionals
+
+Symbolic predictive conditionals used by the variational GP models.
+
+.. autosummary::
+    :toctree: generated/
+
+    conditional_whitened
+    conditional_unwhitened
+    base_conditional
diff --git a/docs/source/api/gp.rst b/docs/source/api/gp.rst
new file mode 100644
index 0000000..76049ed
--- /dev/null
+++ b/docs/source/api/gp.rst
@@ -0,0 +1,13 @@
+GP Models
+=========
+
+.. currentmodule:: ptgp.gp
+
+.. autosummary::
+    :toctree: generated/
+
+    Unapproximated
+    VFE
+    SVGP
+    VariationalParams
+    init_variational_params
diff --git a/docs/source/api/inducing.rst b/docs/source/api/inducing.rst
new file mode 100644
index 0000000..cfbed27
--- /dev/null
+++ b/docs/source/api/inducing.rst
@@ -0,0 +1,35 @@
+Inducing Points
+===============
+
+.. currentmodule:: ptgp.inducing
+
+Inducing variables
+------------------
+
+.. autosummary::
+    :toctree: generated/
+
+    InducingVariables
+    Points
+
+Initialization strategies
+-------------------------
+
+.. autosummary::
+    :toctree: generated/
+
+    random_subsample_init
+    kmeans_init
+    greedy_variance_init
+
+Diagnostics
+-----------
+
+.. autosummary::
+    :toctree: generated/
+
+    KernelHealthDiagnostics
+    RandomSubsampleDiagnostics
+    KMeansDiagnostics
+    GreedyVarianceDiagnostics
+    compute_inducing_diagnostics
diff --git a/docs/source/api/inducing_fourier.rst b/docs/source/api/inducing_fourier.rst
new file mode 100644
index 0000000..df4e496
--- /dev/null
+++ b/docs/source/api/inducing_fourier.rst
@@ -0,0 +1,9 @@
+Fourier Features
+================
+
+.. currentmodule:: ptgp.inducing_fourier
+
+.. autosummary::
+    :toctree: generated/
+
+    FourierFeatures1D
diff --git a/docs/source/api/kernels.rst b/docs/source/api/kernels.rst
new file mode 100644
index 0000000..a76c23d
--- /dev/null
+++ b/docs/source/api/kernels.rst
@@ -0,0 +1,51 @@
+Kernels
+=======
+
+.. currentmodule:: ptgp.kernels
+
+Base
+----
+
+.. autosummary::
+    :toctree: generated/
+
+    Kernel
+
+Stationary
+----------
+
+.. autosummary::
+    :toctree: generated/
+
+    ExpQuad
+    Matern52
+    Matern32
+    Matern12
+
+Non-stationary
+--------------
+
+.. autosummary::
+    :toctree: generated/
+
+    Gibbs
+    RandomWalk
+    WarpedInput
+
+Categorical
+-----------
+
+.. autosummary::
+    :toctree: generated/
+
+    Overlap
+    LowRankCategorical
+
+Composition
+-----------
+
+.. autosummary::
+    :toctree: generated/
+
+    SumKernel
+    ProductKernel
diff --git a/docs/source/api/kl.rst b/docs/source/api/kl.rst
new file mode 100644
index 0000000..c902355
--- /dev/null
+++ b/docs/source/api/kl.rst
@@ -0,0 +1,10 @@
+KL Divergences
+==============
+
+.. currentmodule:: ptgp.kl
+
+.. autosummary::
+    :toctree: generated/
+
+    gauss_kl
+    gauss_kl_structured
diff --git a/docs/source/api/likelihoods.rst b/docs/source/api/likelihoods.rst
new file mode 100644
index 0000000..cfc8c7a
--- /dev/null
+++ b/docs/source/api/likelihoods.rst
@@ -0,0 +1,14 @@
+Likelihoods
+===========
+
+.. currentmodule:: ptgp.likelihoods
+
+.. autosummary::
+    :toctree: generated/
+
+    Likelihood
+    Gaussian
+    Bernoulli
+    Poisson
+    NegativeBinomial
+    StudentT
diff --git a/docs/source/api/mean.rst b/docs/source/api/mean.rst
new file mode 100644
index 0000000..e4e9732
--- /dev/null
+++ b/docs/source/api/mean.rst
@@ -0,0 +1,11 @@
+Mean Functions
+==============
+
+.. currentmodule:: ptgp.mean
+
+.. autosummary::
+    :toctree: generated/
+
+    Zero
+    Constant
+    Linear
diff --git a/docs/source/api/objectives.rst b/docs/source/api/objectives.rst
new file mode 100644
index 0000000..c34340a
--- /dev/null
+++ b/docs/source/api/objectives.rst
@@ -0,0 +1,17 @@
+Objectives
+==========
+
+.. currentmodule:: ptgp.objectives
+
+Each objective pairs with a specific GP model and returns a symbolic loss
+suitable for the training compilers in :mod:`ptgp.optim`.
+
+.. autosummary::
+    :toctree: generated/
+
+    marginal_log_likelihood
+    elbo
+    collapsed_elbo
+    fitc_log_marginal_likelihood
+    dpp_regularizer
+    vfe_diagnostics
diff --git a/docs/source/api/optim.rst b/docs/source/api/optim.rst
new file mode 100644
index 0000000..4212168
--- /dev/null
+++ b/docs/source/api/optim.rst
@@ -0,0 +1,47 @@
+Optimization
+============
+
+.. currentmodule:: ptgp.optim
+
+Training compilers
+------------------
+
+.. autosummary::
+    :toctree: generated/
+
+    compile_training_step
+    compile_scipy_objective
+    compile_scipy_diagnostics
+    compile_predict
+    get_trained_params
+
+Optimizers
+----------
+
+.. autosummary::
+    :toctree: generated/
+
+    adam
+    sgd
+
+Staged & tracked minimization
+-----------------------------
+
+.. autosummary::
+    :toctree: generated/
+
+    minimize_staged_vfe
+    tracked_minimize
+    phase_sort_key
+
+Schedules
+---------
+
+.. currentmodule:: ptgp.optim.schedules
+
+.. autosummary::
+    :toctree: generated/
+
+    constant
+    exponential_decay
+    cosine
diff --git a/docs/source/api/utils.rst b/docs/source/api/utils.rst
new file mode 100644
index 0000000..8ceb4e6
--- /dev/null
+++ b/docs/source/api/utils.rst
@@ -0,0 +1,12 @@
+Utilities
+=========
+
+.. currentmodule:: ptgp.utils
+
+.. autosummary::
+    :toctree: generated/
+
+    get_initial_params
+    check_init
+    save_fit
+    load_fit
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..a75643b
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,183 @@
+import os
+import sys
+
+from pathlib import Path
+
+root_dir = Path("../..").resolve()
+sys.path.insert(0, str(root_dir))
+sys.path.insert(0, str(root_dir / "docs" / "sphinxext"))
+
+import ptgp  # noqa: E402
+
+# -- Project information -----------------------------------------------------
+project = "ptgp"
+copyright = "2025, Bill Engels"
+author = "Bill Engels"
+language = "en"
+html_baseurl = "https://ptgp.readthedocs.io"
+
+# -- Version handling --------------------------------------------------------
+# Mirrors the gEconpy / pytensor pattern so RTD version selector labels match
+# what users see in the package.
+version = ptgp.__version__
+on_readthedocs = os.environ.get("READTHEDOCS", None)
+rtd_version = os.environ.get("READTHEDOCS_VERSION", "")
+if on_readthedocs:
+    if rtd_version.lower() == "stable":
+        version = ptgp.__version__.split("+")[0]
+    elif rtd_version.lower() == "latest":
+        version = "dev"
+    else:
+        version = rtd_version
+else:
+    rtd_version = "local"
+release = version
+
+# -- General configuration ---------------------------------------------------
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.autosectionlabel",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.mathjax",
+    "numpydoc",
+    "myst_nb",
+    "sphinx_design",
+    "sphinx_copybutton",
+    "sphinx_codeautolink",
+    "jupyter_sphinx",
+    "sphinxcontrib.bibtex",
+    "generate_gallery",
+    "generate_kernel_gallery",
+]
+
+# Bibliographic citations: drop new entries into docs/source/references.bib
+# and cite from prose with `{cite:t}` (textual) or `{cite:p}` (parenthetical).
+bibtex_bibfiles = ["references.bib"]
+bibtex_default_style = "unsrt"
+bibtex_reference_style = "author_year"
+
+# Use the document path as prefix for autosectionlabel anchors so the same
+# section title in two files doesn't collide.
+autosectionlabel_prefix_document = True
+
+templates_path = ["_templates"]
+
+exclude_patterns = [
+    "_build",
+    "**.ipynb_checkpoints",
+    "*/autosummary/*.rst",
+    "Thumbs.db",
+    ".DS_Store",
+]
+
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".md": "myst-nb",
+    ".ipynb": "myst-nb",
+    ".myst": "myst-nb",
+}
+
+master_doc = "index"
+
+# -- Autodoc / autosummary ---------------------------------------------------
+autosummary_generate = True
+autodoc_typehints = "none"
+autoclass_content = "class"
+# Class method pages live under api/.../classmethods/ — keep them out of the
+# global toctree so they don't pollute the sidebar.
+remove_from_toctrees = ["**/classmethods/*"]
+
+numpydoc_show_class_members = False
+numpydoc_xref_param_type = True
+
+# Teach numpydoc about project-specific section headers so they render as
+# proper sections instead of emitting "Unknown section" warnings.
+from numpydoc.docscrape import NumpyDocString  # noqa: E402
+
+for _section in ("Recommended Workflow", "Factorisation", "Fields", "Interrupts"):
+    NumpyDocString.sections.setdefault(_section, [])
+del NumpyDocString
+numpydoc_xref_ignore = {
+    "of",
+    "or",
+    "optional",
+    "default",
+    "numeric",
+    "type",
+    "scalar",
+    "instance",
+    "array",
+    "array_like",
+    "1D",
+    "2D",
+    "3D",
+    "nD",
+    "M",
+    "N",
+    "D",
+    "K",
+}
+
+# -- HTML output -------------------------------------------------------------
+html_theme = "pydata_sphinx_theme"
+html_title = "ptgp"
+html_short_title = "ptgp"
+html_last_updated_fmt = ""
+
+sitemap_url_scheme = f"{{lang}}{rtd_version}/{{link}}"
+
+html_theme_options = {
+    "secondary_sidebar_items": ["page-toc", "edit-this-page", "sourcelink"],
+    "navbar_start": ["navbar-logo"],
+    "show_prev_next": True,
+    "icon_links": [
+        {
+            "url": "https://github.com/bwengals/ptgp",
+            "icon": "fa-brands fa-github",
+            "name": "GitHub",
+            "type": "fontawesome",
+        },
+    ],
+}
+
+github_version = version if "." in rtd_version else "main"
+html_context = {
+    "github_url": "https://github.com",
+    "github_user": "bwengals",
+    "github_repo": "ptgp",
+    "github_version": github_version,
+    "doc_path": "docs/source",
+    "default_mode": "dark",
+}
+
+html_sidebars = {"**": ["sidebar-nav-bs.html", "searchbox.html"]}
+html_static_path = ["_static"]
+
+# -- MyST / MyST-NB config ---------------------------------------------------
+myst_enable_extensions = [
+    "colon_fence",
+    "deflist",
+    "dollarmath",
+    "amsmath",
+    "substitution",
+]
+myst_dmath_double_inline = True
+
+# Notebooks ship pre-rendered. Re-executing them on RTD would require pinning
+# every numerical dep and would slow the build significantly; flip this to
+# "auto" or "force" later if the gallery becomes the source of truth.
+nb_execution_mode = "off"
+
+# -- Intersphinx -------------------------------------------------------------
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3/", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/", None),
+    "jax": ("https://jax.readthedocs.io/en/latest/", None),
+    "pytensor": ("https://pytensor.readthedocs.io/en/latest/", None),
+    "pymc": ("https://www.pymc.io/projects/docs/en/stable/", None),
+    "arviz": ("https://python.arviz.org/en/latest/", None),
+    "myst": ("https://myst-parser.readthedocs.io/en/latest/", None),
+    "myst-nb": ("https://myst-nb.readthedocs.io/en/latest/", None),
+}
diff --git a/docs/source/dev/architecture.rst b/docs/source/dev/architecture.rst
new file mode 100644
index 0000000..398b1a5
--- /dev/null
+++ b/docs/source/dev/architecture.rst
@@ -0,0 +1,13 @@
+Architecture
+============
+
+.. note::
+
+   **WRITEME.** This page is a stub. The authoritative source today is
+   ``DESIGN.md`` at the repo root; port the architectural notes here once
+   the surface stabilizes, and split off the user-facing parts into
+   :doc:`/user_guide/design`.
+
+See ``DESIGN.md`` for the current state of the architecture, including
+the rewrite system, the kernel annotation convention, and the
+training-compiler design.
diff --git a/docs/source/dev/contributing.rst b/docs/source/dev/contributing.rst
new file mode 100644
index 0000000..5d914e2
--- /dev/null
+++ b/docs/source/dev/contributing.rst
@@ -0,0 +1,32 @@
+Contributing
+============
+
+.. note::
+
+   **WRITEME.** This page is a stub. Cover environment setup, the
+   pre-commit hooks, how to run tests, the GPJax-as-reference testing
+   convention, branching / PR workflow, and where new kernels and
+   likelihoods should live.
+
+Development install
+-------------------
+
+.. code-block:: bash
+
+    git clone https://github.com/bwengals/ptgp.git
+    cd ptgp
+    pip install -e ".[dev]"
+    pre-commit install
+
+Running tests
+-------------
+
+ptgp uses pytest:
+
+.. code-block:: bash
+
+    python -m pytest tests/
+
+Many tests compare against GPJax (the reference implementation) at
+``atol=1e-5``. GPJax runs in float32 and ptgp in float64; that tolerance
+is the cross-library cap.
diff --git a/docs/source/dev/docs.rst b/docs/source/dev/docs.rst
new file mode 100644
index 0000000..199d348
--- /dev/null
+++ b/docs/source/dev/docs.rst
@@ -0,0 +1,196 @@
+Working on the docs
+===================
+
+This page covers how the documentation system is wired and the workflows
+needed to extend it — adding a new kernel page, a new notebook example, or
+a citation.
+
+Building locally
+----------------
+
+The docs build from ``docs/`` using the ``ptgp-docs`` conda environment:
+
+.. code-block:: bash
+
+    conda env update -f conda_envs/environment-docs.yaml
+    cd docs
+    make show         # build + open the rendered HTML in the default browser
+    make livehtml     # auto-rebuild + auto-refresh on every save (sphinx-autobuild)
+    make clean        # wipe build/ and all generated source (gallery, thumbnails, autosummary stubs)
+
+Read the Docs builds the same way; ``.readthedocs.yaml`` points at the same
+conda env and ``docs/source/conf.py``.
+
+Layout
+------
+
+Source content lives under ``docs/source/``:
+
+.. list-table::
+    :header-rows: 1
+    :widths: 30 70
+
+    * - Path
+      - Purpose
+    * - ``index.rst``
+      - Landing page + top-level toctree.
+    * - ``api.rst`` + ``api/*.rst``
+      - Autosummary entry points; one file per public submodule.
+    * - ``get_started/``
+      - Install + quickstart + about. Hand-written narrative.
+    * - ``user_guide/``
+      - Conceptual pages (models, kernels, training, design).
+    * - ``kernels/gallery.rst``
+      - Covariance gallery landing page. **Generated** at build time.
+    * - ``kernels/gallery/*.md``
+      - Per-kernel detail pages. Hand-written MyST markdown.
+    * - ``examples/gallery.rst``
+      - Notebook gallery landing page. **Generated** at build time.
+    * - ``examples/<category>/*.ipynb``
+      - Notebook copies staged from ``notebooks/``. **Generated**.
+    * - ``dev/``
+      - This page and other contributor docs.
+    * - ``release/``
+      - Release notes.
+    * - ``references.bib``
+      - BibTeX entries; cited via ``{cite:t}`` or ``{cite:p}``.
+    * - ``_templates/autosummary/``
+      - Sphinx autosummary class template (per-method subpages).
+
+Build-time-generated paths are gitignored via ``docs/.gitignore``; never
+commit anything under ``source/kernels/img/``, ``source/_thumbnails/``,
+``source/examples/<category>/``, ``source/api/**/generated/``, or
+``source/kernels/gallery.rst`` / ``source/examples/gallery.rst``.
+
+The two custom Sphinx extensions live at ``docs/sphinxext/``:
+
+* ``generate_gallery.py`` — discovers notebooks under ``notebooks/``, copies
+  them into ``docs/source/examples/<category>/``, extracts thumbnails, and
+  emits ``examples/gallery.rst``.
+* ``generate_kernel_gallery.py`` — renders one cover image per entry in
+  ``KERNEL_RECIPES``, emits ``kernels/gallery.rst`` with grid cards. Cards
+  link to per-kernel pages when a matching ``kernels/gallery/<slug>.md``
+  exists on disk.
+
+Adding a new kernel to the covariance gallery
+---------------------------------------------
+
+When a new kernel lands in ``ptgp.kernels``, the
+``check-kernel-gallery`` pre-commit hook fails until you either add a cover
+recipe or blacklist the kernel. To add a cover recipe:
+
+1. **Add a builder and recipe** in
+   ``docs/sphinxext/generate_kernel_gallery.py``:
+
+   .. code-block:: python
+
+       def _build_mynewkernel():
+           return MyNewKernel(input_dim=1, ls=1.0), _line()
+
+       KERNEL_RECIPES.append(
+           CoverRecipe("MyNewKernel", "My New Kernel", _build_mynewkernel, "conditional"),
+       )
+
+   ``mode`` is one of ``"samples"``, ``"conditional"``, or ``"heatmap"``.
+   For non-stationary kernels with constrained domains (like ``RandomWalk``)
+   pass ``obs_x=`` / ``obs_y=`` so the conditional-render observations land
+   inside the valid input range.
+
+2. **Write the per-kernel page** at
+   ``docs/source/kernels/gallery/mynewkernel.md`` — the slug must be the
+   lowercase recipe name. Use ``docs/source/kernels/gallery/expquad.md`` as
+   the template. The standard sections are:
+
+   * MyST + Jupytext frontmatter.
+   * H1 title and a row of tag chip links.
+   * Two paragraphs of prose describing what the kernel is and when to use
+     it.
+   * ``## Key properties and parameters`` — table of domain, stationarity,
+     sample smoothness, variance; one-sentence constructor link to the
+     autosummary page.
+   * ``### Covariance function`` — math equation followed by a tab-set with
+     **Kernel decay**, **Prior samples**, **Posterior given 3 observations**,
+     **Code** tabs. The first three tabs use the plot helpers in
+     ``ptgp.plotting``; sync them on ``:sync: ls`` so clicking a
+     lengthscale selection persists across tabs.
+   * ``{seealso}`` admonition cross-linking related kernels.
+   * One prose paragraph at the end with ``{cite:t}`` citations and a
+     ``{bibliography}`` directive filtered by ``docname``.
+
+3. **The gallery extension auto-detects the page**. Cover-card link wiring is
+   automatic — no edits needed elsewhere.
+
+To **blacklist** a kernel instead (composition kernels, kernels that take a
+user-supplied callable, anything that doesn't have a canonical instance to
+visualize), add its name to ``KERNEL_GALLERY_BLACKLIST`` in the same module
+with a one-line comment explaining why.
+
+Adding a notebook example
+-------------------------
+
+Drop the ``.ipynb`` under ``notebooks/``. The ``generate_gallery``
+extension auto-discovers it on the next build, extracts the last image
+output as a thumbnail, and emits a grid card.
+
+To group notebooks into named categories, create subdirectories under
+``notebooks/`` (e.g. ``notebooks/introductory/foo.ipynb``). The subdir name
+becomes the category id; pretty titles are looked up in ``CATEGORY_TITLES``
+inside ``generate_gallery.py`` and fall back to title-casing the folder
+name.
+
+Notebooks must be tracked by git to appear in the gallery (so untracked
+work-in-progress notebooks under ``notebooks/`` don't pollute the build).
+
+Adding a citation
+-----------------
+
+Append the BibTeX entry to ``docs/source/references.bib``, then cite from
+prose with either textual or parenthetical form:
+
+.. code-block:: markdown
+
+    The kernel ridge regression view is discussed by {cite:t}`somekey-2024`.
+    This approach goes back decades {cite:p}`smith-1985`.
+
+Each page that uses citations should also include a per-page bibliography:
+
+.. code-block:: markdown
+
+    ```{bibliography}
+    :filter: docname in docnames
+    ```
+
+The ``:filter: docname in docnames`` clause restricts the rendered
+bibliography to entries cited on the current page. Sphinx will emit
+"duplicate citation" warnings when the same key is rendered from multiple
+pages — these are tolerated; we prefer per-page bibliographies for
+readability.
+
+Plot helpers
+------------
+
+Helpers used by the per-kernel pages live in :mod:`ptgp.plotting`:
+
+* :func:`ptgp.plotting.plot_kernel_decay` — overlaid curves of
+  :math:`k(0, x)` against distance.
+* :func:`ptgp.plotting.plot_prior_samples` — side-by-side panels of GP
+  prior samples for a list of kernel instances.
+* :func:`ptgp.plotting.plot_conditional` — side-by-side panels of GP
+  posterior draws given a few noisy observations, rendered McElreath-style
+  (hollow markers with popsicle error bars).
+
+These are public API — users can call them in their own notebooks to
+reproduce the gallery style. When adding a new visualization that more than
+one kernel page will use, extend ``ptgp.plotting`` rather than putting the
+helper in a docs-only module.
+
+Pre-commit checks
+-----------------
+
+* ``check-kernel-gallery`` — fails when a kernel exported from
+  ``ptgp.kernels`` is missing from both ``KERNEL_RECIPES`` and
+  ``KERNEL_GALLERY_BLACKLIST``. AST-only; no docs deps required.
+* ``ruff`` / ``ruff-format`` — apply to ``docs/sphinxext/`` and
+  ``ptgp.plotting``.
+* ``no-commit-to-branch`` — work on a feature branch, never commit
+  directly to ``main``.
diff --git a/docs/source/dev/index.rst b/docs/source/dev/index.rst
new file mode 100644
index 0000000..7fb18cb
--- /dev/null
+++ b/docs/source/dev/index.rst
@@ -0,0 +1,9 @@
+Developer Guide
+===============
+
+.. toctree::
+   :maxdepth: 1
+
+   contributing
+   architecture
+   docs
diff --git a/docs/source/get_started/about.rst b/docs/source/get_started/about.rst
new file mode 100644
index 0000000..6efcfe5
--- /dev/null
+++ b/docs/source/get_started/about.rst
@@ -0,0 +1,26 @@
+About ptgp
+==========
+
+.. note::
+
+   **WRITEME.** This page is a stub. Fill in project motivation, scope, and
+   how ptgp relates to the rest of the GP ecosystem (GPJax, GPflow,
+   GPyTorch, PyMC's built-in GP).
+
+ptgp is a Gaussian process library built on PyTensor's symbolic graph and
+rewrite system. It is aimed at practitioners who need flexible,
+well-supported GP modeling on real-world datasets — exact GPs, sparse
+variational methods, non-Gaussian likelihoods, and PyMC priors on
+hyperparameters.
+
+Researchers benefit from the underlying design: ptgp models return symbolic
+PyTensor tensors, so writing GP math directly (``pt.linalg.inv(K)``,
+``pt.linalg.slogdet(K)``) lets the compiler pick efficient algorithms based
+on declared matrix structure. This makes it straightforward to implement
+new GP approximations and custom models.
+
+ptgp distills approaches from existing GP libraries to make them more
+accessible, drawing primarily from
+`GPJax <https://github.com/JaxGaussianProcesses/GPJax>`_,
+`GPflow <https://github.com/GPflow/GPflow>`_, and
+`GPyTorch <https://github.com/cornellius-gp/gpytorch>`_.
diff --git a/docs/source/get_started/index.rst b/docs/source/get_started/index.rst
new file mode 100644
index 0000000..31840e0
--- /dev/null
+++ b/docs/source/get_started/index.rst
@@ -0,0 +1,9 @@
+Getting Started
+===============
+
+.. toctree::
+   :maxdepth: 1
+
+   install
+   quickstart
+   about
diff --git a/docs/source/get_started/install.rst b/docs/source/get_started/install.rst
new file mode 100644
index 0000000..fe8668c
--- /dev/null
+++ b/docs/source/get_started/install.rst
@@ -0,0 +1,55 @@
+Installation
+============
+
+.. note::
+
+   **WRITEME.** This page is a stub. Flesh out with full install instructions,
+   supported Python versions, and any platform-specific notes.
+
+ptgp targets Python ``>= 3.12``. It depends on PyMC (``>= 6.0``) and on
+PyTensor's ``main`` branch.
+
+From PyPI
+---------
+
+.. code-block:: bash
+
+    pip install ptgp
+
+.. note::
+
+   ptgp is not yet released on PyPI. Until then, install from source.
+
+From source
+-----------
+
+.. code-block:: bash
+
+    git clone https://github.com/bwengals/ptgp.git
+    cd ptgp
+    pip install -e .
+
+PyTensor and PyMC
+-----------------
+
+ptgp currently requires the development versions of PyTensor (``main``) and
+PyMC (``>= 6.0``). The pinned versions in the project's conda environments
+under ``conda_envs/`` are the reference setup the maintainer uses; mirror
+those if your install is misbehaving.
+
+.. code-block:: bash
+
+    pip install git+https://github.com/pymc-devs/pytensor@main
+    pip install pymc
+
+Development install
+-------------------
+
+.. code-block:: bash
+
+    git clone https://github.com/bwengals/ptgp.git
+    cd ptgp
+    pip install -e ".[dev]"
+    pre-commit install
+
+See :doc:`/dev/contributing` for the rest of the contributor setup.
diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst
new file mode 100644
index 0000000..6171165
--- /dev/null
+++ b/docs/source/get_started/quickstart.rst
@@ -0,0 +1,52 @@
+Quickstart
+==========
+
+.. note::
+
+   **WRITEME.** This page is a stub. Walk a new user end-to-end through
+   fitting and predicting with one of the GP models. Cross-link to the
+   :doc:`/examples/gallery` for the full notebooks.
+
+The snippet below trains an SVGP on a small toy dataset.
+
+.. code-block:: python
+
+    import numpy as np
+    import pymc as pm
+    import pytensor.tensor as pt
+    import ptgp as pg
+
+    X = np.random.randn(200, 1)
+    y = np.sin(X.ravel()) + 0.1 * np.random.randn(200)
+    Z = np.linspace(-2, 2, 20)[:, None]
+
+    with pm.Model() as model:
+        ls = pm.InverseGamma("ls", alpha=2.0, beta=1.0)
+        eta = pm.Exponential("eta", lam=1.0)
+        kernel = eta**2 * pg.kernels.Matern52(input_dim=1, ls=ls)
+
+        vp = pg.gp.init_variational_params(M=20)
+        svgp = pg.gp.SVGP(
+            kernel=kernel,
+            likelihood=pg.likelihoods.Gaussian(sigma=0.1),
+            inducing_variable=pg.inducing.Points(pt.as_tensor(Z)),
+            variational_params=vp,
+        )
+
+    X_var = pt.matrix("X", shape=(None, 1))
+    y_var = pt.vector("y", shape=(None,))
+
+    step, shared_params, shared_extras = pg.optim.compile_training_step(
+        lambda gp, X, y: pg.objectives.elbo(gp, X, y).elbo,
+        svgp, X_var, y_var,
+        model=model,
+        extra_vars=vp.extra_vars,
+        extra_init=vp.extra_init,
+        learning_rate=1e-2,
+    )
+
+    for _ in range(500):
+        loss = step(X, y)
+
+See :doc:`/examples/gallery` for the full end-to-end notebooks covering
+``Unapproximated``, ``VFE``, and ``SVGP``.
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..8a187eb
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,65 @@
+ptgp
+====
+
+A Gaussian process library for building GP models that solve real-world
+problems. Built on PyTensor's symbolic graph and rewrite system, with PyMC
+priors and native optimizers.
+
+ptgp ships exact GPs, sparse VFE, SVGP with minibatch training, and Fourier
+features; a full kernel library with composition and ``active_dims``;
+non-Gaussian likelihoods; and a training toolbox with L-BFGS-B and Adam,
+per-parameter learning rates, staged optimization, and inducing-point
+initialization.
+
+Quick install
+-------------
+
+.. code-block:: bash
+
+    pip install -e .
+
+Requires PyTensor from ``main`` and PyMC ``>= 6.0``. See the
+:doc:`installation guide <get_started/install>` for details.
+
+Quick example
+-------------
+
+.. code-block:: python
+
+    import numpy as np
+    import pymc as pm
+    import pytensor.tensor as pt
+    import ptgp as pg
+
+    X = np.random.randn(200, 1)
+    y = np.sin(X.ravel()) + 0.1 * np.random.randn(200)
+    Z = np.linspace(-2, 2, 20)[:, None]
+
+    with pm.Model() as model:
+        ls = pm.InverseGamma("ls", alpha=2.0, beta=1.0)
+        eta = pm.Exponential("eta", lam=1.0)
+        kernel = eta**2 * pg.kernels.Matern52(input_dim=1, ls=ls)
+
+        vp = pg.gp.init_variational_params(M=20)
+        svgp = pg.gp.SVGP(
+            kernel=kernel,
+            likelihood=pg.likelihoods.Gaussian(sigma=0.1),
+            inducing_variable=pg.inducing.Points(pt.as_tensor(Z)),
+            variational_params=vp,
+        )
+
+See the :doc:`example gallery <examples/gallery>` for full end-to-end
+walkthroughs.
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+   :titlesonly:
+
+   get_started/index
+   user_guide/index
+   kernels/gallery
+   examples/gallery
+   api
+   dev/index
+   release/index
diff --git a/docs/source/kernels/gallery/expquad.md b/docs/source/kernels/gallery/expquad.md
new file mode 100644
index 0000000..f050064
--- /dev/null
+++ b/docs/source/kernels/gallery/expquad.md
@@ -0,0 +1,130 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: ptgp
+---
+
+# Exponentiated Quadratic Kernel
+
+[Stationary](../gallery_tags.rst#stationary), [Isotropic](../gallery_tags.rst#isotropic), [Smooth](../gallery_tags.rst#smooth), [Universal](../gallery_tags.rst#universal)
+
+The exponentiated quadratic kernel — also known as the **squared exponential**, **RBF**, or **Gaussian** kernel — measures covariance with the squared Euclidean distance between inputs, decaying as a Gaussian in distance. It is the default first choice in most GP applications.
+
+Sample functions drawn from a GP with an ExpQuad covariance are infinitely differentiable, the smoothest functions a GP can produce. This makes the kernel appropriate when the function being modeled is genuinely smooth, but it also encodes a strong prior assumption: when the truth is rougher than ExpQuad expects, posterior credible intervals between observations are overconfident. The [Matérn 5/2](matern52.md) kernel is a common less aggressive substitute that preserves twice-differentiability without ExpQuad's strong smoothness assumption.
+
+## Key properties and parameters
+
+```{eval-rst}
+=================  =================================================
+Domain             :math:`x \in \mathbb{R}^D`
+Stationary         Yes
+Sample smoothness  :math:`C^\infty` — infinitely differentiable
+Variance           :math:`k(x, x) = 1`
+=================  =================================================
+```
+
+Constructor: [`ptgp.kernels.ExpQuad(input_dim, ls, active_dims=None)`](../../api/generated/ptgp.kernels.ExpQuad.rst). The lengthscale `ls` can be either a scalar (isotropic, one length-scale shared across input dimensions) or a vector of length `len(active_dims)` (one lengthscale per dimension — the standard GP recipe for letting the model down-weight dimensions that don't carry signal). Scale the kernel with multiplication: `eta**2 * ExpQuad(...)`; per ptgp convention `eta` is always squared.
+
+### Covariance function
+
+$$
+k(x, y) = \exp\left( -\frac{\lVert x - y \rVert^2}{2\, \ell^2} \right)
+$$
+
+where $\ell$ is the lengthscale (`ls`). The kernel is `1` when $x = y$ and decays to `0` as $\lVert x - y \rVert / \ell$ grows. By a distance of $3\ell$, the covariance is already below `0.012`.
+
+::::::{tab-set}
+:::::{tab-item} Kernel decay
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+import ptgp as pg
+from ptgp.plotting import plot_kernel_decay
+plot_kernel_decay(
+    [
+        (r"$\ell=0.3$", pg.kernels.ExpQuad(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.ExpQuad(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.ExpQuad(input_dim=1, ls=5.0)),
+    ],
+    max_distance=5.0,
+)
+```
+:::::
+
+:::::{tab-item} Prior samples
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_prior_samples
+plot_prior_samples(
+    [
+        (r"$\ell=0.3$", pg.kernels.ExpQuad(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.ExpQuad(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.ExpQuad(input_dim=1, ls=5.0)),
+    ],
+    n_samples=4,
+)
+```
+:::::
+
+:::::{tab-item} Posterior given 3 observations
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_conditional
+plot_conditional(
+    [
+        (r"$\ell=0.3$", pg.kernels.ExpQuad(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.ExpQuad(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.ExpQuad(input_dim=1, ls=5.0)),
+    ],
+    n_draws=3,
+)
+```
+:::::
+
+:::::{tab-item} Code
+
+```{jupyter-execute}
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import ptgp as pg
+
+kernel = pg.kernels.ExpQuad(input_dim=1, ls=1.0)
+
+X_sym = pt.matrix("X", shape=(None, 1))
+K_fn = pytensor.function([X_sym], kernel(X_sym))
+
+X = np.linspace(-3, 3, 200)[:, None]
+K = K_fn(X)
+print(K.shape, K.diagonal()[:3])
+```
+:::::
+::::::
+
+The three lengthscales show the kernel's effect on prior beliefs:
+
+- **$\ell=0.3$** — short lengthscale, wiggly draws, posterior reverts to the prior quickly outside data.
+- **$\ell=1.5$** — moderate lengthscale, smooth interpolation between observations.
+- **$\ell=5.0$** — long lengthscale, very smooth, draws nearly straight between observations and extrapolate confidently.
+
+```{seealso}
+- [Matern52](matern52.md) — same family at smoothness $\nu = 5/2$. Sample functions are twice mean-square differentiable. Recommended default when ExpQuad's $C^\infty$ assumption is too strong.
+- [Matern32](matern32.md) — once mean-square differentiable; visibly rougher samples.
+- [Matern12](matern12.md) — the exponential kernel ($\nu = 1/2$). Sample functions are continuous but nowhere differentiable.
+```
+
+The exponentiated quadratic is the canonical GP covariance and is discussed across all standard treatments; {cite:t}`rasmussen-williams-2006` Chapter 4 is the standard reference.
+
+```{bibliography}
+:filter: docname in docnames
+```
diff --git a/docs/source/kernels/gallery/linear.md b/docs/source/kernels/gallery/linear.md
new file mode 100644
index 0000000..cd51307
--- /dev/null
+++ b/docs/source/kernels/gallery/linear.md
@@ -0,0 +1,113 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: ptgp
+---
+
+# Linear Kernel
+
+[Non-stationary](../gallery_tags.rst#non-stationary), [Isotropic](../gallery_tags.rst#isotropic)
+
+The linear (dot-product) kernel produces sample functions that are straight lines through the center point $c$. It is the GP equivalent of Bayesian linear regression: the posterior mean is a linear function of the inputs and the posterior variance reflects uncertainty in the slope.
+
+Because the covariance grows without bound as inputs move away from the center, a GP with a pure linear kernel has widening uncertainty far from the data.  A GP with a linear kernel is equivalent to a linear regression $y = \beta_0 + \beta_1 x$.
+
+## Key properties and parameters
+
+```{eval-rst}
+=================  ===================================================
+Domain             :math:`x \in \mathbb{R}^D`
+Stationary         **No** — covariance depends on absolute position
+Sample smoothness  Linear (straight lines in 1-D)
+Variance           :math:`k(x, x) = \lVert x - c \rVert^2`
+=================  ===================================================
+```
+
+Constructor: [`ptgp.kernels.Linear(input_dim, c=0.0, active_dims=None)`](../../api/generated/ptgp.kernels.Linear.rst). The center `c` can be a scalar (same center for every active dimension) or a vector of length `len(active_dims)`. Scale the kernel with multiplication: `eta**2 * Linear(...)`.
+
+### Covariance function
+
+$$
+k(x, y) = (x - c)^\top (y - c)
+$$
+
+where $c$ is the center. When $c = 0$ this reduces to the standard dot product $x^\top y$. Shifting $c$ moves the pivot point around which the linear functions rotate.
+
+::::::{tab-set}
+:::::{tab-item} Prior samples
+:sync: c
+
+```{jupyter-execute}
+:hide-code:
+import ptgp as pg
+from ptgp.plotting import plot_prior_samples
+plot_prior_samples(
+    [
+        (r"$c=-1$", pg.kernels.Linear(input_dim=1, c=-1.0)),
+        (r"$c=0$", pg.kernels.Linear(input_dim=1, c=0.0)),
+        (r"$c=1$", pg.kernels.Linear(input_dim=1, c=1.0)),
+    ],
+    n_samples=4,
+)
+```
+:::::
+
+:::::{tab-item} Posterior given 3 observations
+:sync: c
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_conditional
+plot_conditional(
+    [
+        (r"$c=-1$", pg.kernels.Linear(input_dim=1, c=-1.0)),
+        (r"$c=0$", pg.kernels.Linear(input_dim=1, c=0.0)),
+        (r"$c=1$", pg.kernels.Linear(input_dim=1, c=1.0)),
+    ],
+    n_draws=3,
+    y_range=(-2.0, 2.0),
+)
+```
+:::::
+
+:::::{tab-item} Code
+
+```{jupyter-execute}
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import ptgp as pg
+
+kernel = pg.kernels.Linear(input_dim=1, c=0.0)
+
+X_sym = pt.matrix("X", shape=(None, 1))
+K_fn = pytensor.function([X_sym], kernel(X_sym))
+
+X = np.linspace(-3, 3, 200)[:, None]
+K = K_fn(X)
+print(K.shape, K.diagonal()[:3])
+```
+:::::
+::::::
+
+The three centers show the kernel's effect on prior beliefs:
+
+- **$c = -1$** — lines pivot around $x = -1$; uncertainty is smallest near $-1$ and grows in both directions.
+- **$c = 0$** — the standard dot-product kernel; lines pass through the origin.
+- **$c = 1$** — lines pivot around $x = 1$.
+
+```{seealso}
+- [ExpQuad](expquad.md) — the default stationary kernel. Combine with Linear to model a smooth trend plus local structure.
+- [Random Walk](randomwalk.md) — another non-stationary kernel, but with rough (nowhere-differentiable) sample paths rather than straight lines.
+```
+
+The linear kernel and its connection to Bayesian linear regression are discussed in {cite:t}`rasmussen-williams-2006` Chapter 2.
+
+```{bibliography}
+:filter: docname in docnames
+```
diff --git a/docs/source/kernels/gallery/matern12.md b/docs/source/kernels/gallery/matern12.md
new file mode 100644
index 0000000..246f6ce
--- /dev/null
+++ b/docs/source/kernels/gallery/matern12.md
@@ -0,0 +1,125 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: ptgp
+---
+
+# Matérn 1/2 (Exponential) Kernel
+
+[Stationary](../gallery_tags.rst#stationary), [Isotropic](../gallery_tags.rst#isotropic), [Very Rough](../gallery_tags.rst#very-rough)
+
+The Matérn 1/2 kernel — also known as the **exponential kernel** — produces sample functions that are continuous but nowhere differentiable. The resulting GP is equivalent to an Ornstein–Uhlenbeck process: a mean-reverting random walk in continuous time. Sample paths look like noisy zig-zag trajectories rather than smooth curves.
+
+This kernel is rarely appropriate as a model of a deterministic underlying function — its sample paths are simply too rough for most real-world signals — but it is useful as a *control* against which to compare smoother kernels, and as a building block for compositions where rapid local variation is desired on top of a smoother trend.
+
+## Key properties and parameters
+
+```{eval-rst}
+=================  =================================================
+Domain             :math:`x \in \mathbb{R}^D`
+Stationary         Yes
+Sample smoothness  :math:`C^0` — continuous, nowhere differentiable
+Variance           :math:`k(x, x) = 1`
+=================  =================================================
+```
+
+Constructor: [`ptgp.kernels.Matern12(input_dim, ls, active_dims=None)`](../../api/generated/ptgp.kernels.Matern12.rst). The lengthscale `ls` can be scalar (isotropic) or a vector of length `len(active_dims)` (per-dimension). Scale with `eta**2 * Matern12(...)`.
+
+### Covariance function
+
+$$
+k(x, y) = \exp\!\left( -r \right),
+\qquad r = \frac{\lVert x - y \rVert}{\ell}
+$$
+
+where $\ell$ is the lengthscale (`ls`). This is the slowest-decaying member of the Matérn family at long distances.
+
+::::::{tab-set}
+:::::{tab-item} Kernel decay
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+import ptgp as pg
+from ptgp.plotting import plot_kernel_decay
+plot_kernel_decay(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern12(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern12(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern12(input_dim=1, ls=5.0)),
+    ],
+    max_distance=5.0,
+)
+```
+:::::
+
+:::::{tab-item} Prior samples
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_prior_samples
+plot_prior_samples(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern12(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern12(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern12(input_dim=1, ls=5.0)),
+    ],
+    n_samples=4,
+)
+```
+:::::
+
+:::::{tab-item} Posterior given 3 observations
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_conditional
+plot_conditional(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern12(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern12(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern12(input_dim=1, ls=5.0)),
+    ],
+    n_draws=3,
+)
+```
+:::::
+
+:::::{tab-item} Code
+
+```{jupyter-execute}
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import ptgp as pg
+
+kernel = pg.kernels.Matern12(input_dim=1, ls=1.0)
+
+X_sym = pt.matrix("X", shape=(None, 1))
+K_fn = pytensor.function([X_sym], kernel(X_sym))
+
+X = np.linspace(-3, 3, 200)[:, None]
+K = K_fn(X)
+print(K.shape, K.diagonal()[:3])
+```
+:::::
+::::::
+
+```{seealso}
+- [Matern32](matern32.md) — same family at $\nu = 3/2$. Once mean-square differentiable; visibly smoother draws.
+- [Matern52](matern52.md) — same family at $\nu = 5/2$. Twice differentiable; the standard recommended default.
+- [RandomWalk](randomwalk.md) — also produces nowhere-differentiable sample paths, but non-stationary (variance grows with input).
+```
+
+The Matérn family and its smoothness-tuning role in GP modeling is discussed in {cite:t}`rasmussen-williams-2006` Chapter 4.
+
+```{bibliography}
+:filter: docname in docnames
+```
diff --git a/docs/source/kernels/gallery/matern32.md b/docs/source/kernels/gallery/matern32.md
new file mode 100644
index 0000000..fe31f92
--- /dev/null
+++ b/docs/source/kernels/gallery/matern32.md
@@ -0,0 +1,125 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: ptgp
+---
+
+# Matérn 3/2 Kernel
+
+[Stationary](../gallery_tags.rst#stationary), [Isotropic](../gallery_tags.rst#isotropic), [Rough](../gallery_tags.rst#rough)
+
+The Matérn 3/2 kernel produces sample functions that are once mean-square differentiable but no smoother — the function itself and its first derivative exist and are continuous, but the second derivative does not. This is the right kernel when the process you are modeling has a clearly defined velocity but acceleration is a meaningless concept (e.g. abrupt regime changes, piecewise-linear-ish behavior on small scales).
+
+Matérn 3/2 sits between Matérn 5/2 (twice differentiable, the default smooth kernel) and Matérn 1/2 (the exponential kernel, nowhere differentiable). Compared to Matérn 5/2, draws look visibly more jittery while still respecting overall trends.
+
+## Key properties and parameters
+
+```{eval-rst}
+=================  =================================================
+Domain             :math:`x \in \mathbb{R}^D`
+Stationary         Yes
+Sample smoothness  :math:`C^1` — once mean-square differentiable
+Variance           :math:`k(x, x) = 1`
+=================  =================================================
+```
+
+Constructor: [`ptgp.kernels.Matern32(input_dim, ls, active_dims=None)`](../../api/generated/ptgp.kernels.Matern32.rst). The lengthscale `ls` can be scalar (isotropic) or a vector of length `len(active_dims)` (per-dimension). Scale with `eta**2 * Matern32(...)`.
+
+### Covariance function
+
+$$
+k(x, y) = \left( 1 + \sqrt{3}\, r \right) \exp\!\left( -\sqrt{3}\, r \right),
+\qquad r = \frac{\lVert x - y \rVert}{\ell}
+$$
+
+where $\ell$ is the lengthscale (`ls`).
+
+::::::{tab-set}
+:::::{tab-item} Kernel decay
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+import ptgp as pg
+from ptgp.plotting import plot_kernel_decay
+plot_kernel_decay(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern32(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern32(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern32(input_dim=1, ls=5.0)),
+    ],
+    max_distance=5.0,
+)
+```
+:::::
+
+:::::{tab-item} Prior samples
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_prior_samples
+plot_prior_samples(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern32(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern32(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern32(input_dim=1, ls=5.0)),
+    ],
+    n_samples=4,
+)
+```
+:::::
+
+:::::{tab-item} Posterior given 3 observations
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_conditional
+plot_conditional(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern32(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern32(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern32(input_dim=1, ls=5.0)),
+    ],
+    n_draws=3,
+)
+```
+:::::
+
+:::::{tab-item} Code
+
+```{jupyter-execute}
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import ptgp as pg
+
+kernel = pg.kernels.Matern32(input_dim=1, ls=1.0)
+
+X_sym = pt.matrix("X", shape=(None, 1))
+K_fn = pytensor.function([X_sym], kernel(X_sym))
+
+X = np.linspace(-3, 3, 200)[:, None]
+K = K_fn(X)
+print(K.shape, K.diagonal()[:3])
+```
+:::::
+::::::
+
+```{seealso}
+- [Matern52](matern52.md) — same family at $\nu = 5/2$. One additional order of mean-square differentiability; the standard "smooth-but-not-analytic" choice.
+- [Matern12](matern12.md) — the exponential kernel at $\nu = 1/2$. Continuous but nowhere differentiable.
+- [ExpQuad](expquad.md) — the limit $\nu \to \infty$. Sample functions are infinitely differentiable.
+```
+
+The Matérn family and its smoothness-tuning role in GP modeling is discussed in {cite:t}`rasmussen-williams-2006` Chapter 4.
+
+```{bibliography}
+:filter: docname in docnames
+```
diff --git a/docs/source/kernels/gallery/matern52.md b/docs/source/kernels/gallery/matern52.md
new file mode 100644
index 0000000..23cdb3c
--- /dev/null
+++ b/docs/source/kernels/gallery/matern52.md
@@ -0,0 +1,125 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: ptgp
+---
+
+# Matérn 5/2 Kernel
+
+[Stationary](../gallery_tags.rst#stationary), [Isotropic](../gallery_tags.rst#isotropic), [Smooth](../gallery_tags.rst#smooth)
+
+The Matérn 5/2 kernel is the smoothest member of the Matérn family that is *not* infinitely differentiable. Sample functions are twice mean-square differentiable — the function and its first derivative are well-defined and continuous, but the curvature can have jumps. This is enough smoothness for most real-world processes and avoids the overconfident extrapolation that the [exponentiated quadratic](expquad.md) tends to produce when the true function is not analytic.
+
+In practice Matérn 5/2 is the recommended default kernel when you don't have a specific reason to assume infinite differentiability. It is the standard choice across Bayesian optimization, geostatistics, and surrogate modeling literatures.
+
+## Key properties and parameters
+
+```{eval-rst}
+=================  =================================================
+Domain             :math:`x \in \mathbb{R}^D`
+Stationary         Yes
+Sample smoothness  :math:`C^2` — twice mean-square differentiable
+Variance           :math:`k(x, x) = 1`
+=================  =================================================
+```
+
+Constructor: [`ptgp.kernels.Matern52(input_dim, ls, active_dims=None)`](../../api/generated/ptgp.kernels.Matern52.rst). The lengthscale `ls` can be scalar (isotropic) or a vector of length `len(active_dims)` (per-dimension). Scale with `eta**2 * Matern52(...)`.
+
+### Covariance function
+
+$$
+k(x, y) = \left( 1 + \sqrt{5}\, r + \tfrac{5}{3} r^2 \right) \exp\!\left( -\sqrt{5}\, r \right),
+\qquad r = \frac{\lVert x - y \rVert}{\ell}
+$$
+
+where $\ell$ is the lengthscale (`ls`). The covariance decays slower than the exponentiated quadratic at moderate distances and faster than Matérn 3/2.
+
+::::::{tab-set}
+:::::{tab-item} Kernel decay
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+import ptgp as pg
+from ptgp.plotting import plot_kernel_decay
+plot_kernel_decay(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern52(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern52(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern52(input_dim=1, ls=5.0)),
+    ],
+    max_distance=5.0,
+)
+```
+:::::
+
+:::::{tab-item} Prior samples
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_prior_samples
+plot_prior_samples(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern52(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern52(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern52(input_dim=1, ls=5.0)),
+    ],
+    n_samples=4,
+)
+```
+:::::
+
+:::::{tab-item} Posterior given 3 observations
+:sync: ls
+
+```{jupyter-execute}
+:hide-code:
+from ptgp.plotting import plot_conditional
+plot_conditional(
+    [
+        (r"$\ell=0.3$", pg.kernels.Matern52(input_dim=1, ls=0.3)),
+        (r"$\ell=1.5$", pg.kernels.Matern52(input_dim=1, ls=1.5)),
+        (r"$\ell=5.0$", pg.kernels.Matern52(input_dim=1, ls=5.0)),
+    ],
+    n_draws=3,
+)
+```
+:::::
+
+:::::{tab-item} Code
+
+```{jupyter-execute}
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import ptgp as pg
+
+kernel = pg.kernels.Matern52(input_dim=1, ls=1.0)
+
+X_sym = pt.matrix("X", shape=(None, 1))
+K_fn = pytensor.function([X_sym], kernel(X_sym))
+
+X = np.linspace(-3, 3, 200)[:, None]
+K = K_fn(X)
+print(K.shape, K.diagonal()[:3])
+```
+:::::
+::::::
+
+```{seealso}
+- [ExpQuad](expquad.md) — the limit $\nu \to \infty$ of the Matérn family. Sample functions are infinitely differentiable; encodes a stronger smoothness assumption.
+- [Matern32](matern32.md) — same family at $\nu = 3/2$. Sample functions are only once mean-square differentiable; visibly rougher draws.
+- [Matern12](matern12.md) — the exponential kernel at $\nu = 1/2$. Continuous but nowhere differentiable.
+```
+
+The Matérn family and its smoothness-tuning role in GP modeling is discussed in {cite:t}`rasmussen-williams-2006` Chapter 4.
+
+```{bibliography}
+:filter: docname in docnames
+```
diff --git a/docs/source/kernels/gallery/randomwalk.md b/docs/source/kernels/gallery/randomwalk.md
new file mode 100644
index 0000000..a5979cf
--- /dev/null
+++ b/docs/source/kernels/gallery/randomwalk.md
@@ -0,0 +1,103 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: ptgp
+---
+
+# Random Walk Kernel
+
+[Non-stationary](../gallery_tags.rst#non-stationary), [1-D](../gallery_tags.rst#one-d), [Rough](../gallery_tags.rst#rough)
+
+The random walk kernel — the covariance function of standard Brownian motion (the Wiener process) — is the textbook example of a non-stationary GP kernel. Its covariance depends not just on the distance between inputs but on their actual locations: $k(x, y) = \min(x, y)$ means that variance grows linearly with the input value, and two points are correlated only via their shared "history" from the origin.
+
+Use this kernel for processes that genuinely accumulate randomness over time: drifting hardware sensors, financial paths, particle trajectories. Sample paths are continuous but nowhere differentiable, qualitatively similar to Matérn 1/2 but with an explicit notion of an origin from which uncertainty grows.
+
+## Key properties and parameters
+
+```{eval-rst}
+=================  ===================================================
+Domain             :math:`x \in \mathbb{R}^+` (positive scalars, 1-D)
+Stationary         **No** — variance grows linearly with input
+Sample smoothness  :math:`C^0` — continuous, nowhere differentiable
+Variance           :math:`k(x, x) = x`
+=================  ===================================================
+```
+
+Constructor: [`ptgp.kernels.RandomWalk(input_dim=1, active_dims=None)`](../../api/generated/ptgp.kernels.RandomWalk.rst). The kernel has no shape parameters of its own; scale it with `eta**2 * RandomWalk()`. It operates on a single input column — pass `active_dims=[k]` to pick a column when `input_dim > 1`.
+
+### Covariance function
+
+$$
+k(x, y) = \min(x, y)
+$$
+
+for $x, y \in \mathbb{R}^+$. Two inputs share the variance contributed by their common interval from the origin to the smaller of the two; beyond that point their increments are independent.
+
+::::::{tab-set}
+:::::{tab-item} Prior samples
+
+```{jupyter-execute}
+:hide-code:
+import ptgp as pg
+from ptgp.plotting import plot_prior_samples
+plot_prior_samples(
+    [("Random Walk", pg.kernels.RandomWalk(input_dim=1))],
+    n_samples=4,
+    x_range=(0.05, 5.0),
+)
+```
+:::::
+
+:::::{tab-item} Posterior given 3 observations
+
+```{jupyter-execute}
+:hide-code:
+import numpy as np
+from ptgp.plotting import plot_conditional
+plot_conditional(
+    [("Random Walk", pg.kernels.RandomWalk(input_dim=1))],
+    n_draws=3,
+    x_range=(0.05, 5.0),
+    obs_x=np.array([[0.6], [1.6], [4.0]]),
+    obs_y=np.array([0.45, 0.55, -1.10]),
+)
+```
+:::::
+
+:::::{tab-item} Code
+
+```{jupyter-execute}
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import ptgp as pg
+
+kernel = pg.kernels.RandomWalk(input_dim=1)
+
+X_sym = pt.matrix("X", shape=(None, 1))
+K_fn = pytensor.function([X_sym], kernel(X_sym))
+
+X = np.linspace(0.05, 5.0, 200)[:, None]
+K = K_fn(X)
+print(K.shape, K.diagonal()[:3])
+```
+:::::
+::::::
+
+Prior samples fan out from the origin; the posterior pinches around each observation and the uncertainty between them is bounded by how the Brownian variance accumulates between data points. Far past the last observation, the posterior reverts toward the prior and uncertainty grows without bound.
+
+```{seealso}
+- [Matern12](matern12.md) — stationary kernel that also produces nowhere-differentiable sample paths, but without the Brownian "variance grows from an origin" structure.
+- [Matern52](matern52.md) — smooth stationary alternative; use this when "drift over time" is better modeled as a smooth trend than as accumulating randomness.
+```
+
+The random walk kernel and the broader connection between GPs and stochastic processes are covered in {cite:t}`rasmussen-williams-2006` Chapter 4.
+
+```{bibliography}
+:filter: docname in docnames
+```
diff --git a/docs/source/references.bib b/docs/source/references.bib
new file mode 100644
index 0000000..a05222f
--- /dev/null
+++ b/docs/source/references.bib
@@ -0,0 +1,8 @@
+@book{rasmussen-williams-2006,
+    author    = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
+    title     = {Gaussian Processes for Machine Learning},
+    publisher = {The MIT Press},
+    year      = {2006},
+    isbn      = {026218253X},
+    url       = {http://www.gaussianprocess.org/gpml/},
+}
diff --git a/docs/source/release/index.rst b/docs/source/release/index.rst
new file mode 100644
index 0000000..8991222
--- /dev/null
+++ b/docs/source/release/index.rst
@@ -0,0 +1,10 @@
+Release Notes
+=============
+
+.. note::
+
+   **WRITEME.** This page is a stub. Add a section per release with the
+   user-visible changes — features, breaking changes, bug fixes. Mirror
+   the format used by PyMC / PyTensor release notes.
+
+ptgp is currently in a prototype phase; no tagged releases yet.
diff --git a/docs/source/user_guide/design.rst b/docs/source/user_guide/design.rst
new file mode 100644
index 0000000..a78027b
--- /dev/null
+++ b/docs/source/user_guide/design.rst
@@ -0,0 +1,26 @@
+Design Overview
+===============
+
+.. note::
+
+   **WRITEME.** This page is a stub. The full design notes live in
+   ``DESIGN.md`` at the repo root; port the relevant sections here and
+   prune anything that's no longer accurate.
+
+ptgp is built on PyTensor's symbolic graph and rewrite system. Kernels,
+likelihoods, and GP models return symbolic PyTensor tensors with naive
+linear algebra (``pt.linalg.inv(K)``, ``pt.linalg.slogdet(K)``) that
+PyTensor's rewrite system lowers to efficient, Cholesky-based code using
+declared matrix properties.
+
+Key design choices:
+
+- **PyMC priors only.** ``pm.Model()`` is used as a prior container; ptgp
+  never invokes a PyMC sampler.
+- **Naive linear algebra in user code.** Don't hand-roll Cholesky chains —
+  write the math symbolically and let the rewrite system specialize.
+- **Native kernels.** Kernels are implemented in ptgp, not reused from
+  PyMC. Long-term goal is for PyMC to depend on ptgp for kernels.
+- **Standalone objectives.** Loss functions are free functions, one per
+  model, with a uniform ``(model, X, y)`` signature suitable for the
+  training compilers.
diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst
new file mode 100644
index 0000000..a33d844
--- /dev/null
+++ b/docs/source/user_guide/index.rst
@@ -0,0 +1,12 @@
+User Guide
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   models
+   kernels
+   likelihoods
+   inducing
+   training
+   design
diff --git a/docs/source/user_guide/inducing.rst b/docs/source/user_guide/inducing.rst
new file mode 100644
index 0000000..3a2d069
--- /dev/null
+++ b/docs/source/user_guide/inducing.rst
@@ -0,0 +1,27 @@
+Inducing Points
+===============
+
+.. note::
+
+   **WRITEME.** This page is a stub. Cover the three initialization
+   strategies (random subsample, k-means, greedy variance), the
+   diagnostics namedtuples they return, and when to prefer each.
+
+ptgp provides three init routines for inducing point locations, each
+paired with a diagnostics object:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 65
+
+   * - Function
+     - Diagnostics
+   * - :func:`~ptgp.inducing.random_subsample_init`
+     - :class:`~ptgp.inducing.RandomSubsampleDiagnostics`
+   * - :func:`~ptgp.inducing.kmeans_init`
+     - :class:`~ptgp.inducing.KMeansDiagnostics`
+   * - :func:`~ptgp.inducing.greedy_variance_init`
+     - :class:`~ptgp.inducing.GreedyVarianceDiagnostics`
+
+All three return a :class:`~ptgp.inducing.Points` object wrapping a numpy
+array, so ``ip.Z`` is directly usable for plotting.
diff --git a/docs/source/user_guide/kernels.rst b/docs/source/user_guide/kernels.rst
new file mode 100644
index 0000000..ab366de
--- /dev/null
+++ b/docs/source/user_guide/kernels.rst
@@ -0,0 +1,36 @@
+Kernels
+=======
+
+.. note::
+
+   **WRITEME.** This page is a stub. Cover the kernel catalogue,
+   composition via ``+`` and ``*``, ``active_dims``, the ``eta`` scale
+   convention, and the ``pt.assume(K, symmetric=True, positive_definite=True)``
+   annotation pattern.
+
+ptgp kernels are implemented natively (not reused from PyMC) and operate on
+matrix pairs. ``Kernel.__call__(X, Y=None)`` returns a symbolic PyTensor
+tensor. When called with ``Y=None`` the kernel produces ``K(X, X)``
+annotated as ``symmetric=True, positive_definite=True`` so downstream
+rewrites can specialize.
+
+Available families
+------------------
+
+- **Stationary**: :class:`~ptgp.kernels.ExpQuad`,
+  :class:`~ptgp.kernels.Matern52`, :class:`~ptgp.kernels.Matern32`,
+  :class:`~ptgp.kernels.Matern12`.
+- **Non-stationary**: :class:`~ptgp.kernels.Gibbs`,
+  :class:`~ptgp.kernels.RandomWalk`, :class:`~ptgp.kernels.WarpedInput`.
+- **Categorical**: :class:`~ptgp.kernels.Overlap`,
+  :class:`~ptgp.kernels.LowRankCategorical` for multi-class or categorical
+  inputs.
+- **Composition**: :class:`~ptgp.kernels.SumKernel`,
+  :class:`~ptgp.kernels.ProductKernel`, produced by the ``+`` and ``*``
+  operators on existing kernels.
+
+Scale convention
+----------------
+
+Kernels are scaled by ``eta**2`` (e.g. ``eta**2 * ExpQuad(ls=ls)``), so
+``eta`` is always squared.
diff --git a/docs/source/user_guide/likelihoods.rst b/docs/source/user_guide/likelihoods.rst
new file mode 100644
index 0000000..9ea2151
--- /dev/null
+++ b/docs/source/user_guide/likelihoods.rst
@@ -0,0 +1,16 @@
+Likelihoods
+===========
+
+.. note::
+
+   **WRITEME.** This page is a stub. Cover when to use each likelihood,
+   how Gaussian likelihoods unlock the collapsed bound for VFE, and how
+   non-Gaussian likelihoods change the SVGP training story.
+
+ptgp supports both Gaussian and non-Gaussian observation models:
+
+- :class:`~ptgp.likelihoods.Gaussian`
+- :class:`~ptgp.likelihoods.Bernoulli`
+- :class:`~ptgp.likelihoods.Poisson`
+- :class:`~ptgp.likelihoods.NegativeBinomial`
+- :class:`~ptgp.likelihoods.StudentT`
diff --git a/docs/source/user_guide/models.rst b/docs/source/user_guide/models.rst
new file mode 100644
index 0000000..688265c
--- /dev/null
+++ b/docs/source/user_guide/models.rst
@@ -0,0 +1,32 @@
+GP Models
+=========
+
+.. note::
+
+   **WRITEME.** This page is a stub. Walk through ``Unapproximated``,
+   ``VFE``, and ``SVGP`` — when to use each, what they assume about your
+   data, and how to set them up. Cross-link to the corresponding objective
+   in :mod:`ptgp.objectives`.
+
+ptgp ships three user-facing GP models:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 15 60
+
+   * - Model
+     - Scale
+     - Best for
+   * - :class:`~ptgp.gp.Unapproximated`
+     - ``N < ~2,000``
+     - Exact inference, model comparison.
+   * - :class:`~ptgp.gp.VFE`
+     - ``N < ~50,000``
+     - Medium-scale data with inducing points.
+   * - :class:`~ptgp.gp.SVGP`
+     - ``N`` up to ``~500,000``
+     - Large data, non-Gaussian likelihoods, minibatch training.
+
+A fourth model, :class:`~ptgp.inducing_fourier.FourierFeatures1D`, supplies a
+structured ``K_uu`` for 1-D Matérn kernels via a Fourier basis and removes
+the need to place inducing points by hand.
diff --git a/docs/source/user_guide/training.rst b/docs/source/user_guide/training.rst
new file mode 100644
index 0000000..eb90684
--- /dev/null
+++ b/docs/source/user_guide/training.rst
@@ -0,0 +1,31 @@
+Training
+========
+
+.. note::
+
+   **WRITEME.** This page is a stub. Cover the two training paths
+   (``compile_training_step`` for Adam/SGD vs. ``compile_scipy_objective``
+   for L-BFGS-B), MAP vs. pure-ELBO, ``param_groups`` for per-parameter
+   learning rates, ``minimize_staged_vfe``, and ``compile_predict``.
+
+ptgp offers two training paths, each matched to a different regime:
+
+- :func:`~ptgp.optim.compile_training_step` — Adam / SGD via PyTensor
+  shared variables, for stochastic or minibatch training (SVGP).
+  Prediction reads the same shared variables.
+- :func:`~ptgp.optim.compile_scipy_objective` — returns a ``(loss, grad)``
+  callable for :func:`scipy.optimize.minimize`, for full-batch training
+  (Unapproximated, VFE).
+
+Priors regularize training by default: both compilers add
+``model.logp(jacobian=True, sum=True)`` to the loss, yielding MAP in the
+unconstrained value-var space. Pass ``include_prior=False`` for MLE / pure
+ELBO.
+
+Staged & tracked
+----------------
+
+- :func:`~ptgp.optim.minimize_staged_vfe` runs the VFE training schedule
+  in phases (kernel-only, then jointly with inducing points, etc.).
+- :func:`~ptgp.optim.tracked_minimize` wraps :func:`scipy.optimize.minimize`
+  with per-iteration diagnostics.
diff --git a/docs/sphinxext/generate_gallery.py b/docs/sphinxext/generate_gallery.py
new file mode 100644
index 0000000..4b1d181
--- /dev/null
+++ b/docs/sphinxext/generate_gallery.py
@@ -0,0 +1,274 @@
+"""
+Sphinx plugin that builds an example gallery from ``notebooks/``.
+
+Sources notebooks from the top-level ``notebooks/`` directory at the repo root
+and emits a grid-card gallery page at ``docs/source/examples/gallery.rst``.
+
+Categorization rule:
+  - Notebooks placed in subdirectories (``notebooks/<category>/foo.ipynb``)
+    are grouped under that subdirectory's title (looked up in
+    ``CATEGORY_TITLES``, with the folder name title-cased as fallback).
+  - Notebooks at the top level (``notebooks/foo.ipynb``) fall under
+    "Examples".
+
+For each notebook the script copies the file into
+``docs/source/examples/<category>/<name>.ipynb`` so MyST-NB picks it up, and
+extracts the last image output as a thumbnail under
+``docs/source/_thumbnails/<category>/<name>.png``. A user-supplied thumbnail
+at that path is left alone.
+
+Adapted from gEconpy, which adapted it from PyMC / seaborn / mpld3.
+"""
+
+import base64
+import json
+import shutil
+import subprocess
+
+from pathlib import Path
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import sphinx
+
+from matplotlib import image
+
+logger = sphinx.util.logging.getLogger(__name__)
+
+# Repo root: docs/sphinxext/generate_gallery.py -> repo
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+NOTEBOOKS_ROOT = REPO_ROOT / "notebooks"
+
+# Pretty titles for known subfolders. Anything not listed is title-cased.
+CATEGORY_TITLES = {
+    "examples": "Examples",
+    "introductory": "Introductory",
+    "advanced": "Advanced",
+    "case_study": "Case Studies",
+}
+
+DEFAULT_IMG_LOC = None
+
+TITLE = """
+Example Gallery
+===============
+"""
+
+TOCTREE_HEAD = """
+.. toctree::
+   :hidden:
+
+"""
+
+SECTION_TEMPLATE = """
+.. _gallery-{section_id}:
+
+{section_title}
+{underlines}
+
+.. grid:: 1 2 3 3
+   :gutter: 4
+
+"""
+
+ITEM_TEMPLATE = """
+   .. grid-item-card:: :doc:`{doc_name}`
+      :img-top: {image}
+      :link: {doc_reference}
+      :link-type: {link_type}
+      :shadow: none
+"""
+
+
+def is_tracked_by_git(filepath):
+    try:
+        result = subprocess.run(
+            ["git", "ls-files", "--error-unmatch", str(filepath)],
+            capture_output=True,
+            check=False,
+            cwd=REPO_ROOT,
+        )
+    except FileNotFoundError:
+        return True
+    else:
+        return result.returncode == 0
+
+
+def create_thumbnail(infile, width=275, height=275, cx=0.5, cy=0.5, border=4):
+    im = image.imread(infile)
+    rows, cols = im.shape[:2]
+    size = min(rows, cols)
+    if size == cols:
+        xslice = slice(0, size)
+        ymin = min(max(0, int(cy * rows - size // 2)), rows - size)
+        yslice = slice(ymin, ymin + size)
+    else:
+        yslice = slice(0, size)
+        xmin = min(max(0, int(cx * cols - size // 2)), cols - size)
+        xslice = slice(xmin, xmin + size)
+    thumb = im[yslice, xslice]
+    thumb[:border, :, :3] = thumb[-border:, :, :3] = 0
+    thumb[:, :border, :3] = thumb[:, -border:, :3] = 0
+
+    dpi = 100
+    fig = plt.figure(figsize=(width / dpi, height / dpi), dpi=dpi)
+    ax = fig.add_axes([0, 0, 1, 1], aspect="auto", frameon=False, xticks=[], yticks=[])
+    ax.imshow(thumb, aspect="auto", resample=True, interpolation="bilinear")
+    fig.savefig(infile, dpi=dpi)
+    plt.close(fig)
+    return fig
+
+
+class NotebookGenerator:
+    """Extract a thumbnail and stage a notebook for inclusion in the gallery."""
+
+    def __init__(self, src_nb: Path, category: str, examples_dir: Path, thumbnails_dir: Path):
+        self.src_nb = src_nb
+        self.stripped_name = src_nb.stem
+        self.category = category
+        self.staged_nb = examples_dir / category / f"{self.stripped_name}.ipynb"
+        self.png_path = thumbnails_dir / category / f"{self.stripped_name}.png"
+
+        with src_nb.open(encoding="utf-8") as fid:
+            self.json_source = json.load(fid)
+
+    def stage_notebook(self):
+        self.staged_nb.parent.mkdir(parents=True, exist_ok=True)
+        # Always re-copy: notebooks at the source can change between builds.
+        shutil.copyfile(self.src_nb, self.staged_nb)
+
+    def extract_preview_pic(self):
+        pic = None
+        for cell in self.json_source["cells"]:
+            for output in cell.get("outputs", []):
+                if "image/png" in output.get("data", []):
+                    pic = output["data"]["image/png"]
+        if pic is not None:
+            return base64.b64decode(pic)
+        return None
+
+    def gen_previews(self):
+        self.png_path.parent.mkdir(parents=True, exist_ok=True)
+        if self.png_path.exists():
+            logger.info(
+                f"Custom thumbnail already exists for {self.src_nb.name}, skipping extraction",
+                type="thumbnail_extractor",
+            )
+            return
+
+        preview = self.extract_preview_pic()
+        if preview is not None:
+            with self.png_path.open("wb") as buff:
+                buff.write(preview)
+            create_thumbnail(self.png_path)
+        else:
+            if DEFAULT_IMG_LOC is not None:
+                shutil.copy(DEFAULT_IMG_LOC, self.png_path)
+                create_thumbnail(self.png_path)
+            else:
+                logger.warning(
+                    f"No image found in {self.src_nb.name} and no default thumbnail set",
+                    type="thumbnail_extractor",
+                )
+
+
+def discover_notebooks():
+    """Group notebooks by category.
+
+    Returns dict mapping ``category -> list[Path]`` where category is the
+    immediate subfolder of ``notebooks/`` (or ``"examples"`` for top-level
+    notebooks).
+    """
+    if not NOTEBOOKS_ROOT.exists():
+        return {}
+
+    grouped: dict[str, list[Path]] = {}
+    for path in sorted(NOTEBOOKS_ROOT.rglob("*.ipynb")):
+        if ".ipynb_checkpoints" in path.parts:
+            continue
+        rel = path.relative_to(NOTEBOOKS_ROOT)
+        category = rel.parts[0] if len(rel.parts) > 1 else "examples"
+        grouped.setdefault(category, []).append(path)
+    return grouped
+
+
+def main(app):
+    logger.info("Starting ptgp example gallery generation.")
+
+    src_dir = Path(app.builder.srcdir)
+    examples_dir = src_dir / "examples"
+    thumbnails_dir = src_dir / "_thumbnails"
+    examples_dir.mkdir(parents=True, exist_ok=True)
+    thumbnails_dir.mkdir(parents=True, exist_ok=True)
+
+    grouped = discover_notebooks()
+
+    if not grouped:
+        logger.warning(
+            "No notebooks found under notebooks/; writing empty gallery.",
+            type="thumbnail_extractor",
+        )
+
+    toctree_entries: list[str] = []
+    section_lines: list[str] = []
+
+    for category in sorted(grouped):
+        nb_paths = grouped[category]
+        title = CATEGORY_TITLES.get(category, category.replace("_", " ").title())
+        section_lines.append(
+            SECTION_TEMPLATE.format(
+                section_title=title,
+                section_id=category,
+                underlines="-" * len(title),
+            )
+        )
+
+        for nb_path in nb_paths:
+            if not is_tracked_by_git(nb_path):
+                logger.info(
+                    f"Skipping {nb_path.name}, not tracked by git",
+                    type="thumbnail_extractor",
+                )
+                continue
+
+            nbg = NotebookGenerator(
+                src_nb=nb_path,
+                category=category,
+                examples_dir=examples_dir,
+                thumbnails_dir=thumbnails_dir,
+            )
+            nbg.stage_notebook()
+            nbg.gen_previews()
+
+            doc_name = f"{category}/{nbg.stripped_name}"
+            toctree_entries.append(doc_name)
+            # Path is relative to docs/source/ — the leading slash makes
+            # Sphinx resolve it from the source root, matching gEconpy's
+            # convention so users can drop in custom thumbnails too.
+            img_path = f"/_thumbnails/{category}/{nbg.stripped_name}.png"
+            section_lines.append(
+                ITEM_TEMPLATE.format(
+                    doc_name=doc_name,
+                    image=img_path,
+                    doc_reference=doc_name,
+                    link_type="doc",
+                )
+            )
+
+    # Assemble: title, hidden toctree (so notebooks register with Sphinx),
+    # then the visible grid-card sections.
+    file_lines = [TITLE, TOCTREE_HEAD]
+    file_lines.extend(f"   {entry}\n" for entry in toctree_entries)
+    file_lines.append("\n")
+    file_lines.extend(section_lines)
+
+    gallery_rst = examples_dir / "gallery.rst"
+    gallery_rst.write_text("\n".join(file_lines), encoding="utf-8")
+    logger.info(f"Wrote gallery to {gallery_rst.relative_to(src_dir)}")
+
+
+def setup(app):
+    app.connect("builder-inited", main)
+    return {"parallel_read_safe": True, "parallel_write_safe": True}
diff --git a/docs/sphinxext/generate_kernel_gallery.py b/docs/sphinxext/generate_kernel_gallery.py
new file mode 100644
index 0000000..88a53ef
--- /dev/null
+++ b/docs/sphinxext/generate_kernel_gallery.py
@@ -0,0 +1,412 @@
+"""
+Sphinx plugin that builds a covariance gallery for ptgp kernels.
+
+For each kernel in ``KERNEL_RECIPES``, renders a single cover image (GP prior
+samples for stationary kernels, ``K(X, X)`` heatmap for categorical ones) and
+emits a grid-card landing page at ``docs/source/kernels/gallery.rst``.
+
+This is the MVP: one panel per cover, no per-kernel pages, no live execution.
+Phase 2 adds per-kernel detail pages with math, parameter tables, and tabbed
+parameter sweeps. Phase 3 adds Distill-style interactivity.
+
+Inspired by preliz's distribution gallery
+(https://github.com/arviz-devs/preliz/blob/main/docs/get_cover_gallery.py).
+"""
+
+import logging
+
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+
+import matplotlib
+
+matplotlib.use("Agg")
+matplotlib.rcParams["font.family"] = "serif"
+import matplotlib.pyplot as plt
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+
+from ptgp.kernels import (
+    ExpQuad,
+    Linear,
+    Matern12,
+    Matern32,
+    Matern52,
+    RandomWalk,
+)
+
+# Use Sphinx's logger when running inside Sphinx; fall back to stdlib logging
+# so the cover script can be run standalone (smoke tests, regeneration).
+try:
+    import sphinx.util.logging as _sphinx_logging
+
+    logger = _sphinx_logging.getLogger(__name__)
+except ImportError:
+    logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CoverRecipe:
+    """How to instantiate and render one cover for a kernel."""
+
+    name: str
+    display_name: str
+    builder: Callable
+    mode: str  # "samples", "conditional", or "heatmap"
+    # Per-recipe observation override for "conditional" mode. None falls back
+    # to the module-level _OBS_X / _OBS_Y. RandomWalk needs positive inputs,
+    # so it provides its own.
+    obs_x: np.ndarray | None = None
+    obs_y: np.ndarray | None = None
+
+
+# -- Builders ----------------------------------------------------------------
+# Each builder returns (kernel_instance, X_np) ready to feed into the
+# rendering pipeline. Kernels parameterised by floats only — no PyMC RVs in
+# the gallery, since the cover is a single canonical instance per kernel.
+
+
+def _line(low=-3.0, high=3.0, n=200):
+    return np.linspace(low, high, n)[:, None]
+
+
+def _build_expquad():
+    return ExpQuad(input_dim=1, ls=1.0), _line()
+
+
+def _build_matern52():
+    return Matern52(input_dim=1, ls=1.0), _line()
+
+
+def _build_matern32():
+    return Matern32(input_dim=1, ls=1.0), _line()
+
+
+def _build_matern12():
+    return Matern12(input_dim=1, ls=1.0), _line()
+
+
+def _build_linear():
+    return Linear(input_dim=1, c=0.0), _line()
+
+
+def _build_random_walk():
+    # Wiener process requires positive inputs.
+    return RandomWalk(input_dim=1), _line(low=0.05, high=5.0, n=200)
+
+
+# Positive-domain obs for RandomWalk: two close-ish times then one farther.
+_RW_OBS_X = np.array([[0.6], [1.6], [4.0]])
+_RW_OBS_Y = np.array([0.45, 0.55, -1.10])
+
+
+KERNEL_RECIPES: list[CoverRecipe] = [
+    CoverRecipe("ExpQuad", "Exponentiated Quadratic", _build_expquad, "conditional"),
+    CoverRecipe("Matern52", "Matérn 5/2", _build_matern52, "conditional"),
+    CoverRecipe("Matern32", "Matérn 3/2", _build_matern32, "conditional"),
+    CoverRecipe("Matern12", "Matérn 1/2", _build_matern12, "conditional"),
+    CoverRecipe("Linear", "Linear", _build_linear, "conditional"),
+    CoverRecipe(
+        "RandomWalk",
+        "Random Walk",
+        _build_random_walk,
+        "conditional",
+        obs_x=_RW_OBS_X,
+        obs_y=_RW_OBS_Y,
+    ),
+]
+
+# Public kernels intentionally excluded from the gallery. Listed here so the
+# pre-commit ``check-kernel-gallery`` hook stops nagging about them, but new
+# kernels added to ``ptgp.kernels`` will still trigger the check until they
+# land in ``KERNEL_RECIPES`` or are added here.
+KERNEL_GALLERY_BLACKLIST: set[str] = {
+    "Kernel",  # abstract base class
+    "SumKernel",  # composition, not a "kernel" on its own
+    "ProductKernel",  # composition
+    "Gibbs",  # requires a user-supplied lengthscale function
+    "WarpedInput",  # requires a user-supplied warp function
+    "Overlap",  # categorical, needs a different render mode
+    "LowRankCategorical",  # categorical
+}
+
+
+# -- Rendering ---------------------------------------------------------------
+
+_N_SAMPLES = 3
+_JITTER = 1e-6
+_FIG_SIZE = (3.5, 2.3)
+_DPI = 144
+_RNG_SEED = 1
+
+# Cover styling: pale off-blue panel + thin dashed black grid + warm sample
+# palette. Line thickness + alpha cribbed from McElreath's
+# plot_predictive_covariance (alpha=0.5, linewidth=5-ish) so a small number
+# of bold draws layer instead of competing.
+_BG_COLOR = "#eff3f7"
+_GRID_COLOR = "black"
+_GRID_LW = 0.3
+_GRID_LS = "-"
+_SAMPLE_PALETTE = ["#e8543f", "#f3a712", "#9b4dca", "#1f4e79", "#2a9d8f"]
+_LINE_LW = 2.0
+_LINE_ALPHA = 0.8
+
+# Conditional ("McElreath") render: a few noisy observations with thick
+# popsicle errorbars and GP posterior draws threaded through them.
+_OBS_X = np.array([[-1.6], [-0.4], [1.8]])
+_OBS_Y = np.array([0.65, -0.35, -1.05])
+_OBS_SIGMA = 0.3
+_N_COND_DRAWS = 3
+# Popsicle in vivid red (the canonical McElreath "data" color); marker
+# outline in black so the dot pops against any palette color underneath.
+_OBS_COLOR = "#d11149"
+_MARKER_EDGE_COLOR = "black"
+_POPSICLE_LW = 8.0
+_POPSICLE_ALPHA = 0.6
+_MARKER_SIZE = 7.0
+_MARKER_EDGE_LW = 1.5
+
+
+def _compile_k(kernel, X_dtype):
+    X_sym = pt.matrix("_X_cover", shape=(None, 1), dtype=X_dtype)
+    K_sym = kernel(X_sym)
+    return pytensor.function([X_sym], K_sym)
+
+
+def _compile_kxy(kernel, X_dtype):
+    X_sym = pt.matrix("_X_a", shape=(None, 1), dtype=X_dtype)
+    Y_sym = pt.matrix("_X_b", shape=(None, 1), dtype=X_dtype)
+    return pytensor.function([X_sym, Y_sym], kernel(X_sym, Y_sym))
+
+
+def _render_samples(kernel, X_np, ax):
+    fn = _compile_k(kernel, X_dtype=str(X_np.dtype))
+    K = fn(X_np)
+    K = 0.5 * (K + K.T)
+    K += _JITTER * np.eye(K.shape[0])
+    L = np.linalg.cholesky(K)
+    rng = np.random.default_rng(_RNG_SEED)
+    samples = L @ rng.standard_normal((K.shape[0], _N_SAMPLES))
+    for i in range(_N_SAMPLES):
+        ax.plot(
+            X_np[:, 0],
+            samples[:, i],
+            lw=_LINE_LW,
+            alpha=_LINE_ALPHA,
+            color=_SAMPLE_PALETTE[i % len(_SAMPLE_PALETTE)],
+        )
+
+
+def _render_heatmap(kernel, X_np, ax):
+    fn = _compile_k(kernel, X_dtype=str(X_np.dtype))
+    K = fn(X_np)
+    ax.imshow(K, cmap="viridis", aspect="auto", interpolation="nearest")
+
+
+def _render_conditional(kernel, X_np, ax, obs_x=None, obs_y=None):
+    """McElreath-style: hollow-circle observations + popsicle noise bars + thin
+    GP posterior draws threaded through them.
+    """
+    obs_x = _OBS_X if obs_x is None else obs_x
+    obs_y = _OBS_Y if obs_y is None else obs_y
+    X_obs = obs_x.astype(X_np.dtype)
+    y_obs = obs_y
+
+    fn_oo = _compile_k(kernel, X_dtype=str(X_obs.dtype))
+    fn_xy = _compile_kxy(kernel, X_dtype=str(X_np.dtype))
+    fn_gg = _compile_k(kernel, X_dtype=str(X_np.dtype))
+
+    K_oo = fn_oo(X_obs) + (_OBS_SIGMA**2 + _JITTER) * np.eye(len(X_obs))
+    K_go = fn_xy(X_np, X_obs)
+    K_gg = fn_gg(X_np)
+
+    L = np.linalg.cholesky(K_oo)
+    alpha_vec = np.linalg.solve(L.T, np.linalg.solve(L, y_obs))
+    mu = K_go @ alpha_vec
+    v = np.linalg.solve(L, K_go.T)
+    cov = K_gg - v.T @ v
+    cov = 0.5 * (cov + cov.T) + _JITTER * np.eye(len(X_np))
+    L_post = np.linalg.cholesky(cov)
+
+    rng = np.random.default_rng(_RNG_SEED)
+    samples = mu[:, None] + L_post @ rng.standard_normal((len(X_np), _N_COND_DRAWS))
+
+    for i in range(samples.shape[1]):
+        ax.plot(
+            X_np[:, 0],
+            samples[:, i],
+            color=_SAMPLE_PALETTE[i % len(_SAMPLE_PALETTE)],
+            lw=_LINE_LW,
+            alpha=_LINE_ALPHA,
+            zorder=2,
+        )
+
+    container = ax.errorbar(
+        X_obs[:, 0],
+        y_obs,
+        yerr=_OBS_SIGMA,
+        fmt="none",
+        ecolor=_OBS_COLOR,
+        elinewidth=_POPSICLE_LW,
+        capsize=0,
+        zorder=4,
+    )
+    for bar_line in container[2]:
+        bar_line.set_capstyle("round")
+        bar_line.set_alpha(_POPSICLE_ALPHA)
+
+    ax.plot(
+        X_obs[:, 0],
+        y_obs,
+        "o",
+        mfc="none",
+        mec=_MARKER_EDGE_COLOR,
+        mew=_MARKER_EDGE_LW,
+        ms=_MARKER_SIZE,
+        zorder=5,
+    )
+
+
+def _render_cover(recipe: CoverRecipe, out_path: Path):
+    kernel, X_np = recipe.builder()
+    fig, ax = plt.subplots(figsize=_FIG_SIZE, dpi=_DPI)
+    paneled = recipe.mode in ("samples", "conditional")
+    if paneled:
+        ax.set_facecolor(_BG_COLOR)
+        ax.set_axisbelow(True)
+    if recipe.mode == "samples":
+        _render_samples(kernel, X_np, ax)
+    elif recipe.mode == "heatmap":
+        _render_heatmap(kernel, X_np, ax)
+    elif recipe.mode == "conditional":
+        _render_conditional(
+            kernel,
+            X_np,
+            ax,
+            obs_x=recipe.obs_x,
+            obs_y=recipe.obs_y,
+        )
+    else:
+        raise ValueError(f"Unknown render mode for {recipe.name}: {recipe.mode}")
+    if paneled:
+        # Grid lines draw at tick locations, so we keep the ticks but hide
+        # the tick marks and labels — the grid is the only thing visible.
+        ax.grid(True, color=_GRID_COLOR, lw=_GRID_LW, ls=_GRID_LS, zorder=0)
+        ax.tick_params(
+            left=False,
+            bottom=False,
+            labelleft=False,
+            labelbottom=False,
+        )
+    else:
+        ax.set_xticks([])
+        ax.set_yticks([])
+    for side in ("top", "right", "bottom", "left"):
+        ax.spines[side].set_visible(False)
+    fig.tight_layout(pad=0.2)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    fig.savefig(out_path, dpi=_DPI, facecolor="white")
+    plt.close(fig)
+
+
+# -- Gallery page ------------------------------------------------------------
+
+_GALLERY_TITLE = """
+Covariance Gallery
+==================
+
+A visual reference for the kernels shipped in :mod:`ptgp.kernels`. Each cover
+shows GP prior samples drawn with the kernel (or the ``K(X, X)`` matrix as a
+heatmap for categorical kernels), giving an immediate sense of the kind of
+functions the covariance produces.
+"""
+
+_TOCTREE_HEAD = """
+.. toctree::
+   :hidden:
+
+"""
+
+_GRID_HEAD = """
+.. grid:: 1 2 3 3
+   :gutter: 2 2 3 3
+
+"""
+
+_CARD_TEMPLATE_LINKED = """
+   .. grid-item-card::
+      :text-align: center
+      :shadow: none
+      :class-card: example-gallery
+      :link: gallery/{slug}
+      :link-type: doc
+
+      .. image:: img/{file_name}.png
+         :alt: {display_name}
+
+      +++
+      {display_name}
+"""
+
+_CARD_TEMPLATE_PLAIN = """
+   .. grid-item-card::
+      :text-align: center
+      :shadow: none
+      :class-card: example-gallery
+
+      .. image:: img/{file_name}.png
+         :alt: {display_name}
+
+      +++
+      {display_name}
+"""
+
+
+def main(app):
+    logger.info("Starting ptgp covariance gallery generation.")
+
+    src_dir = Path(app.builder.srcdir)
+    kernels_dir = src_dir / "kernels"
+    img_dir = kernels_dir / "img"
+    kernels_dir.mkdir(parents=True, exist_ok=True)
+    img_dir.mkdir(parents=True, exist_ok=True)
+
+    rendered: list[tuple[str, str, str, bool]] = []
+    for recipe in KERNEL_RECIPES:
+        try:
+            _render_cover(recipe, img_dir / f"{recipe.name}.png")
+        except Exception as exc:  # keep build going if one recipe is broken
+            logger.warning(
+                f"Failed to render cover for {recipe.name}: {exc}",
+                type="kernel_gallery",
+            )
+            continue
+        slug = recipe.name.lower()
+        page_exists = (kernels_dir / "gallery" / f"{slug}.md").exists()
+        rendered.append((recipe.name, recipe.display_name, slug, page_exists))
+
+    # Assemble: title, hidden toctree (only for pages that exist), grid header,
+    # cards (linked when the page exists, plain otherwise).
+    lines = [_GALLERY_TITLE, _TOCTREE_HEAD]
+    lines.extend(f"   gallery/{slug}\n" for _, _, slug, exists in rendered if exists)
+    lines.append(_GRID_HEAD)
+    for file_name, display_name, slug, page_exists in rendered:
+        template = _CARD_TEMPLATE_LINKED if page_exists else _CARD_TEMPLATE_PLAIN
+        lines.append(
+            template.format(
+                file_name=file_name,
+                display_name=display_name,
+                slug=slug,
+            )
+        )
+
+    gallery_rst = kernels_dir / "gallery.rst"
+    gallery_rst.write_text("\n".join(lines), encoding="utf-8")
+    logger.info(f"Wrote covariance gallery to {gallery_rst.relative_to(src_dir)}")
+
+
+def setup(app):
+    app.connect("builder-inited", main)
+    return {"parallel_read_safe": True, "parallel_write_safe": True}
diff --git a/ptgp/plotting.py b/ptgp/plotting.py
new file mode 100644
index 0000000..c6ed716
--- /dev/null
+++ b/ptgp/plotting.py
@@ -0,0 +1,277 @@
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+
+mpl.rcParams["font.family"] = "serif"
+
+_BG_COLOR = "#eff3f7"
+_GRID_COLOR = "black"
+_GRID_LW = 0.3
+_GRID_LS = "-"
+_SAMPLE_PALETTE = ["#e8543f", "#f3a712", "#9b4dca", "#1f4e79", "#2a9d8f"]
+_LINE_LW = 2.0
+_LINE_ALPHA = 0.8
+_PANEL_W = 3.5
+_PANEL_H = 2.6
+_DPI = 144
+
+_OBS_X = np.array([[-1.6], [-0.4], [1.8]])
+_OBS_Y = np.array([0.65, -0.35, -1.05])
+_OBS_SIGMA = 0.3
+_OBS_COLOR = "#d11149"
+_MARKER_EDGE_COLOR = "black"
+_POPSICLE_LW = 8.0
+_POPSICLE_ALPHA = 0.6
+_MARKER_SIZE = 7.0
+_MARKER_EDGE_LW = 1.5
+
+_JITTER = 1e-6
+
+
+def _compile_k(kernel, X_dtype="float64"):
+    _X = pt.matrix("_X_plot", shape=(None, 1), dtype=X_dtype)
+    return pytensor.function([_X], kernel(_X))
+
+
+def _compile_kxy(kernel, X_dtype="float64"):
+    _A = pt.matrix("_X_plot_a", shape=(None, 1), dtype=X_dtype)
+    _B = pt.matrix("_X_plot_b", shape=(None, 1), dtype=X_dtype)
+    return pytensor.function([_A, _B], kernel(_A, _B))
+
+
+def _style_ax(ax):
+    ax.set_facecolor(_BG_COLOR)
+    ax.set_axisbelow(True)
+    ax.grid(True, color=_GRID_COLOR, lw=_GRID_LW, ls=_GRID_LS, zorder=0)
+    ax.tick_params(left=False, bottom=False, labelleft=True, labelbottom=True)
+    for side in ("top", "right", "bottom", "left"):
+        ax.spines[side].set_visible(False)
+
+
+def _make_axes(n_panels):
+    fig, axes = plt.subplots(
+        1,
+        n_panels,
+        figsize=(_PANEL_W * n_panels, _PANEL_H),
+        dpi=_DPI,
+        constrained_layout=True,
+    )
+    return fig, np.atleast_1d(axes)
+
+
+def plot_kernel_decay(
+    kernels_and_labels,
+    max_distance=5.0,
+    n_x=200,
+):
+    """Plot :math:`k(0, x)` against distance for a sequence of kernels.
+
+    All curves share a single axes so the user can read off how covariance
+    falls off relative to each other for the chosen parameter sweep. Inputs
+    are 1-D and the kernel is evaluated at ``x = 0`` vs. a grid in
+    ``[0, max_distance]``.
+
+    Parameters
+    ----------
+    kernels_and_labels : list of (str, Kernel)
+        Pairs of label and kernel instance. Each pair becomes one overlaid
+        curve.
+    max_distance : float
+        Upper bound of the distance grid. Default 5.
+    n_x : int
+        Number of grid points. Default 200.
+
+    Returns
+    -------
+    None
+        Plots into a fresh figure on the current matplotlib backend; in
+        notebooks the figure renders inline.
+    """
+    fig, ax = plt.subplots(
+        figsize=(_PANEL_W * 2, _PANEL_H),
+        dpi=_DPI,
+        constrained_layout=True,
+    )
+    grid = np.linspace(0.0, max_distance, n_x)[:, None]
+    origin = np.zeros((1, 1))
+    for i, (label, kernel) in enumerate(kernels_and_labels):
+        K = _compile_kxy(kernel)(origin, grid)
+        ax.plot(
+            grid[:, 0],
+            K[0, :],
+            lw=_LINE_LW,
+            alpha=_LINE_ALPHA,
+            color=_SAMPLE_PALETTE[i % len(_SAMPLE_PALETTE)],
+            label=label,
+        )
+    ax.set_xlim(0, max_distance)
+    ax.set_ylim(-0.05, 1.05)
+    ax.set_xlabel(r"$\Vert x - x' \Vert$")
+    ax.set_ylabel(r"$k(x, x')$")
+    ax.legend(loc="upper right", frameon=False)
+    ax.set_facecolor(_BG_COLOR)
+    ax.set_axisbelow(True)
+    ax.grid(True, color=_GRID_COLOR, lw=_GRID_LW, ls=_GRID_LS, zorder=0)
+    for side in ("top", "right"):
+        ax.spines[side].set_visible(False)
+
+
+def plot_prior_samples(
+    kernels_and_labels,
+    n_samples=4,
+    x_range=(-3.0, 3.0),
+    y_range=(-3.0, 3.0),
+    n_x=200,
+    seed=1,
+):
+    """Plot GP prior samples for a sequence of kernels as side-by-side panels.
+
+    Parameters
+    ----------
+    kernels_and_labels : list of (str, Kernel)
+        Pairs of label and kernel instance. Each pair becomes one panel.
+    n_samples : int
+        Number of sample functions drawn per panel. Default 4.
+    x_range : tuple of float
+        ``(low, high)`` range for the evaluation grid. Default ``(-3, 3)``.
+    y_range : tuple of float
+        ``(low, high)`` fixed y-axis limits. Default ``(-3, 3)``.
+    n_x : int
+        Number of grid points per panel. Default 200.
+    seed : int
+        RNG seed for sample reproducibility. Default 1.
+
+    Returns
+    -------
+    None
+        Plots into a fresh figure on the current matplotlib backend; in
+        notebooks the figure renders inline.
+    """
+    fig, axes = _make_axes(len(kernels_and_labels))
+    X_np = np.linspace(x_range[0], x_range[1], n_x)[:, None]
+    rng = np.random.default_rng(seed)
+    for i_ax, (ax, (label, kernel)) in enumerate(zip(axes, kernels_and_labels)):
+        K = _compile_k(kernel)(X_np)
+        K = 0.5 * (K + K.T) + _JITTER * np.eye(n_x)
+        L = np.linalg.cholesky(K)
+        samples = L @ rng.standard_normal((n_x, n_samples))
+        for i in range(n_samples):
+            ax.plot(
+                X_np[:, 0],
+                samples[:, i],
+                lw=_LINE_LW,
+                alpha=_LINE_ALPHA,
+                color=_SAMPLE_PALETTE[i % len(_SAMPLE_PALETTE)],
+            )
+        ax.set_title(label, fontsize=11)
+        _style_ax(ax)
+        ax.set_xlim(x_range)
+        ax.set_ylim(y_range, auto=False)
+        if i_ax > 0:
+            ax.tick_params(labelleft=False)
+
+def plot_conditional(
+    kernels_and_labels,
+    n_draws=3,
+    x_range=(-3.0, 3.0),
+    y_range=(-3.0, 3.0),
+    n_x=200,
+    obs_x=None,
+    obs_y=None,
+    sigma=_OBS_SIGMA,
+    seed=1,
+):
+    """Plot GP posterior conditional draws given a few noisy observations.
+
+    Draws ``n_draws`` posterior samples per kernel and overlays the
+    observations as hollow markers with thick low-alpha popsicle error bars
+    indicating the observation noise scale.
+
+    Parameters
+    ----------
+    kernels_and_labels : list of (str, Kernel)
+        Pairs of label and kernel instance. Each pair becomes one panel.
+    n_draws : int
+        Number of posterior sample functions drawn per panel. Default 3.
+    x_range : tuple of float
+        ``(low, high)`` range for the evaluation grid. Default ``(-3, 3)``.
+    y_range : tuple of float
+        ``(low, high)`` fixed y-axis limits. Default ``(-3, 3)``.
+    n_x : int
+        Number of grid points per panel. Default 200.
+    obs_x : ndarray, optional
+        Observation locations, shape ``(N_obs, 1)``. Defaults to three points
+        spanning the canonical range.
+    obs_y : ndarray, optional
+        Observed values at ``obs_x``, shape ``(N_obs,)``. Defaults match the
+        canonical ``obs_x``.
+    sigma : float
+        Observation noise standard deviation. Default 0.3.
+    seed : int
+        RNG seed for sample reproducibility. Default 1.
+
+    Returns
+    -------
+    None
+        Plots into a fresh figure on the current matplotlib backend; in
+        notebooks the figure renders inline.
+    """
+    obs_x = _OBS_X if obs_x is None else obs_x
+    obs_y = _OBS_Y if obs_y is None else obs_y
+    fig, axes = _make_axes(len(kernels_and_labels))
+    X_np = np.linspace(x_range[0], x_range[1], n_x)[:, None]
+    rng = np.random.default_rng(seed)
+    for i_ax, (ax, (label, kernel)) in enumerate(zip(axes, kernels_and_labels)):
+        fn_k = _compile_k(kernel)
+        fn_kxy = _compile_kxy(kernel)
+        K_oo = fn_k(obs_x) + (sigma**2 + _JITTER) * np.eye(len(obs_x))
+        K_go = fn_kxy(X_np, obs_x)
+        K_gg = fn_k(X_np)
+        L = np.linalg.cholesky(K_oo)
+        alpha_vec = np.linalg.solve(L.T, np.linalg.solve(L, obs_y))
+        mu = K_go @ alpha_vec
+        v = np.linalg.solve(L, K_go.T)
+        cov = K_gg - v.T @ v
+        cov = 0.5 * (cov + cov.T) + _JITTER * np.eye(n_x)
+        L_post = np.linalg.cholesky(cov)
+        draws = mu[:, None] + L_post @ rng.standard_normal((n_x, n_draws))
+        for i in range(n_draws):
+            ax.plot(
+                X_np[:, 0],
+                draws[:, i],
+                lw=_LINE_LW,
+                alpha=_LINE_ALPHA,
+                color=_SAMPLE_PALETTE[i % len(_SAMPLE_PALETTE)],
+                zorder=2,
+            )
+        container = ax.errorbar(
+            obs_x[:, 0],
+            obs_y,
+            yerr=sigma,
+            fmt="none",
+            ecolor=_OBS_COLOR,
+            elinewidth=_POPSICLE_LW,
+            capsize=0,
+            zorder=4,
+        )
+        for bar_line in container[2]:
+            bar_line.set_capstyle("round")
+            bar_line.set_alpha(_POPSICLE_ALPHA)
+        ax.plot(
+            obs_x[:, 0],
+            obs_y,
+            "o",
+            mfc="none",
+            mec=_MARKER_EDGE_COLOR,
+            mew=_MARKER_EDGE_LW,
+            ms=_MARKER_SIZE,
+            zorder=5,
+        )
+        ax.set_title(label, fontsize=11)
+        _style_ax(ax)
+        ax.set_xlim(x_range)
+        ax.set_ylim(y_range, auto=False)
+        if i_ax > 0:
+            ax.tick_params(labelleft=False)
diff --git a/ptgp/utils.py b/ptgp/utils.py
index d0dfc24..4388e70 100644
--- a/ptgp/utils.py
+++ b/ptgp/utils.py
@@ -70,8 +70,8 @@ def check_init(
     """Evaluate the compiled objective at theta0 and report whether the result is finite.
 
     Prints a one-line summary for the loss and grad, then lists the top ``top_k``
-    largest |grad| components annotated with their variable names (when ``model``
-    is supplied).
+    largest ``|grad|`` components annotated with their variable names (when
+    ``model`` is supplied).
 
     Parameters
     ----------
@@ -94,7 +94,7 @@ def check_init(
         Initial values for ``extra_vars``.  Required for labelling when
         ``extra_vars`` is provided.
     top_k : int
-        Number of largest |grad| components to print.  Default 10.
+        Number of largest ``|grad|`` components to print.  Default 10.
 
     Returns
     -------
diff --git a/scripts/check_kernel_gallery.py b/scripts/check_kernel_gallery.py
new file mode 100644
index 0000000..3736aba
--- /dev/null
+++ b/scripts/check_kernel_gallery.py
@@ -0,0 +1,98 @@
+"""Fail if any public kernel is neither in the gallery nor blacklisted.
+
+Run by the ``check-kernel-gallery`` pre-commit hook whenever ``ptgp.kernels``
+or the gallery module changes. To resolve a failure, either add the kernel to
+``KERNEL_RECIPES`` in ``docs/sphinxext/generate_kernel_gallery.py`` or list
+it in ``KERNEL_GALLERY_BLACKLIST`` in the same file.
+
+Uses stdlib-only AST parsing so the hook runs in pre-commit's minimal
+``language: system`` env (no numpy, pytensor, or ptgp install required).
+"""
+
+import ast
+import sys
+
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent.parent
+KERNELS_INIT = ROOT / "ptgp" / "kernels" / "__init__.py"
+GALLERY = ROOT / "docs" / "sphinxext" / "generate_kernel_gallery.py"
+
+
+def _parse(path: Path) -> ast.Module:
+    return ast.parse(path.read_text(encoding="utf-8"), filename=str(path))
+
+
+def _find_assign(tree: ast.Module, name: str) -> ast.expr | None:
+    """Return the RHS of the module-level assignment ``name = ...``."""
+    for node in tree.body:
+        if isinstance(node, ast.Assign) and any(
+            isinstance(t, ast.Name) and t.id == name for t in node.targets
+        ):
+            return node.value
+        if (
+            isinstance(node, ast.AnnAssign)
+            and isinstance(node.target, ast.Name)
+            and node.target.id == name
+            and node.value is not None
+        ):
+            return node.value
+    return None
+
+
+def _string_elts(expr: ast.expr | None, source: Path, name: str) -> set[str]:
+    if expr is None or not isinstance(expr, ast.List | ast.Set | ast.Tuple):
+        raise RuntimeError(f"could not find list/set literal {name} in {source}")
+    out: set[str] = set()
+    for el in expr.elts:
+        if not isinstance(el, ast.Constant) or not isinstance(el.value, str):
+            raise RuntimeError(f"non-string entry in {name} in {source}: {ast.dump(el)}")
+        out.add(el.value)
+    return out
+
+
+def _recipe_names(tree: ast.Module) -> set[str]:
+    """Collect the first positional arg of every ``CoverRecipe(...)`` call."""
+    names: set[str] = set()
+    for node in ast.walk(tree):
+        if (
+            isinstance(node, ast.Call)
+            and isinstance(node.func, ast.Name)
+            and node.func.id == "CoverRecipe"
+            and node.args
+            and isinstance(node.args[0], ast.Constant)
+            and isinstance(node.args[0].value, str)
+        ):
+            names.add(node.args[0].value)
+    return names
+
+
+def main() -> int:
+    kernels_tree = _parse(KERNELS_INIT)
+    gallery_tree = _parse(GALLERY)
+
+    public = _string_elts(_find_assign(kernels_tree, "__all__"), KERNELS_INIT, "__all__")
+    blacklist = _string_elts(
+        _find_assign(gallery_tree, "KERNEL_GALLERY_BLACKLIST"),
+        GALLERY,
+        "KERNEL_GALLERY_BLACKLIST",
+    )
+    recipes = _recipe_names(gallery_tree)
+
+    missing = sorted(public - recipes - blacklist)
+    if not missing:
+        return 0
+
+    print(
+        "Public kernels missing from the covariance gallery:\n"
+        + "\n".join(f"  - {name}" for name in missing)
+        + "\n\nAdd each to KERNEL_RECIPES in docs/sphinxext/generate_kernel_gallery.py"
+        " (with a cover-rendering recipe) or to KERNEL_GALLERY_BLACKLIST in the same"
+        " file if it should be excluded.",
+        file=sys.stderr,
+    )
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())