From e680439fc5288327786a1eb33528118858e2e0ef Mon Sep 17 00:00:00 2001
From: jessegrabowski <jessegrabowski@gmail.com>
Date: Thu, 28 May 2026 16:48:55 -0500
Subject: [PATCH 1/8] Add clone_replace_data mechanism and re-root Gaussian
 sigma

---
 ptgp/gp/svgp.py                       |  2 +-
 ptgp/gp/unapproximated.py             |  3 +-
 ptgp/gp/vfe.py                        |  3 +-
 ptgp/likelihoods/base.py              | 71 ++++++++++++++++++++++++
 ptgp/likelihoods/gaussian.py          | 30 ++--------
 tests/likelihoods/test_base.py        | 79 ++++++++++++++++++++++++++-
 tests/test_heteroskedastic_predict.py | 48 ++++++++--------
 7 files changed, 182 insertions(+), 54 deletions(-)

diff --git a/ptgp/gp/svgp.py b/ptgp/gp/svgp.py
index cdfa52b..a4f34b0 100644
--- a/ptgp/gp/svgp.py
+++ b/ptgp/gp/svgp.py
@@ -230,7 +230,7 @@ def predict_marginal(self, X, incl_lik=False):
             fmean, fvar = conditional_unwhitened(A, Kmn, Knn_diag, self.q_mu, self.q_sqrt)
         fmean = fmean + self.mean(X)
         if incl_lik:
-            return self.likelihood.predict_mean_and_var(fmean, fvar)
+            return self.likelihood.clone_replace_data(X).predict_mean_and_var(fmean, fvar)
         return fmean, fvar
 
     def predict_joint(self, X):
diff --git a/ptgp/gp/unapproximated.py b/ptgp/gp/unapproximated.py
index 0b37af0..795998f 100644
--- a/ptgp/gp/unapproximated.py
+++ b/ptgp/gp/unapproximated.py
@@ -64,6 +64,5 @@ def predict_marginal(self, X_new, X_train, y_train, incl_lik=False):
         fvar = Kss_diag - pt.sum(Kns * (Knn_inv @ Kns), axis=0)
 
         if incl_lik:
-            sigma_new = self.likelihood.sigma_at(X_train, X_new)
-            return fmean, fvar + sigma_new**2
+            return self.likelihood.clone_replace_data(X_new).predict_mean_and_var(fmean, fvar)
         return fmean, fvar
diff --git a/ptgp/gp/vfe.py b/ptgp/gp/vfe.py
index 3c2550a..11c5fa6 100644
--- a/ptgp/gp/vfe.py
+++ b/ptgp/gp/vfe.py
@@ -87,6 +87,5 @@ def predict_marginal(self, X_new, X_train, y_train, incl_lik=False):
         fvar = Kss_diag - pt.sum(Kus * ((Kuu_inv - Sigma_inv) @ Kus), axis=0)
 
         if incl_lik:
-            sigma_new = self.likelihood.sigma_at(X_train, X_new)
-            return fmean, fvar + sigma_new**2
+            return self.likelihood.clone_replace_data(X_new).predict_mean_and_var(fmean, fvar)
         return fmean, fvar
diff --git a/ptgp/likelihoods/base.py b/ptgp/likelihoods/base.py
index 24771d6..6669438 100644
--- a/ptgp/likelihoods/base.py
+++ b/ptgp/likelihoods/base.py
@@ -1,6 +1,63 @@
+import copy
+
 import numpy as np
 import pytensor.tensor as pt
 
+from pytensor.graph.replace import clone_replace
+from pytensor.graph.traversal import ancestors
+from pytensor.tensor.type import TensorType
+
+
+def _data_inputs(value, X):
+    """Leaf nodes in ``value``'s graph that look like the design matrix ``X``.
+
+    A leaf (no owner) with the same ndim as ``X`` and matching feature
+    dimensions is taken to be a copy of the model input — whether it is a
+    symbolic placeholder, a ``pm.Data`` / shared variable, or a constant array
+    baked in via ``pt.as_tensor_variable``. Matching is by type, so ``pm.Data``
+    (a SharedVariable whose type is ``TensorType``) is caught. PyMC random
+    variables and other computed nodes carry an owner and are excluded; scalars
+    never match, so literal constants like ``0.1`` are left alone.
+    """
+    feat = X.type.shape[1:]
+    return [
+        v
+        for v in ancestors([value])
+        if v.owner is None
+        and isinstance(v.type, TensorType)
+        and v.type.ndim == X.type.ndim
+        and v.type.ndim > 0
+        and all(a is None or b is None or a == b for a, b in zip(v.type.shape[1:], feat))
+    ]
+
+
+def _reroot(value, X):
+    """Re-root ``value`` onto ``X`` by replacing design-matrix-shaped leaves.
+
+    The input(s) to replace are discovered from the graph, so the caller only
+    supplies the new data. ``rebuild_strict=False`` lets a baked constant array
+    (whose row dimension is statically frozen) be replaced by ``X`` without
+    coercing ``X`` back to that frozen shape. A data-independent ``value``
+    (scalar or PyMC RV) is returned unchanged.
+
+    ``X`` must be a 2D design matrix ``(N, D)``. The discovery in
+    :func:`_data_inputs` keys on dimensionality: parameters are per-observation
+    1D vectors, so a 2D ``X`` is what distinguishes the data from the parameter
+    itself. A 1D ``X`` would collapse that gap and could match the parameter, so
+    it is rejected.
+    """
+    if X.type.ndim < 2:
+        raise ValueError(
+            "Re-rooting a likelihood parameter requires a 2D design matrix (N, D); "
+            f"got X with ndim={X.type.ndim}. Parameters are per-observation 1D "
+            "vectors, so a 1D X cannot be distinguished from the parameter itself."
+        )
+    value = pt.as_tensor_variable(value)
+    inputs = _data_inputs(value, X)
+    if not inputs:
+        return value
+    return clone_replace(value, {v: X for v in inputs}, rebuild_strict=False)
+
 
 class Likelihood:
     """Base class for all PTGP likelihoods.
@@ -23,6 +80,20 @@ class Likelihood:
 
     n_points: int = 20
     invlink = None
+    param_names: tuple = ()
+
+    def clone_replace_data(self, X):
+        """Return a copy with every data-dependent parameter re-rooted onto X.
+
+        Each parameter named in ``param_names`` whose graph depends on a free
+        data input is re-expressed over ``X``; data-independent parameters are
+        left untouched. Used at predict time to evaluate parameters at the test
+        inputs rather than the training inputs they were built against.
+        """
+        new = copy.copy(self)
+        for name in self.param_names:
+            setattr(new, name, _reroot(getattr(self, name), X))
+        return new
 
     def _log_prob(self, f, y):
         """Symbolic log p(y|f). Subclasses must implement."""
diff --git a/ptgp/likelihoods/gaussian.py b/ptgp/likelihoods/gaussian.py
index ef2cb8a..9b1964a 100644
--- a/ptgp/likelihoods/gaussian.py
+++ b/ptgp/likelihoods/gaussian.py
@@ -1,10 +1,5 @@
 import pytensor.tensor as pt
 
-from pytensor.graph.basic import Constant
-from pytensor.graph.replace import graph_replace
-from pytensor.graph.traversal import ancestors
-from pytensor.tensor.type import TensorType
-
 from ptgp.likelihoods.base import Likelihood
 
 LOG2PI = pt.log(2.0 * pt.pi)
@@ -20,32 +15,15 @@ class Gaussian(Likelihood):
     sigma : tensor
         Observation noise standard deviation. May be a scalar (homoskedastic)
         or a vector built against a symbolic ``X`` (heteroskedastic). At
-        predict time, :meth:`sigma_at` substitutes the training ``X`` for
-        ``X_new`` in sigma's graph; mismatches surface as a strict-mode
-        ``graph_replace`` error.
+        predict time, :meth:`Likelihood.clone_replace_data` re-roots sigma's graph onto
+        the test inputs.
     """
 
+    param_names = ("sigma",)
+
     def __init__(self, sigma):
         self.sigma = pt.as_tensor_variable(sigma)
 
-    def sigma_at(self, X_train, X_new):
-        """Return ``sigma`` with ``X_train`` substituted by ``X_new``.
-
-        Sigma is treated as data-dependent iff it has any free symbolic
-        tensor input in its graph (covers both raw ``pt.matrix`` X and
-        ``pm.Data`` X — the latter is a SharedVariable but its type is
-        ``TensorType``). For data-independent sigma the call is a no-op.
-        For data-dependent sigma, strict-mode ``graph_replace`` raises a
-        clear error if ``X_train`` is not in sigma's graph.
-        """
-        has_data_dep = any(
-            v.owner is None and not isinstance(v, Constant) and isinstance(v.type, TensorType)
-            for v in ancestors([self.sigma])
-        )
-        if not has_data_dep:
-            return self.sigma
-        return graph_replace(self.sigma, {X_train: X_new})
-
     def _log_prob(self, f, y):
         return -0.5 * (LOG2PI + pt.log(self.sigma**2) + (y - f) ** 2 / self.sigma**2)
 
diff --git a/tests/likelihoods/test_base.py b/tests/likelihoods/test_base.py
index d87edea..9d2ab4f 100644
--- a/tests/likelihoods/test_base.py
+++ b/tests/likelihoods/test_base.py
@@ -1,10 +1,12 @@
 """Tests for the likelihood base class — configurable inverse-link behavior."""
 
 import numpy as np
+import pymc as pm
 import pytensor
 import pytensor.tensor as pt
+import pytest
 
-from ptgp.likelihoods import Bernoulli, Poisson
+from ptgp.likelihoods import Bernoulli, Gaussian, Poisson
 
 
 def _eval(*tensors):
@@ -12,6 +14,18 @@ def _eval(*tensors):
     return f()
 
 
+def _hetero(X):
+    return 0.5 + 0.1 * X[:, 0] ** 2
+
+
+def _provider(arr, kind):
+    if kind == "symbolic":
+        return pt.matrix("Xtr", shape=(None, arr.shape[1]))
+    if kind == "baked":
+        return pt.as_tensor_variable(arr)
+    return pm.Data("Xtr", arr)
+
+
 class TestConfigurableLink:
     def test_bernoulli_logit_link(self):
         """Bernoulli with logit link should differ from probit but still be valid."""
@@ -46,3 +60,66 @@ def softplus(f):
             )
         )
         assert np.isfinite(ve).all()
+
+
+class TestCloneReplaceData:
+    @pytest.mark.parametrize("kind", ["symbolic", "baked", "pm_data"])
+    def test_reroot_matches_direct_build(self, kind):
+        """Re-rooting yields the parameter evaluated at the new inputs, however
+        the training data was supplied. ``train`` and ``new`` differ in length so
+        a baked array's statically frozen row dimension cannot be re-imposed."""
+        train = np.linspace(-1.0, 1.0, 5)[:, None]
+        new = np.array([[0.0], [1.0], [2.0]])
+        X_new = pt.matrix("X_new", shape=(None, 1))
+        with pm.Model():
+            rerooted = Gaussian(_hetero(_provider(train, kind))).clone_replace_data(X_new).sigma
+        got = pytensor.function([X_new], rerooted)(new)
+        np.testing.assert_allclose(got, _hetero(new))
+
+    def test_reroot_matches_multidim_input(self):
+        new = np.array([[0.0, 1.0], [2.0, 3.0]])
+        X = pt.matrix("X", shape=(None, 2))
+        X_new = pt.matrix("X_new", shape=(None, 2))
+        rerooted = Gaussian(0.1 + pt.sum(X**2, axis=1)).clone_replace_data(X_new).sigma
+        got = pytensor.function([X_new], rerooted)(new)
+        np.testing.assert_allclose(got, 0.1 + np.sum(new**2, axis=1))
+
+    def test_returns_copy_leaving_original_untouched(self):
+        X = pt.matrix("X", shape=(None, 1))
+        lik = Gaussian(_hetero(X))
+        original = lik.sigma
+        out = lik.clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
+        assert lik.sigma is original
+        assert out.sigma is not original
+
+    def test_scalar_hyperparam_survives_reroot(self):
+        """A parameter mixing data with a scalar placeholder re-roots only the
+        data; the scalar still flows through to the result."""
+        X = pt.matrix("X", shape=(None, 1))
+        hyp = pt.scalar("hyp")
+        new = np.array([[0.0], [1.0], [2.0]])
+        X_new = pt.matrix("X_new", shape=(None, 1))
+        out = Gaussian(hyp + 0.05 * X[:, 0] ** 2).clone_replace_data(X_new)
+        got = pytensor.function([X_new, hyp], out.sigma)(new, 2.0)
+        np.testing.assert_allclose(got, 2.0 + 0.05 * new[:, 0] ** 2)
+
+    def test_eager_numpy_parameter_is_not_rerooted(self):
+        """Pure-numpy arithmetic collapses the data dependence to a fixed vector
+        before pytensor sees it, leaving nothing to re-root."""
+        arr = np.linspace(-1.0, 1.0, 5)[:, None]
+        sigma = pt.as_tensor_variable(0.1 + 0.05 * arr[:, 0] ** 2)
+        out = Gaussian(sigma).clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
+        assert out.sigma is sigma
+
+    def test_mismatched_feature_count_is_not_rerooted(self):
+        sigma = 0.1 + pt.sum(pt.matrix("Xtr", shape=(None, 3)) ** 2, axis=1)
+        out = Gaussian(sigma).clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
+        assert out.sigma is sigma
+
+    def test_one_dimensional_X_is_rejected(self):
+        """A 1D X is indistinguishable from a per-observation parameter, so
+        re-rooting against it would risk replacing the parameter itself."""
+        X = pt.matrix("X", shape=(None, 1))
+        lik = Gaussian(0.1 + 0.05 * X[:, 0] ** 2)
+        with pytest.raises(ValueError, match=r"requires a 2D design matrix"):
+            lik.clone_replace_data(pt.vector("X_new", shape=(None,)))
diff --git a/tests/test_heteroskedastic_predict.py b/tests/test_heteroskedastic_predict.py
index d6d407b..75be4aa 100644
--- a/tests/test_heteroskedastic_predict.py
+++ b/tests/test_heteroskedastic_predict.py
@@ -2,11 +2,11 @@
 import pymc as pm
 import pytensor
 import pytensor.tensor as pt
-import pytest
 
-from ptgp.gp import VFE, Unapproximated
+from ptgp.gp import SVGP, VFE, Unapproximated, init_variational_params
 from ptgp.inducing import Points
 from ptgp.kernels import ExpQuad
+from ptgp.likelihoods import Gaussian
 from ptgp.mean import Zero
 from ptgp.objectives import collapsed_elbo
 
@@ -47,18 +47,6 @@ def test_incl_lik_adds_pointwise_noise(self):
         expected_sigma_new = 0.1 + 0.05 * X_new[:, 0] ** 2
         np.testing.assert_allclose(yv, v + expected_sigma_new**2, atol=1e-10)
 
-    def test_wrong_X_raises(self):
-        """sigma built against X_a, predict called with X_b — ancestor check fires."""
-        X_a = pt.matrix("X_a", shape=(None, 1))
-        X_b = pt.matrix("X_b", shape=(None, 1))
-        gp = Unapproximated(
-            kernel=ExpQuad(input_dim=1, ls=1.0), mean=Zero(), sigma=_hetero_sigma(X_a)
-        )
-        X_new_t = pt.matrix("X_new", shape=(None, 1))
-        y_t = pt.vector("y", shape=(None,))
-        with pytest.raises(ValueError, match=r"Some replacements were not used"):
-            gp.predict_marginal(X_new_t, X_b, y_t, incl_lik=True)
-
 
 class TestVFEHeteroskedastic:
     def _build(self, X):
@@ -86,14 +74,30 @@ def test_incl_lik_adds_pointwise_noise(self):
         expected_sigma_new = 0.1 + 0.05 * X_new[:, 0] ** 2
         np.testing.assert_allclose(yv, v + expected_sigma_new**2, atol=1e-10)
 
-    def test_wrong_X_raises(self):
-        X_a = pt.matrix("X_a", shape=(None, 1))
-        X_b = pt.matrix("X_b", shape=(None, 1))
-        vfe = self._build(X_a)
+
+class TestSVGPHeteroskedastic:
+    def test_incl_lik_adds_pointwise_noise(self):
+        X_train, y_train, X_new = _data()
+        M = 6
+        Z = np.linspace(-1.5, 1.5, M)[:, None].astype(np.float64)
+        vp = init_variational_params(M)
+        X = pt.matrix("X", shape=(None, 1))
+        svgp = SVGP(
+            kernel=ExpQuad(input_dim=1, ls=1.0),
+            mean=Zero(),
+            likelihood=Gaussian(_hetero_sigma(X)),
+            inducing_variable=Points(pt.as_tensor_variable(Z)),
+            variational_params=vp,
+        )
         X_new_t = pt.matrix("X_new", shape=(None, 1))
-        y_t = pt.vector("y", shape=(None,))
-        with pytest.raises(ValueError, match=r"Some replacements were not used"):
-            vfe.predict_marginal(X_new_t, X_b, y_t, incl_lik=True)
+
+        _, fvar = svgp.predict_marginal(X_new_t)
+        _, yvar = svgp.predict_marginal(X_new_t, incl_lik=True)
+        fn = pytensor.function([X_new_t, *vp.extra_vars], [fvar, yvar], on_unused_input="ignore")
+        v, yv = fn(X_new, *vp.extra_init)
+
+        expected_sigma_new = 0.1 + 0.05 * X_new[:, 0] ** 2
+        np.testing.assert_allclose(yv - v, expected_sigma_new**2, atol=1e-10)
 
 
 class TestScalarSigmaUnaffected:
@@ -148,7 +152,7 @@ def test_pm_data_sigma_is_detected_as_data_dependent():
     """sigma built against a pm.Data SharedVariable must still be treated
     as data-dependent. The heuristic filters by type (TensorType), not by
     class (SharedVariable) — without this, pm.Data is silently invisible
-    to sigma_at and predictions use the wrong noise.
+    to clone_replace_data and predictions use the wrong noise.
     """
     X_train_arr = np.linspace(-1.5, 1.5, 10)[:, None].astype(np.float64)
     X_new_arr = np.linspace(-2.0, 2.0, 5)[:, None].astype(np.float64)

From cce4c9272ab3b4ad6c52ab98504fc24aa54a1223 Mon Sep 17 00:00:00 2001
From: jessegrabowski <jessegrabowski@gmail.com>
Date: Thu, 28 May 2026 16:49:34 -0500
Subject: [PATCH 2/8] Extend parameter re-rooting to StudentT and
 NegativeBinomial

---
 ptgp/likelihoods/negative_binomial.py |  4 +++-
 ptgp/likelihoods/student_t.py         |  6 ++++--
 tests/likelihoods/test_base.py        | 29 ++++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/ptgp/likelihoods/negative_binomial.py b/ptgp/likelihoods/negative_binomial.py
index 788e3d5..4c0a537 100644
--- a/ptgp/likelihoods/negative_binomial.py
+++ b/ptgp/likelihoods/negative_binomial.py
@@ -20,8 +20,10 @@ class NegativeBinomial(Likelihood):
         Number of Gauss-Hermite quadrature points (default 20).
     """
 
+    param_names = ("alpha",)
+
     def __init__(self, alpha, invlink=None, n_points=20):
-        self.alpha = alpha
+        self.alpha = pt.as_tensor_variable(alpha)
         self.invlink = invlink or pt.exp
         self.n_points = n_points
 
diff --git a/ptgp/likelihoods/student_t.py b/ptgp/likelihoods/student_t.py
index 247dd19..60a4329 100644
--- a/ptgp/likelihoods/student_t.py
+++ b/ptgp/likelihoods/student_t.py
@@ -18,9 +18,11 @@ class StudentT(Likelihood):
         Number of Gauss-Hermite quadrature points (default 20).
     """
 
+    param_names = ("nu", "sigma")
+
     def __init__(self, nu, sigma, n_points=20):
-        self.nu = nu
-        self.sigma = sigma
+        self.nu = pt.as_tensor_variable(nu)
+        self.sigma = pt.as_tensor_variable(sigma)
         self.n_points = n_points
 
     def _log_prob(self, f, y):
diff --git a/tests/likelihoods/test_base.py b/tests/likelihoods/test_base.py
index 9d2ab4f..71b9690 100644
--- a/tests/likelihoods/test_base.py
+++ b/tests/likelihoods/test_base.py
@@ -6,7 +6,7 @@
 import pytensor.tensor as pt
 import pytest
 
-from ptgp.likelihoods import Bernoulli, Gaussian, Poisson
+from ptgp.likelihoods import Bernoulli, Gaussian, NegativeBinomial, Poisson, StudentT
 
 
 def _eval(*tensors):
@@ -84,6 +84,28 @@ def test_reroot_matches_multidim_input(self):
         got = pytensor.function([X_new], rerooted)(new)
         np.testing.assert_allclose(got, 0.1 + np.sum(new**2, axis=1))
 
+    def test_reroots_all_tensor_params(self):
+        X = pt.matrix("X", shape=(None, 1))
+        new = np.array([[1.0], [2.0], [3.0]])
+        out = StudentT(nu=3.0 + X[:, 0] ** 2, sigma=_hetero(X)).clone_replace_data(
+            pt.as_tensor_variable(new)
+        )
+        nu_val, sigma_val = _eval(out.nu, out.sigma)
+        np.testing.assert_allclose(nu_val, 3.0 + new[:, 0] ** 2)
+        np.testing.assert_allclose(sigma_val, _hetero(new))
+
+    def test_negative_binomial_alpha(self):
+        X = pt.matrix("X", shape=(None, 1))
+        new = np.array([[0.0], [2.0]])
+        out = NegativeBinomial(alpha=_hetero(X)).clone_replace_data(pt.as_tensor_variable(new))
+        np.testing.assert_allclose(_eval(out.alpha), _hetero(new))
+
+    def test_leaves_scalar_params_untouched(self):
+        X = pt.matrix("X", shape=(None, 1))
+        lik = StudentT(nu=4.0, sigma=_hetero(X))
+        out = lik.clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
+        assert out.nu is lik.nu
+
     def test_returns_copy_leaving_original_untouched(self):
         X = pt.matrix("X", shape=(None, 1))
         lik = Gaussian(_hetero(X))
@@ -92,6 +114,11 @@ def test_returns_copy_leaving_original_untouched(self):
         assert lik.sigma is original
         assert out.sigma is not original
 
+    def test_preserves_non_parameter_attributes(self):
+        X = pt.matrix("X", shape=(None, 1))
+        lik = StudentT(nu=_hetero(X), sigma=0.5, n_points=33)
+        assert lik.clone_replace_data(pt.matrix("X_new", shape=(None, 1))).n_points == 33
+
     def test_scalar_hyperparam_survives_reroot(self):
         """A parameter mixing data with a scalar placeholder re-roots only the
         data; the scalar still flows through to the result."""

From e29178ffcbcc79cd4bdcb045135e15137fe04a0a Mon Sep 17 00:00:00 2001
From: Ricardo Vieira <ricardo.vieira1994@gmail.com>
Date: Fri, 29 May 2026 15:50:46 +0200
Subject: [PATCH 3/8] More functional design for Likelihood

---
 ptgp/gp/svgp.py                             |   2 +-
 ptgp/gp/unapproximated.py                   |  10 +-
 ptgp/gp/vfe.py                              |  10 +-
 ptgp/likelihoods/__init__.py                |  26 +-
 ptgp/likelihoods/base.py                    | 285 ++++++++++++--------
 ptgp/likelihoods/bernoulli.py               |  48 ++--
 ptgp/likelihoods/gaussian.py                |  61 +++--
 ptgp/likelihoods/negative_binomial.py       |  52 ++--
 ptgp/likelihoods/poisson.py                 |  46 ++--
 ptgp/likelihoods/student_t.py               |  49 ++--
 tests/likelihoods/test_base.py              | 125 ++++++---
 tests/likelihoods/test_functional_api.py    | 104 +++++++
 tests/likelihoods/test_negative_binomial.py |   4 +-
 tests/likelihoods/test_poisson.py           |   6 +-
 tests/test_heteroskedastic_predict.py       |  19 +-
 15 files changed, 567 insertions(+), 280 deletions(-)
 create mode 100644 tests/likelihoods/test_functional_api.py

diff --git a/ptgp/gp/svgp.py b/ptgp/gp/svgp.py
index a4f34b0..67eabca 100644
--- a/ptgp/gp/svgp.py
+++ b/ptgp/gp/svgp.py
@@ -230,7 +230,7 @@ def predict_marginal(self, X, incl_lik=False):
             fmean, fvar = conditional_unwhitened(A, Kmn, Knn_diag, self.q_mu, self.q_sqrt)
         fmean = fmean + self.mean(X)
         if incl_lik:
-            return self.likelihood.clone_replace_data(X).predict_mean_and_var(fmean, fvar)
+            return self.likelihood.at(X).predict_mean_and_var(fmean, fvar)
         return fmean, fvar
 
     def predict_joint(self, X):
diff --git a/ptgp/gp/unapproximated.py b/ptgp/gp/unapproximated.py
index 795998f..a2e3f9f 100644
--- a/ptgp/gp/unapproximated.py
+++ b/ptgp/gp/unapproximated.py
@@ -18,6 +18,10 @@ class Unapproximated:
         Mean function (default: ``Zero()``).
     sigma : tensor or PyMC random variable
         Observation noise standard deviation.
+    x : tensor, optional
+        The design matrix ``sigma`` was built against, for heteroskedastic
+        noise. Pass it so sigma can be re-rooted onto the test inputs at predict
+        time.
     """
 
     extra_vars = ()
@@ -25,11 +29,11 @@ class Unapproximated:
     default_objective = staticmethod(marginal_log_likelihood)
     predict_needs_data = True
 
-    def __init__(self, kernel, mean=None, sigma=None):
+    def __init__(self, kernel, mean=None, sigma=None, x=None):
         """Store the kernel and mean; build a Gaussian likelihood from sigma."""
         self.kernel = kernel
         self.mean = mean if mean is not None else Zero()
-        self.likelihood = Gaussian(sigma)
+        self.likelihood = Gaussian(sigma, x=x)
 
     def predict_marginal(self, X_new, X_train, y_train, incl_lik=False):
         """Posterior marginal mean and variance at each point in X_new.
@@ -64,5 +68,5 @@ def predict_marginal(self, X_new, X_train, y_train, incl_lik=False):
         fvar = Kss_diag - pt.sum(Kns * (Knn_inv @ Kns), axis=0)
 
         if incl_lik:
-            return self.likelihood.clone_replace_data(X_new).predict_mean_and_var(fmean, fvar)
+            return self.likelihood.at(X_new).predict_mean_and_var(fmean, fvar)
         return fmean, fvar
diff --git a/ptgp/gp/vfe.py b/ptgp/gp/vfe.py
index 11c5fa6..a3a5f44 100644
--- a/ptgp/gp/vfe.py
+++ b/ptgp/gp/vfe.py
@@ -22,12 +22,16 @@ class VFE:
         Observation noise standard deviation.
     inducing_variable : InducingVariables
         Inducing point locations.
+    x : tensor, optional
+        The design matrix ``sigma`` was built against, for heteroskedastic
+        noise. Pass it so sigma can be re-rooted onto the test inputs at predict
+        time.
     """
 
     default_objective = staticmethod(collapsed_elbo)
     predict_needs_data = True
 
-    def __init__(self, kernel, mean=None, sigma=None, inducing_variable=None):
+    def __init__(self, kernel, mean=None, sigma=None, inducing_variable=None, x=None):
         """Store the kernel, mean, and inducing variable; build a Gaussian likelihood from sigma."""
         if not hasattr(inducing_variable, "Z"):
             raise TypeError(
@@ -37,7 +41,7 @@ def __init__(self, kernel, mean=None, sigma=None, inducing_variable=None):
             )
         self.kernel = kernel
         self.mean = mean if mean is not None else Zero()
-        self.likelihood = Gaussian(sigma)
+        self.likelihood = Gaussian(sigma, x=x)
         self.inducing_variable = inducing_variable
 
     @property
@@ -87,5 +91,5 @@ def predict_marginal(self, X_new, X_train, y_train, incl_lik=False):
         fvar = Kss_diag - pt.sum(Kus * ((Kuu_inv - Sigma_inv) @ Kus), axis=0)
 
         if incl_lik:
-            return self.likelihood.clone_replace_data(X_new).predict_mean_and_var(fmean, fvar)
+            return self.likelihood.at(X_new).predict_mean_and_var(fmean, fvar)
         return fmean, fvar
diff --git a/ptgp/likelihoods/__init__.py b/ptgp/likelihoods/__init__.py
index f51989b..a14b3c8 100644
--- a/ptgp/likelihoods/__init__.py
+++ b/ptgp/likelihoods/__init__.py
@@ -1,8 +1,30 @@
-from ptgp.likelihoods.base import Likelihood
+from ptgp.likelihoods.base import (
+    Likelihood,
+    op_of,
+    param,
+    predict_log_density,
+    predict_mean_and_var,
+    variational_expectation,
+    at,
+)
 from ptgp.likelihoods.bernoulli import Bernoulli
 from ptgp.likelihoods.gaussian import Gaussian
 from ptgp.likelihoods.negative_binomial import NegativeBinomial
 from ptgp.likelihoods.poisson import Poisson
 from ptgp.likelihoods.student_t import StudentT
 
-__all__ = ["Likelihood", "Gaussian", "Bernoulli", "StudentT", "Poisson", "NegativeBinomial"]
+__all__ = [
+    "Likelihood",
+    "Gaussian",
+    "Bernoulli",
+    "StudentT",
+    "Poisson",
+    "NegativeBinomial",
+    # Purely functional API — operate on a likelihood node via ``owner.op``.
+    "op_of",
+    "param",
+    "at",
+    "variational_expectation",
+    "predict_mean_and_var",
+    "predict_log_density",
+]
diff --git a/ptgp/likelihoods/base.py b/ptgp/likelihoods/base.py
index 6669438..8d7e4e1 100644
--- a/ptgp/likelihoods/base.py
+++ b/ptgp/likelihoods/base.py
@@ -3,139 +3,97 @@
 import numpy as np
 import pytensor.tensor as pt
 
-from pytensor.graph.replace import clone_replace
-from pytensor.graph.traversal import ancestors
-from pytensor.tensor.type import TensorType
+from pytensor.graph.basic import Apply
+from pytensor.graph.null_type import NullType
+from pytensor.graph.op import Op
+from pytensor.graph.replace import graph_replace
 
+_NULL = NullType()
 
-def _data_inputs(value, X):
-    """Leaf nodes in ``value``'s graph that look like the design matrix ``X``.
 
-    A leaf (no owner) with the same ndim as ``X`` and matching feature
-    dimensions is taken to be a copy of the model input — whether it is a
-    symbolic placeholder, a ``pm.Data`` / shared variable, or a constant array
-    baked in via ``pt.as_tensor_variable``. Matching is by type, so ``pm.Data``
-    (a SharedVariable whose type is ``TensorType``) is caught. PyMC random
-    variables and other computed nodes carry an owner and are excluded; scalars
-    never match, so literal constants like ``0.1`` are left alone.
-    """
-    feat = X.type.shape[1:]
-    return [
-        v
-        for v in ancestors([value])
-        if v.owner is None
-        and isinstance(v.type, TensorType)
-        and v.type.ndim == X.type.ndim
-        and v.type.ndim > 0
-        and all(a is None or b is None or a == b for a, b in zip(v.type.shape[1:], feat))
-    ]
-
-
-def _reroot(value, X):
-    """Re-root ``value`` onto ``X`` by replacing design-matrix-shaped leaves.
-
-    The input(s) to replace are discovered from the graph, so the caller only
-    supplies the new data. ``rebuild_strict=False`` lets a baked constant array
-    (whose row dimension is statically frozen) be replaced by ``X`` without
-    coercing ``X`` back to that frozen shape. A data-independent ``value``
-    (scalar or PyMC RV) is returned unchanged.
-
-    ``X`` must be a 2D design matrix ``(N, D)``. The discovery in
-    :func:`_data_inputs` keys on dimensionality: parameters are per-observation
-    1D vectors, so a 2D ``X`` is what distinguishes the data from the parameter
-    itself. A 1D ``X`` would collapse that gap and could match the parameter, so
-    it is rejected.
-    """
-    if X.type.ndim < 2:
-        raise ValueError(
-            "Re-rooting a likelihood parameter requires a 2D design matrix (N, D); "
-            f"got X with ndim={X.type.ndim}. Parameters are per-observation 1D "
-            "vectors, so a 1D X cannot be distinguished from the parameter itself."
-        )
-    value = pt.as_tensor_variable(value)
-    inputs = _data_inputs(value, X)
-    if not inputs:
-        return value
-    return clone_replace(value, {v: X for v in inputs}, rebuild_strict=False)
+class LikelihoodOp(Op):
+    """Structural marker keeping a likelihood inside the PyTensor graph.
 
+    The Apply node consumes the design matrix (optional; input 0 when present)
+    and the parameters as its *inputs*, and emits a single ``NoneType`` handle as
+    output. It computes nothing — its sole purpose is to be a real graph node, so
+    ``graph_replace`` / ``fgraph.clone`` / PyMC conditioning carry the likelihood
+    (and re-root its parameters) instead of it being Python-side state those
+    operations would silently drop. The Op *type* (the subclass) encodes the
+    family, so behaviour is recoverable via ``owner.op``; the parameters are the
+    node's inputs.
 
-class Likelihood:
-    """Base class for all PTGP likelihoods.
-
-    Subclasses must implement ``_log_prob(f, y)`` which returns the symbolic
-    log-likelihood log p(y|f).
-
-    Gaussian likelihoods override ``variational_expectation`` and
-    ``predict_mean_and_var`` with closed-form expressions.  Non-Gaussian
-    likelihoods inherit the default Gauss-Hermite quadrature implementations.
+    Because the parameters are *inputs* and the handle output is never consumed
+    by the loss, the node is never on the differentiation path nor in a compiled
+    loss graph — so it needs no gradient, view, shape, or strip machinery. To
+    keep it alive across graph operations, hold its output (or register it as a
+    PyMC named variable).
 
     Parameters
     ----------
-    invlink : callable, optional
-        Inverse link function mapping latent f to the natural parameter.
-        Subclasses set a default (e.g. exp for Poisson, sigmoid for Bernoulli).
+    param_names : tuple of str
+        Parameter names, in input order (after the optional design matrix).
     n_points : int
-        Number of Gauss-Hermite quadrature points (default 20).
+        Gauss-Hermite quadrature points for the default expectations.
+    invlink : callable or None
+        Inverse link function (set by subclasses that need one).
+    has_data : bool
+        Whether input 0 is the design matrix ``X`` (i.e. parameters depend on it).
     """
 
-    n_points: int = 20
-    invlink = None
-    param_names: tuple = ()
+    __props__ = ("param_names", "n_points", "invlink", "has_data")
 
-    def clone_replace_data(self, X):
-        """Return a copy with every data-dependent parameter re-rooted onto X.
+    def __init__(self, param_names=(), n_points=20, invlink=None, has_data=False):
+        self.param_names = tuple(param_names)
+        self.n_points = n_points
+        self.invlink = invlink
+        self.has_data = has_data
 
-        Each parameter named in ``param_names`` whose graph depends on a free
-        data input is re-expressed over ``X``; data-independent parameters are
-        left untouched. Used at predict time to evaluate parameters at the test
-        inputs rather than the training inputs they were built against.
-        """
-        new = copy.copy(self)
-        for name in self.param_names:
-            setattr(new, name, _reroot(getattr(self, name), X))
-        return new
+    def make_node(self, *args):
+        args = [pt.as_tensor_variable(a) for a in args]
+        return Apply(self, args, [_NULL()])
+
+    def perform(self, node, inputs, outputs):
+        outputs[0][0] = None
 
-    def _log_prob(self, f, y):
+    # --- symbolic behaviour: subclasses implement these three ---
+
+    def _log_prob(self, f, y, *params):
         """Symbolic log p(y|f). Subclasses must implement."""
         raise NotImplementedError
 
-    def _conditional_mean(self, f):
+    def _conditional_mean(self, f, *params):
         """E[y|f]. Subclasses must implement."""
         raise NotImplementedError
 
-    def _conditional_variance(self, f):
+    def _conditional_variance(self, f, *params):
         """Var[y|f]. Subclasses must implement."""
         raise NotImplementedError
 
-    def variational_expectation(self, y, mu, var):
-        """E_{q(f)}[log p(y|f)] where q(f) = N(mu, var).
+    # --- expectations (defaults via quadrature; subclasses may override) ---
 
-        Default: Gauss-Hermite quadrature.
-        """
-        return self._gauss_hermite(lambda f, y: self._log_prob(f, y), y, mu, var)
+    def variational_expectation(self, params, y, mu, var):
+        """E_{q(f)}[log p(y|f)] where q(f) = N(mu, var). Default: quadrature."""
+        return self._gauss_hermite(lambda f, yy: self._log_prob(f, yy, *params), y, mu, var)
 
-    def predict_mean_and_var(self, mu, var):
-        """Predictive mean and variance: E_{q(f)}[E[y|f]], E_{q(f)}[Var[y|f]] + Var_{q(f)}[E[y|f]].
-
-        Default: Gauss-Hermite quadrature.
-        """
+    def predict_mean_and_var(self, params, mu, var):
+        """Predictive mean and variance. Default: Gauss-Hermite quadrature."""
         E_mean = self._gauss_hermite(
-            lambda f, _: self._conditional_mean(f), pt.zeros_like(mu), mu, var
+            lambda f, _: self._conditional_mean(f, *params), pt.zeros_like(mu), mu, var
         )
         E_mean_sq = self._gauss_hermite(
-            lambda f, _: self._conditional_mean(f) ** 2, pt.zeros_like(mu), mu, var
+            lambda f, _: self._conditional_mean(f, *params) ** 2, pt.zeros_like(mu), mu, var
         )
         E_var = self._gauss_hermite(
-            lambda f, _: self._conditional_variance(f), pt.zeros_like(mu), mu, var
+            lambda f, _: self._conditional_variance(f, *params), pt.zeros_like(mu), mu, var
         )
         return E_mean, E_var + E_mean_sq - E_mean**2
 
-    def predict_log_density(self, y, mu, var):
-        """log E_{q(f)}[p(y|f)] — predictive log-density at test points.
-
-        Default: Gauss-Hermite quadrature in log-space for numerical stability.
-        """
-        return self._gauss_hermite_logspace(lambda f, y: self._log_prob(f, y), y, mu, var)
+    def predict_log_density(self, params, y, mu, var):
+        """log E_{q(f)}[p(y|f)]. Default: log-space Gauss-Hermite quadrature."""
+        return self._gauss_hermite_logspace(
+            lambda f, yy: self._log_prob(f, yy, *params), y, mu, var
+        )
 
     def _gauss_hermite(self, func, y, mu, var):
         """E_{q(f)}[func(f, y)] via Gauss-Hermite quadrature."""
@@ -143,27 +101,134 @@ def _gauss_hermite(self, func, y, mu, var):
         gh_points = pt.as_tensor_variable(gh_points)
         gh_weights = pt.as_tensor_variable(gh_weights / np.sqrt(np.pi))
 
-        # f = mu + sqrt(2 * var) * t_j, shape (N, n_points)
         sd = pt.sqrt(var)[:, None]
         F = mu[:, None] + pt.sqrt(2.0) * sd * gh_points[None, :]
-
-        # func(f, y) at each quadrature point, shape (N, n_points)
         vals = func(F, y[:, None])
-
         return pt.sum(vals * gh_weights[None, :], axis=1)
 
     def _gauss_hermite_logspace(self, func, y, mu, var):
-        """log E_{q(f)}[exp(func(f, y))] via Gauss-Hermite quadrature.
-
-        Uses logsumexp for numerical stability.
-        """
+        """log E_{q(f)}[exp(func(f, y))] via quadrature, using logsumexp."""
         gh_points, gh_weights = np.polynomial.hermite.hermgauss(self.n_points)
         gh_points = pt.as_tensor_variable(gh_points)
         log_weights = pt.as_tensor_variable(np.log(gh_weights / np.sqrt(np.pi)))
 
         sd = pt.sqrt(var)[:, None]
         F = mu[:, None] + pt.sqrt(2.0) * sd * gh_points[None, :]
-
         log_vals = func(F, y[:, None]) + log_weights[None, :]
-
         return pt.logsumexp(log_vals, axis=1)
+
+
+# --- Purely functional API ------------------------------------------------
+#
+# A likelihood is fully described by its node: ``owner.op`` is the family (with
+# behaviour), and the node's *inputs* are the optional design matrix followed by
+# the parameters. These free functions take such a node (a "lik" — the NoneType
+# handle) and dispatch through ``owner.op``, so a likelihood can be used with no
+# holder at all. The :class:`Likelihood` holder is a thin view built on these.
+
+
+def _params_of(node):
+    """The parameter inputs of a likelihood node (after the optional X slot)."""
+    off = 1 if node.op.has_data else 0
+    return list(node.inputs[off:])
+
+
+def op_of(lik):
+    """The :class:`LikelihoodOp` behind a likelihood node (``owner.op``)."""
+    return lik.owner.op
+
+
+def param(lik, name):
+    """The named parameter input of a likelihood node (e.g. ``"sigma"``)."""
+    node = lik.owner
+    off = 1 if node.op.has_data else 0
+    return node.inputs[off + node.op.param_names.index(name)]
+
+
+def at(lik, X):
+    """Re-root a likelihood node onto design matrix ``X``.
+
+    A no-op for a likelihood with no design matrix (``op.has_data`` is False) —
+    there is no ``X`` input to swap. Otherwise returns a new handle whose
+    parameter inputs are re-expressed over ``X`` (RV-safe via ``graph_replace``).
+    """
+    node = lik.owner
+    if not node.op.has_data:
+        return lik
+    return graph_replace([lik], {node.inputs[0]: X}, strict=False)[0]
+
+
+def variational_expectation(lik, y, mu, var):
+    """E_{q(f)}[log p(y|f)] for a likelihood node, dispatched via ``owner.op``."""
+    return lik.owner.op.variational_expectation(_params_of(lik.owner), y, mu, var)
+
+
+def predict_mean_and_var(lik, mu, var):
+    """Predictive mean and variance for a likelihood node."""
+    return lik.owner.op.predict_mean_and_var(_params_of(lik.owner), mu, var)
+
+
+def predict_log_density(lik, y, mu, var):
+    """Predictive log-density for a likelihood node."""
+    return lik.owner.op.predict_log_density(_params_of(lik.owner), y, mu, var)
+
+
+def _param_property(name):
+    """Expose a named parameter as ``self.<name>`` (the parameter input)."""
+    return property(lambda self: self._params[type(self).param_names.index(name)])
+
+
+class Likelihood:
+    """Public likelihood object — a thin view over a :class:`LikelihoodOp` node.
+
+    Every likelihood is a node: construction builds ``op(x?, *params)`` whose
+    single ``NoneType`` output (``self._lik``) is the handle, and whose inputs
+    are the optional design matrix followed by the parameter graphs. The handle
+    keeps the likelihood in the PyTensor graph (carried by ``graph_replace`` /
+    ``clone``); the methods read the parameters off the node's inputs and
+    dispatch behaviour through ``owner.op`` — the same calls the free functions
+    above make.
+
+    The node is never consumed by the loss (the loss uses the parameter graphs
+    directly, via ``.sigma`` etc.), so it stays off the gradient path and out of
+    compiled graphs. :meth:`at` re-roots the parameters when there's a design
+    matrix; it is a no-op otherwise (homoskedastic or parameterless).
+
+    Subclasses set ``op_cls``, ``param_names``, and expose each parameter via
+    :func:`_param_property`.
+    """
+
+    op_cls = LikelihoodOp
+    param_names = ()
+    n_points = 20
+
+    def __init__(self, x=None, n_points=20, invlink=None, **params):
+        self.n_points = n_points
+        self.invlink = invlink
+        has_data = bool(self.param_names) and x is not None
+        self._op = self.op_cls(self.param_names, n_points, invlink, has_data=has_data)
+        ordered = [pt.as_tensor_variable(params[name]) for name in self.param_names]
+        args = ([pt.as_tensor_variable(x)] if has_data else []) + ordered
+        self._lik = self._op(*args)  # NoneType handle — every likelihood is a node
+
+    @property
+    def _params(self):
+        return _params_of(self._lik.owner)
+
+    def at(self, X):
+        """Return a copy with parameters re-rooted onto ``X`` (no-op without a design matrix)."""
+        new = copy.copy(self)
+        new._lik = at(self._lik, X)
+        return new
+
+    def variational_expectation(self, y, mu, var):
+        """E_{q(f)}[log p(y|f)] where q(f) = N(mu, var)."""
+        return self._op.variational_expectation(self._params, y, mu, var)
+
+    def predict_mean_and_var(self, mu, var):
+        """Predictive mean and variance under q(f) = N(mu, var)."""
+        return self._op.predict_mean_and_var(self._params, mu, var)
+
+    def predict_log_density(self, y, mu, var):
+        """Predictive log-density log E_{q(f)}[p(y|f)] at test points."""
+        return self._op.predict_log_density(self._params, y, mu, var)
diff --git a/ptgp/likelihoods/bernoulli.py b/ptgp/likelihoods/bernoulli.py
index 583f7a4..0c0b6f8 100644
--- a/ptgp/likelihoods/bernoulli.py
+++ b/ptgp/likelihoods/bernoulli.py
@@ -1,6 +1,6 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood
+from ptgp.likelihoods.base import Likelihood, LikelihoodOp
 
 
 def inv_probit(x):
@@ -9,22 +9,8 @@ def inv_probit(x):
     return 0.5 * (1.0 + pt.erf(x / pt.sqrt(2.0))) * (1.0 - 2.0 * jitter) + jitter
 
 
-class Bernoulli(Likelihood):
-    """Bernoulli likelihood: p(y=1|f) = invlink(f).
-
-    Default link is probit. Variational expectation via Gauss-Hermite quadrature.
-
-    Parameters
-    ----------
-    invlink : callable, optional
-        Inverse link function (default: probit). Use ``pt.sigmoid`` for logit link.
-    n_points : int
-        Number of Gauss-Hermite quadrature points (default 20).
-    """
-
-    def __init__(self, invlink=None, n_points=20):
-        self.invlink = invlink or inv_probit
-        self.n_points = n_points
+class BernoulliOp(LikelihoodOp):
+    """Bernoulli likelihood Op. Closed-form predictive for the probit link."""
 
     def _log_prob(self, f, y):
         p = self.invlink(f)
@@ -37,12 +23,28 @@ def _conditional_variance(self, f):
         p = self.invlink(f)
         return p * (1.0 - p)
 
-    def predict_mean_and_var(self, mu, var):
-        """Closed-form for probit link: p = Phi(mu / sqrt(1 + var)).
-
-        Falls back to quadrature for other link functions.
-        """
+    def predict_mean_and_var(self, params, mu, var):
         if self.invlink is inv_probit:
             p = inv_probit(mu / pt.sqrt(1.0 + var))
             return p, p - p**2
-        return super().predict_mean_and_var(mu, var)
+        return super().predict_mean_and_var(params, mu, var)
+
+
+class Bernoulli(Likelihood):
+    """Bernoulli likelihood: p(y=1|f) = invlink(f).
+
+    Default link is probit. Variational expectation via Gauss-Hermite quadrature.
+
+    Parameters
+    ----------
+    invlink : callable, optional
+        Inverse link function (default: probit). Use ``pt.sigmoid`` for logit link.
+    n_points : int
+        Number of Gauss-Hermite quadrature points (default 20).
+    """
+
+    op_cls = BernoulliOp
+    param_names = ()
+
+    def __init__(self, invlink=None, n_points=20):
+        super().__init__(n_points=n_points, invlink=invlink or inv_probit)
diff --git a/ptgp/likelihoods/gaussian.py b/ptgp/likelihoods/gaussian.py
index 9b1964a..3e0119f 100644
--- a/ptgp/likelihoods/gaussian.py
+++ b/ptgp/likelihoods/gaussian.py
@@ -1,10 +1,36 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood
+from ptgp.likelihoods.base import Likelihood, LikelihoodOp, _param_property
 
 LOG2PI = pt.log(2.0 * pt.pi)
 
 
+class GaussianOp(LikelihoodOp):
+    """Gaussian likelihood Op: closed-form expectations (no quadrature)."""
+
+    def _log_prob(self, f, y, sigma):
+        return -0.5 * (LOG2PI + pt.log(sigma**2) + (y - f) ** 2 / sigma**2)
+
+    def _conditional_mean(self, f, sigma):
+        return f
+
+    def _conditional_variance(self, f, sigma):
+        return pt.ones_like(f) * sigma**2
+
+    def variational_expectation(self, params, y, mu, var):
+        (sigma,) = params
+        return -0.5 * (LOG2PI + pt.log(sigma**2) + ((y - mu) ** 2 + var) / sigma**2)
+
+    def predict_mean_and_var(self, params, mu, var):
+        (sigma,) = params
+        return mu, var + sigma**2
+
+    def predict_log_density(self, params, y, mu, var):
+        (sigma,) = params
+        total_var = var + sigma**2
+        return -0.5 * (LOG2PI + pt.log(total_var) + (y - mu) ** 2 / total_var)
+
+
 class Gaussian(Likelihood):
     """Gaussian likelihood p(y|f) = N(y; f, sigma^2).
 
@@ -14,31 +40,16 @@ class Gaussian(Likelihood):
     ----------
     sigma : tensor
         Observation noise standard deviation. May be a scalar (homoskedastic)
-        or a vector built against a symbolic ``X`` (heteroskedastic). At
-        predict time, :meth:`Likelihood.clone_replace_data` re-roots sigma's graph onto
-        the test inputs.
+        or a vector built against a design matrix ``X`` (heteroskedastic).
+    x : tensor, optional
+        The design matrix ``sigma`` was built against. Pass it when ``sigma`` is
+        heteroskedastic so :meth:`Likelihood.at` can re-root
+        sigma onto the test inputs at predict time.
     """
 
+    op_cls = GaussianOp
     param_names = ("sigma",)
+    sigma = _param_property("sigma")
 
-    def __init__(self, sigma):
-        self.sigma = pt.as_tensor_variable(sigma)
-
-    def _log_prob(self, f, y):
-        return -0.5 * (LOG2PI + pt.log(self.sigma**2) + (y - f) ** 2 / self.sigma**2)
-
-    def _conditional_mean(self, f):
-        return f
-
-    def _conditional_variance(self, f):
-        return pt.ones_like(f) * self.sigma**2
-
-    def variational_expectation(self, y, mu, var):
-        return -0.5 * (LOG2PI + pt.log(self.sigma**2) + ((y - mu) ** 2 + var) / self.sigma**2)
-
-    def predict_mean_and_var(self, mu, var):
-        return mu, var + self.sigma**2
-
-    def predict_log_density(self, y, mu, var):
-        total_var = var + self.sigma**2
-        return -0.5 * (LOG2PI + pt.log(total_var) + (y - mu) ** 2 / total_var)
+    def __init__(self, sigma, x=None):
+        super().__init__(x=x, sigma=sigma)
diff --git a/ptgp/likelihoods/negative_binomial.py b/ptgp/likelihoods/negative_binomial.py
index 4c0a537..90433fe 100644
--- a/ptgp/likelihoods/negative_binomial.py
+++ b/ptgp/likelihoods/negative_binomial.py
@@ -1,6 +1,27 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood
+from ptgp.likelihoods.base import Likelihood, LikelihoodOp, _param_property
+
+
+class NegativeBinomialOp(LikelihoodOp):
+    """Negative binomial likelihood Op. Expectations via quadrature."""
+
+    def _log_prob(self, f, y, alpha):
+        mu = self.invlink(f)
+        return (
+            pt.gammaln(y + alpha)
+            - pt.gammaln(alpha)
+            - pt.gammaln(y + 1.0)
+            + alpha * pt.log(alpha / (alpha + mu))
+            + y * pt.log(mu / (alpha + mu))
+        )
+
+    def _conditional_mean(self, f, alpha):
+        return self.invlink(f)
+
+    def _conditional_variance(self, f, alpha):
+        mu = self.invlink(f)
+        return mu + mu**2 / alpha
 
 
 class NegativeBinomial(Likelihood):
@@ -18,29 +39,14 @@ class NegativeBinomial(Likelihood):
         Inverse link function (default: exp).
     n_points : int
         Number of Gauss-Hermite quadrature points (default 20).
+    x : tensor, optional
+        The design matrix ``alpha`` was built against, for a heteroskedastic
+        parameter re-rooted onto the test inputs at predict time.
     """
 
+    op_cls = NegativeBinomialOp
     param_names = ("alpha",)
+    alpha = _param_property("alpha")
 
-    def __init__(self, alpha, invlink=None, n_points=20):
-        self.alpha = pt.as_tensor_variable(alpha)
-        self.invlink = invlink or pt.exp
-        self.n_points = n_points
-
-    def _log_prob(self, f, y):
-        mu = self.invlink(f)
-        alpha = self.alpha
-        return (
-            pt.gammaln(y + alpha)
-            - pt.gammaln(alpha)
-            - pt.gammaln(y + 1.0)
-            + alpha * pt.log(alpha / (alpha + mu))
-            + y * pt.log(mu / (alpha + mu))
-        )
-
-    def _conditional_mean(self, f):
-        return self.invlink(f)
-
-    def _conditional_variance(self, f):
-        mu = self.invlink(f)
-        return mu + mu**2 / self.alpha
+    def __init__(self, alpha, invlink=None, n_points=20, x=None):
+        super().__init__(x=x, n_points=n_points, invlink=invlink or pt.exp, alpha=alpha)
diff --git a/ptgp/likelihoods/poisson.py b/ptgp/likelihoods/poisson.py
index d434ff4..b6d970c 100644
--- a/ptgp/likelihoods/poisson.py
+++ b/ptgp/likelihoods/poisson.py
@@ -1,6 +1,25 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood
+from ptgp.likelihoods.base import Likelihood, LikelihoodOp
+
+
+class PoissonOp(LikelihoodOp):
+    """Poisson likelihood Op. Closed-form variational expectation for log link."""
+
+    def _log_prob(self, f, y):
+        lam = self.invlink(f)
+        return y * pt.log(lam) - lam - pt.gammaln(y + 1.0)
+
+    def _conditional_mean(self, f):
+        return self.invlink(f)
+
+    def _conditional_variance(self, f):
+        return self.invlink(f)
+
+    def variational_expectation(self, params, y, mu, var):
+        if self.invlink is pt.exp:
+            return y * mu - pt.exp(mu + var / 2.0) - pt.gammaln(y + 1.0)
+        return super().variational_expectation(params, y, mu, var)
 
 
 class Poisson(Likelihood):
@@ -17,25 +36,8 @@ class Poisson(Likelihood):
         Number of Gauss-Hermite quadrature points (default 20).
     """
 
-    def __init__(self, invlink=None, n_points=20):
-        self.invlink = invlink or pt.exp
-        self.n_points = n_points
-
-    def _log_prob(self, f, y):
-        lam = self.invlink(f)
-        return y * pt.log(lam) - lam - pt.gammaln(y + 1.0)
-
-    def _conditional_mean(self, f):
-        return self.invlink(f)
+    op_cls = PoissonOp
+    param_names = ()
 
-    def _conditional_variance(self, f):
-        return self.invlink(f)
-
-    def variational_expectation(self, y, mu, var):
-        """Closed-form for log link: E_q[y*f - exp(f) - log(y!)].
-
-        Falls back to quadrature for other link functions.
-        """
-        if self.invlink is pt.exp:
-            return y * mu - pt.exp(mu + var / 2.0) - pt.gammaln(y + 1.0)
-        return super().variational_expectation(y, mu, var)
+    def __init__(self, invlink=None, n_points=20):
+        super().__init__(n_points=n_points, invlink=invlink or pt.exp)
diff --git a/ptgp/likelihoods/student_t.py b/ptgp/likelihoods/student_t.py
index 60a4329..29145c1 100644
--- a/ptgp/likelihoods/student_t.py
+++ b/ptgp/likelihoods/student_t.py
@@ -1,6 +1,25 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood
+from ptgp.likelihoods.base import Likelihood, LikelihoodOp, _param_property
+
+
+class StudentTOp(LikelihoodOp):
+    """Student-T likelihood Op. Expectations via Gauss-Hermite quadrature."""
+
+    def _log_prob(self, f, y, nu, sigma):
+        z = (y - f) / sigma
+        return (
+            pt.gammaln((nu + 1.0) / 2.0)
+            - pt.gammaln(nu / 2.0)
+            - 0.5 * pt.log(nu * pt.pi * sigma**2)
+            - 0.5 * (nu + 1.0) * pt.log1p(z**2 / nu)
+        )
+
+    def _conditional_mean(self, f, nu, sigma):
+        return f
+
+    def _conditional_variance(self, f, nu, sigma):
+        return pt.ones_like(f) * sigma**2 * nu / (nu - 2.0)
 
 
 class StudentT(Likelihood):
@@ -16,27 +35,15 @@ class StudentT(Likelihood):
         Scale parameter.
     n_points : int
         Number of Gauss-Hermite quadrature points (default 20).
+    x : tensor, optional
+        The design matrix ``nu``/``sigma`` were built against, for
+        heteroskedastic parameters re-rooted onto the test inputs at predict.
     """
 
+    op_cls = StudentTOp
     param_names = ("nu", "sigma")
+    nu = _param_property("nu")
+    sigma = _param_property("sigma")
 
-    def __init__(self, nu, sigma, n_points=20):
-        self.nu = pt.as_tensor_variable(nu)
-        self.sigma = pt.as_tensor_variable(sigma)
-        self.n_points = n_points
-
-    def _log_prob(self, f, y):
-        nu, sigma = self.nu, self.sigma
-        z = (y - f) / sigma
-        return (
-            pt.gammaln((nu + 1.0) / 2.0)
-            - pt.gammaln(nu / 2.0)
-            - 0.5 * pt.log(nu * pt.pi * sigma**2)
-            - 0.5 * (nu + 1.0) * pt.log1p(z**2 / nu)
-        )
-
-    def _conditional_mean(self, f):
-        return f
-
-    def _conditional_variance(self, f):
-        return pt.ones_like(f) * self.sigma**2 * self.nu / (self.nu - 2.0)
+    def __init__(self, nu, sigma, n_points=20, x=None):
+        super().__init__(x=x, n_points=n_points, nu=nu, sigma=sigma)
diff --git a/tests/likelihoods/test_base.py b/tests/likelihoods/test_base.py
index 71b9690..3bca5cb 100644
--- a/tests/likelihoods/test_base.py
+++ b/tests/likelihoods/test_base.py
@@ -63,31 +63,47 @@ def softplus(f):
 
 
 class TestCloneReplaceData:
-    @pytest.mark.parametrize("kind", ["symbolic", "baked", "pm_data"])
+    @pytest.mark.parametrize("kind", ["symbolic", "pm_data"])
     def test_reroot_matches_direct_build(self, kind):
-        """Re-rooting yields the parameter evaluated at the new inputs, however
-        the training data was supplied. ``train`` and ``new`` differ in length so
-        a baked array's statically frozen row dimension cannot be re-imposed."""
+        """Re-rooting yields the parameter evaluated at the new inputs, for the
+        variable-length data providers used in prediction. ``train`` and ``new``
+        differ in length to exercise re-rooting onto a different number of
+        points (a symbolic placeholder or pm.Data is ``(None, D)``)."""
         train = np.linspace(-1.0, 1.0, 5)[:, None]
         new = np.array([[0.0], [1.0], [2.0]])
         X_new = pt.matrix("X_new", shape=(None, 1))
         with pm.Model():
-            rerooted = Gaussian(_hetero(_provider(train, kind))).clone_replace_data(X_new).sigma
+            Xtr = _provider(train, kind)
+            rerooted = Gaussian(_hetero(Xtr), x=Xtr).at(X_new).sigma
         got = pytensor.function([X_new], rerooted)(new)
         np.testing.assert_allclose(got, _hetero(new))
 
+    def test_baked_constant_x_reroots_at_matching_length(self):
+        """A frozen numpy constant as ``x`` re-roots cleanly only at its own
+        length: its static row dimension propagates into the parameter graph at
+        build time, and re-rooting (which uses ``graph_replace`` to preserve the
+        identity of any RV hyperparameters) cannot un-specialize it afterward.
+        Variable-length re-rooting requires a symbolic placeholder or pm.Data."""
+        arr = np.linspace(-1.0, 1.0, 5)[:, None]
+        same_len = np.linspace(2.0, 3.0, 5)[:, None]
+        X_new = pt.matrix("X_new", shape=(None, 1))
+        Xc = pt.as_tensor_variable(arr)  # one node, reused as both param input and x=
+        rerooted = Gaussian(_hetero(Xc), x=Xc).at(X_new).sigma
+        got = pytensor.function([X_new], rerooted)(same_len)
+        np.testing.assert_allclose(got, _hetero(same_len))
+
     def test_reroot_matches_multidim_input(self):
         new = np.array([[0.0, 1.0], [2.0, 3.0]])
         X = pt.matrix("X", shape=(None, 2))
         X_new = pt.matrix("X_new", shape=(None, 2))
-        rerooted = Gaussian(0.1 + pt.sum(X**2, axis=1)).clone_replace_data(X_new).sigma
+        rerooted = Gaussian(0.1 + pt.sum(X**2, axis=1), x=X).at(X_new).sigma
         got = pytensor.function([X_new], rerooted)(new)
         np.testing.assert_allclose(got, 0.1 + np.sum(new**2, axis=1))
 
     def test_reroots_all_tensor_params(self):
         X = pt.matrix("X", shape=(None, 1))
         new = np.array([[1.0], [2.0], [3.0]])
-        out = StudentT(nu=3.0 + X[:, 0] ** 2, sigma=_hetero(X)).clone_replace_data(
+        out = StudentT(nu=3.0 + X[:, 0] ** 2, sigma=_hetero(X), x=X).at(
             pt.as_tensor_variable(new)
         )
         nu_val, sigma_val = _eval(out.nu, out.sigma)
@@ -97,56 +113,99 @@ def test_reroots_all_tensor_params(self):
     def test_negative_binomial_alpha(self):
         X = pt.matrix("X", shape=(None, 1))
         new = np.array([[0.0], [2.0]])
-        out = NegativeBinomial(alpha=_hetero(X)).clone_replace_data(pt.as_tensor_variable(new))
+        out = NegativeBinomial(alpha=_hetero(X), x=X).at(
+            pt.as_tensor_variable(new)
+        )
         np.testing.assert_allclose(_eval(out.alpha), _hetero(new))
 
     def test_leaves_scalar_params_untouched(self):
+        """A data-independent parameter keeps its value through re-rooting (it
+        shares the likelihood node, so its output object is rebuilt, but the
+        value it computes is unchanged)."""
         X = pt.matrix("X", shape=(None, 1))
-        lik = StudentT(nu=4.0, sigma=_hetero(X))
-        out = lik.clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
-        assert out.nu is lik.nu
+        lik = StudentT(nu=4.0, sigma=_hetero(X), x=X)
+        out = lik.at(pt.as_tensor_variable(np.array([[0.0], [1.0], [2.0]])))
+        np.testing.assert_allclose(_eval(out.nu), 4.0)
 
     def test_returns_copy_leaving_original_untouched(self):
         X = pt.matrix("X", shape=(None, 1))
-        lik = Gaussian(_hetero(X))
+        lik = Gaussian(_hetero(X), x=X)
         original = lik.sigma
-        out = lik.clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
+        out = lik.at(pt.matrix("X_new", shape=(None, 1)))
         assert lik.sigma is original
         assert out.sigma is not original
 
     def test_preserves_non_parameter_attributes(self):
         X = pt.matrix("X", shape=(None, 1))
-        lik = StudentT(nu=_hetero(X), sigma=0.5, n_points=33)
-        assert lik.clone_replace_data(pt.matrix("X_new", shape=(None, 1))).n_points == 33
+        lik = StudentT(nu=_hetero(X), sigma=0.5, n_points=33, x=X)
+        assert lik.at(pt.matrix("X_new", shape=(None, 1))).n_points == 33
 
     def test_scalar_hyperparam_survives_reroot(self):
         """A parameter mixing data with a scalar placeholder re-roots only the
-        data; the scalar still flows through to the result."""
+        data; the scalar leaf keeps its identity and flows through to the
+        result (graph_replace preserves sibling leaves)."""
         X = pt.matrix("X", shape=(None, 1))
         hyp = pt.scalar("hyp")
         new = np.array([[0.0], [1.0], [2.0]])
         X_new = pt.matrix("X_new", shape=(None, 1))
-        out = Gaussian(hyp + 0.05 * X[:, 0] ** 2).clone_replace_data(X_new)
+        out = Gaussian(hyp + 0.05 * X[:, 0] ** 2, x=X).at(X_new)
         got = pytensor.function([X_new, hyp], out.sigma)(new, 2.0)
         np.testing.assert_allclose(got, 2.0 + 0.05 * new[:, 0] ** 2)
 
     def test_eager_numpy_parameter_is_not_rerooted(self):
-        """Pure-numpy arithmetic collapses the data dependence to a fixed vector
-        before pytensor sees it, leaving nothing to re-root."""
+        """Pure-numpy arithmetic collapses the data dependence to a fixed vector;
+        with no ``x=`` the likelihood carries no design matrix, so ``at`` is a
+        no-op and the value is preserved."""
         arr = np.linspace(-1.0, 1.0, 5)[:, None]
-        sigma = pt.as_tensor_variable(0.1 + 0.05 * arr[:, 0] ** 2)
-        out = Gaussian(sigma).clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
-        assert out.sigma is sigma
-
-    def test_mismatched_feature_count_is_not_rerooted(self):
+        expected = 0.1 + 0.05 * arr[:, 0] ** 2
+        lik = Gaussian(pt.as_tensor_variable(expected))
+        out = lik.at(pt.matrix("X_new", shape=(None, 1)))
+        assert out.sigma is lik.sigma  # at did nothing (no design matrix)
+        np.testing.assert_allclose(_eval(out.sigma), expected)
+
+    def test_no_x_means_not_rerooted(self):
+        """Omitting ``x=`` is an explicit opt-out: the likelihood carries no
+        design matrix, so ``at`` leaves the parameter untouched."""
         sigma = 0.1 + pt.sum(pt.matrix("Xtr", shape=(None, 3)) ** 2, axis=1)
-        out = Gaussian(sigma).clone_replace_data(pt.matrix("X_new", shape=(None, 1)))
-        assert out.sigma is sigma
-
-    def test_one_dimensional_X_is_rejected(self):
-        """A 1D X is indistinguishable from a per-observation parameter, so
-        re-rooting against it would risk replacing the parameter itself."""
+        lik = Gaussian(sigma)
+        out = lik.at(pt.matrix("X_new", shape=(None, 3)))
+        assert out.sigma is lik.sigma
+
+    def test_one_dimensional_X_new_is_supported(self):
+        """With an explicit, identity-keyed handle there is no parameter/data
+        ambiguity, so the replacement inputs need not be 2D — a 1D X_new
+        substitutes cleanly."""
+        X = pt.vector("X", shape=(None,))
+        new = np.array([0.0, 1.0, 2.0])
+        X_new = pt.vector("X_new", shape=(None,))
+        out = Gaussian(0.1 + 0.05 * X**2, x=X).at(X_new)
+        got = pytensor.function([X_new], out.sigma)(new)
+        np.testing.assert_allclose(got, 0.1 + 0.05 * new**2)
+
+
+class TestGradientThroughLikelihoodNode:
+    """The likelihood node is a pass-through, so gradients must flow through it
+    to the parameters' hyperparameters — for both homoskedastic (no design
+    matrix) and heteroskedastic nodes."""
+
+    def _ve_grad(self, lik, wrt, givens):
+        y = pt.as_tensor_variable(np.array([0.5, -0.3]))
+        mu = pt.as_tensor_variable(np.array([0.0, 0.2]))
+        var = pt.as_tensor_variable(np.array([1.0, 0.5]))
+        ve = pt.sum(lik.variational_expectation(y, mu, var))
+        g = pytensor.grad(ve, wrt)
+        return pytensor.function(list(givens), g)
+
+    def test_grad_homoskedastic(self):
+        alpha = pt.scalar("alpha")
+        lik = Gaussian(alpha)  # node, has_data=False
+        val = self._ve_grad(lik, alpha, [alpha])(0.7)
+        assert np.isfinite(val)
+
+    def test_grad_heteroskedastic(self):
         X = pt.matrix("X", shape=(None, 1))
-        lik = Gaussian(0.1 + 0.05 * X[:, 0] ** 2)
-        with pytest.raises(ValueError, match=r"requires a 2D design matrix"):
-            lik.clone_replace_data(pt.vector("X_new", shape=(None,)))
+        alpha = pt.scalar("alpha")
+        lik = Gaussian(alpha + 0.05 * X[:, 0] ** 2, x=X)  # node, has_data=True
+        fn = self._ve_grad(lik, alpha, [X, alpha])
+        val = fn(np.array([[0.0], [1.0]]), 0.7)
+        assert np.isfinite(val)
diff --git a/tests/likelihoods/test_functional_api.py b/tests/likelihoods/test_functional_api.py
new file mode 100644
index 0000000..9956580
--- /dev/null
+++ b/tests/likelihoods/test_functional_api.py
@@ -0,0 +1,104 @@
+"""The likelihood works in a purely functional way: a likelihood node carries
+its Op, design-matrix input, and parameters, so it can be used without the
+stateful holder at all. Everything dispatches through ``owner.op``."""
+
+import numpy as np
+import pymc as pm
+import pytensor
+import pytensor.tensor as pt
+
+from pytensor.graph.fg import FunctionGraph
+
+from ptgp.likelihoods import (
+    Gaussian,
+    op_of,
+    param,
+    predict_mean_and_var,
+    variational_expectation,
+    at,
+)
+from ptgp.likelihoods.gaussian import GaussianOp
+from pytensor.graph.replace import clone_replace
+
+
+def _eval(*tensors):
+    f = pytensor.function([], list(tensors) if len(tensors) > 1 else tensors[0])
+    return f()
+
+
+def test_likelihood_survives_clone_and_reroots_on_the_clone():
+    """Regression: the likelihood handle lives in the graph as a node, so cloning
+    a model graph carries it — we recover the likelihood from the *clone* and
+    re-root it onto new inputs. Only possible because it is a graph node, not
+    Python-side state on the holder."""
+    X = pt.matrix("X", shape=(None, 1))
+    lik = Gaussian(0.1 + 0.05 * X[:, 0] ** 2, x=X)  # holder; the handle is lik._lik
+    assert isinstance(op_of(lik._lik), GaussianOp)
+    X_new = pt.matrix("X_new", shape=(None, 1))
+    X_new_test = np.array([[0.0], [1.0], [2.0]])
+    mu = pt.as_tensor_variable(np.zeros(3))
+    var = pt.as_tensor_variable(np.ones(3))
+
+    _, orig_var = lik.at(X_new).predict_mean_and_var(mu, var)
+    orig_eval = orig_var.eval({X_new: X_new_test})
+
+    [cloned_handle] = clone_replace([lik._lik])
+    assert cloned_handle is not lik._lik
+    assert isinstance(op_of(cloned_handle), GaussianOp)
+    _, cloned_var = predict_mean_and_var(at(cloned_handle, X_new), mu, var)
+    cloned_eval = cloned_var.eval({X_new: X_new_test})
+
+    expected = 1.0 + (0.1 + 0.05 * X_new_test[:, 0] ** 2) ** 2  # var + sigma(X_new)**2
+    np.testing.assert_allclose(orig_eval, expected)
+    np.testing.assert_allclose(cloned_eval, expected)
+
+
+def test_node_built_without_holder_is_fully_usable():
+    """Build a likelihood node directly from the Op — no holder — and drive it
+    through the functional API: param access, re-rooting, and prediction all
+    come from ``owner.op``."""
+    X = pt.matrix("X", shape=(None, 1))
+    sigma = 0.1 + 0.05 * X[:, 0] ** 2
+    lik = GaussianOp(("sigma",), has_data=True)(X, sigma)  # a node, no Likelihood object
+
+    assert isinstance(op_of(lik), GaussianOp)
+    assert param(lik, "sigma") is sigma  # the parameter is the node's input
+
+    new = np.array([[0.0], [1.0], [2.0]])
+    rerooted = at(lik, pt.as_tensor_variable(new))
+    np.testing.assert_allclose(_eval(param(rerooted, "sigma")), 0.1 + 0.05 * new[:, 0] ** 2)
+
+
+def test_functional_matches_holder():
+    """The free functions and the holder methods are the same computation."""
+    X = pt.matrix("X", shape=(None, 1))
+    mu = pt.as_tensor_variable(np.array([0.0, 1.0]))
+    var = pt.as_tensor_variable(np.array([0.5, 0.2]))
+    new = pt.as_tensor_variable(np.array([[0.5], [1.5]]))
+
+    holder = Gaussian(0.1 + 0.05 * X[:, 0] ** 2, x=X)
+    node = holder._lik  # the likelihood handle (NoneType node); .sigma is the param
+
+    hm, hv = holder.at(new).predict_mean_and_var(mu, var)
+    fm, fv = predict_mean_and_var(at(node, new), mu, var)
+    np.testing.assert_allclose(_eval(hm), _eval(fm))
+    np.testing.assert_allclose(_eval(hv), _eval(fv))
+
+
+def test_functional_variational_expectation_preserves_rv_identity():
+    """The functional path is RV-safe: an RV hyperparameter in the parameter
+    survives re-rooting, so it can be evaluated inside a model context."""
+    with pm.Model():
+        alpha = pm.HalfNormal("alpha")
+        X = pt.matrix("X", shape=(None, 1))
+        lik = GaussianOp(("sigma",), has_data=True)(X, alpha + 0.05 * X[:, 0] ** 2)
+        new = pt.as_tensor_variable(np.array([[0.0], [2.0]]))
+        ve = variational_expectation(
+            at(lik, new),
+            pt.as_tensor_variable(np.array([0.0, 0.0])),
+            pt.as_tensor_variable(np.array([0.0, 0.0])),
+            pt.as_tensor_variable(np.array([1.0, 1.0])),
+        )
+    # alpha is still a free input of the re-rooted expectation graph
+    val = pytensor.function([alpha], ve)(0.3)
+    assert np.isfinite(val).all()
diff --git a/tests/likelihoods/test_negative_binomial.py b/tests/likelihoods/test_negative_binomial.py
index 854129e..4245c8b 100644
--- a/tests/likelihoods/test_negative_binomial.py
+++ b/tests/likelihoods/test_negative_binomial.py
@@ -37,8 +37,8 @@ def test_converges_to_poisson(self):
         # Use quadrature for both so comparison is apples-to-apples
         poisson_lik = Poisson(n_points=50)
         ve_poisson = _eval(
-            poisson_lik._gauss_hermite(
-                poisson_lik._log_prob,
+            poisson_lik._op._gauss_hermite(
+                poisson_lik._op._log_prob,
                 pt.as_tensor_variable(y),
                 pt.as_tensor_variable(mu),
                 pt.as_tensor_variable(var),
diff --git a/tests/likelihoods/test_poisson.py b/tests/likelihoods/test_poisson.py
index 6835f0a..22f56e0 100644
--- a/tests/likelihoods/test_poisson.py
+++ b/tests/likelihoods/test_poisson.py
@@ -23,11 +23,11 @@ def test_closed_form_matches_quadrature(self):
                 pt.as_tensor_variable(y), pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
             )
         )
-        # Use base class quadrature via _gauss_hermite directly
+        # Use base-op quadrature via _gauss_hermite directly
         lik = Poisson(n_points=50)
         ve_quad = _eval(
-            lik._gauss_hermite(
-                lik._log_prob,
+            lik._op._gauss_hermite(
+                lik._op._log_prob,
                 pt.as_tensor_variable(y),
                 pt.as_tensor_variable(mu),
                 pt.as_tensor_variable(var),
diff --git a/tests/test_heteroskedastic_predict.py b/tests/test_heteroskedastic_predict.py
index 75be4aa..229e586 100644
--- a/tests/test_heteroskedastic_predict.py
+++ b/tests/test_heteroskedastic_predict.py
@@ -33,7 +33,7 @@ def test_incl_lik_adds_pointwise_noise(self):
         X_train, y_train, X_new = _data()
         X = pt.matrix("X", shape=(None, 1))
         gp = Unapproximated(
-            kernel=ExpQuad(input_dim=1, ls=1.0), mean=Zero(), sigma=_hetero_sigma(X)
+            kernel=ExpQuad(input_dim=1, ls=1.0), mean=Zero(), sigma=_hetero_sigma(X), x=X
         )
         X_new_t = pt.matrix("X_new", shape=(None, 1))
         y_t = pt.vector("y", shape=(None,))
@@ -54,7 +54,7 @@ def _build(self, X):
         return VFE(
             kernel=ExpQuad(input_dim=1, ls=1.0),
             mean=Zero(),
-            sigma=_hetero_sigma(X),
+            sigma=_hetero_sigma(X), x=X,
             inducing_variable=Points(pt.as_tensor_variable(Z)),
         )
 
@@ -85,7 +85,7 @@ def test_incl_lik_adds_pointwise_noise(self):
         svgp = SVGP(
             kernel=ExpQuad(input_dim=1, ls=1.0),
             mean=Zero(),
-            likelihood=Gaussian(_hetero_sigma(X)),
+            likelihood=Gaussian(_hetero_sigma(X), x=X),
             inducing_variable=Points(pt.as_tensor_variable(Z)),
             variational_params=vp,
         )
@@ -149,10 +149,11 @@ def test_vfe_scalar(self):
 
 
 def test_pm_data_sigma_is_detected_as_data_dependent():
-    """sigma built against a pm.Data SharedVariable must still be treated
-    as data-dependent. The heuristic filters by type (TensorType), not by
-    class (SharedVariable) — without this, pm.Data is silently invisible
-    to clone_replace_data and predictions use the wrong noise.
+    """sigma built against a pm.Data SharedVariable must still be re-rooted.
+
+    The design matrix handle is keyed by identity (the ``x=`` argument), so a
+    pm.Data SharedVariable works exactly like a symbolic placeholder — passing
+    it as ``x=`` lets at swap it for the test inputs.
     """
     X_train_arr = np.linspace(-1.5, 1.5, 10)[:, None].astype(np.float64)
     X_new_arr = np.linspace(-2.0, 2.0, 5)[:, None].astype(np.float64)
@@ -162,7 +163,7 @@ class (SharedVariable) — without this, pm.Data is silently invisible
         gp = Unapproximated(
             kernel=ExpQuad(input_dim=1, ls=1.0),
             mean=Zero(),
-            sigma=_hetero_sigma(X),
+            sigma=_hetero_sigma(X), x=X,
         )
     X_new_t = pt.matrix("X_new", shape=(None, 1))
     y_t = pt.vector("y", shape=(None,))
@@ -186,7 +187,7 @@ def test_collapsed_elbo_with_heteroskedastic_sigma():
     vfe = VFE(
         kernel=ExpQuad(input_dim=1, ls=1.0),
         mean=Zero(),
-        sigma=_hetero_sigma(X),
+        sigma=_hetero_sigma(X), x=X,
         inducing_variable=Points(pt.as_tensor_variable(Z)),
     )
     elbo = collapsed_elbo(vfe, X, y).elbo

From 3c07b831c98efbc0f6a612f0614baef4250acaac Mon Sep 17 00:00:00 2001
From: Ricardo Vieira <ricardo.vieira1994@gmail.com>
Date: Fri, 29 May 2026 18:13:17 +0200
Subject: [PATCH 4/8] Now with custom Likelihood variables

---
 ptgp/likelihoods/__init__.py                |  12 +-
 ptgp/likelihoods/base.py                    | 206 ++++++++++----------
 ptgp/likelihoods/bernoulli.py               |  26 +--
 ptgp/likelihoods/gaussian.py                |  28 ++-
 ptgp/likelihoods/negative_binomial.py       |  25 ++-
 ptgp/likelihoods/poisson.py                 |  27 +--
 ptgp/likelihoods/student_t.py               |  22 +--
 tests/likelihoods/test_functional_api.py    |  27 ++-
 tests/likelihoods/test_negative_binomial.py |   5 +-
 tests/likelihoods/test_poisson.py           |   5 +-
 10 files changed, 183 insertions(+), 200 deletions(-)

diff --git a/ptgp/likelihoods/__init__.py b/ptgp/likelihoods/__init__.py
index a14b3c8..0b533f8 100644
--- a/ptgp/likelihoods/__init__.py
+++ b/ptgp/likelihoods/__init__.py
@@ -1,11 +1,13 @@
 from ptgp.likelihoods.base import (
-    Likelihood,
+    LikelihoodOp,
+    LikelihoodType,
+    LikelihoodVariable,
+    at,
     op_of,
     param,
     predict_log_density,
     predict_mean_and_var,
     variational_expectation,
-    at,
 )
 from ptgp.likelihoods.bernoulli import Bernoulli
 from ptgp.likelihoods.gaussian import Gaussian
@@ -14,12 +16,16 @@
 from ptgp.likelihoods.student_t import StudentT
 
 __all__ = [
-    "Likelihood",
+    # Family helpers — build a LikelihoodVariable.
     "Gaussian",
     "Bernoulli",
     "StudentT",
     "Poisson",
     "NegativeBinomial",
+    # Graph types.
+    "LikelihoodOp",
+    "LikelihoodType",
+    "LikelihoodVariable",
     # Purely functional API — operate on a likelihood node via ``owner.op``.
     "op_of",
     "param",
diff --git a/ptgp/likelihoods/base.py b/ptgp/likelihoods/base.py
index 8d7e4e1..1d88521 100644
--- a/ptgp/likelihoods/base.py
+++ b/ptgp/likelihoods/base.py
@@ -1,57 +1,111 @@
-import copy
-
 import numpy as np
 import pytensor.tensor as pt
 
-from pytensor.graph.basic import Apply
-from pytensor.graph.null_type import NullType
+from pytensor.graph.basic import Apply, Variable
 from pytensor.graph.op import Op
 from pytensor.graph.replace import graph_replace
+from pytensor.graph.type import Type
+
+
+class LikelihoodVariable(Variable):
+    """A likelihood that lives in the PyTensor graph *and* carries the accessor API.
+
+    It is the output of a :class:`LikelihoodOp` node, so ``owner.op`` recovers
+    the family (with behaviour) and ``clone`` / ``graph_replace`` carry it. It
+    also exposes the methods we used to put on a holder, each dispatched through
+    ``owner.op`` (the free functions below):
+
+    - parameter access — ``lik.sigma`` / ``lik.nu`` / ... (the node's inputs),
+    - :meth:`at` — re-root the parameters onto new design inputs,
+    - :meth:`variational_expectation` / :meth:`predict_mean_and_var` /
+      :meth:`predict_log_density`.
+    """
+
+    def at(self, X):
+        """Re-root onto design matrix ``X`` (no-op without a design matrix)."""
+        return at(self, X)
+
+    def variational_expectation(self, y, mu, var):
+        """E_{q(f)}[log p(y|f)] where q(f) = N(mu, var)."""
+        return variational_expectation(self, y, mu, var)
+
+    def predict_mean_and_var(self, mu, var):
+        """Predictive mean and variance under q(f) = N(mu, var)."""
+        return predict_mean_and_var(self, mu, var)
 
-_NULL = NullType()
+    def predict_log_density(self, y, mu, var):
+        """Predictive log-density log E_{q(f)}[p(y|f)] at test points."""
+        return predict_log_density(self, y, mu, var)
+
+    def __getattr__(self, name):
+        # Only fires for attributes not found normally. Expose the named
+        # parameters (node inputs) and a little Op config; everything else
+        # (incl. PyTensor internals) raises AttributeError as usual.
+        if name.startswith("_") or name in ("owner", "type", "index", "tag", "name", "auto_name"):
+            raise AttributeError(name)
+        owner = self.__dict__.get("owner")
+        if owner is None:
+            raise AttributeError(name)
+        op = owner.op
+        if name in op.param_names:
+            off = 1 if op.has_data else 0
+            return owner.inputs[off + op.param_names.index(name)]
+        if name in ("n_points", "invlink", "param_names", "has_data"):
+            return getattr(op, name)
+        raise AttributeError(name)
+
+
+class LikelihoodType(Type):
+    """Type of a likelihood handle. Carries no runtime value — the node it comes
+    from is a structural marker, never consumed by the loss or compiled."""
+
+    def filter(self, value, strict=False, allow_downcast=None):
+        return value
+
+    def make_variable(self, name=None):
+        return LikelihoodVariable(self, None, name=name)
+
+    def __eq__(self, other):
+        return type(self) is type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+
+_LIK_TYPE = LikelihoodType()
 
 
 class LikelihoodOp(Op):
     """Structural marker keeping a likelihood inside the PyTensor graph.
 
     The Apply node consumes the design matrix (optional; input 0 when present)
-    and the parameters as its *inputs*, and emits a single ``NoneType`` handle as
-    output. It computes nothing — its sole purpose is to be a real graph node, so
-    ``graph_replace`` / ``fgraph.clone`` / PyMC conditioning carry the likelihood
-    (and re-root its parameters) instead of it being Python-side state those
-    operations would silently drop. The Op *type* (the subclass) encodes the
-    family, so behaviour is recoverable via ``owner.op``; the parameters are the
-    node's inputs.
+    and the parameters as its *inputs*, and emits a single :class:`LikelihoodType`
+    handle (a :class:`LikelihoodVariable`) as output. It computes nothing — it is
+    a real graph node purely so ``graph_replace`` / ``fgraph.clone`` / PyMC
+    conditioning carry the likelihood (and re-root its parameters). The Op *type*
+    (the subclass) encodes the family; behaviour is recoverable via ``owner.op``.
 
     Because the parameters are *inputs* and the handle output is never consumed
     by the loss, the node is never on the differentiation path nor in a compiled
-    loss graph — so it needs no gradient, view, shape, or strip machinery. To
-    keep it alive across graph operations, hold its output (or register it as a
-    PyMC named variable).
-
-    Parameters
-    ----------
-    param_names : tuple of str
-        Parameter names, in input order (after the optional design matrix).
-    n_points : int
-        Gauss-Hermite quadrature points for the default expectations.
-    invlink : callable or None
-        Inverse link function (set by subclasses that need one).
-    has_data : bool
-        Whether input 0 is the design matrix ``X`` (i.e. parameters depend on it).
+    loss graph — no gradient, view, shape, or strip machinery is needed.
+
+    Subclasses set the class attributes ``param_names`` and (optionally)
+    ``default_invlink`` and implement the behaviour methods.
     """
 
     __props__ = ("param_names", "n_points", "invlink", "has_data")
+    param_names = ()
+    default_invlink = None
 
-    def __init__(self, param_names=(), n_points=20, invlink=None, has_data=False):
-        self.param_names = tuple(param_names)
+    def __init__(self, n_points=20, invlink=None, has_data=False):
+        self.param_names = type(self).param_names
         self.n_points = n_points
-        self.invlink = invlink
+        self.invlink = invlink if invlink is not None else type(self).default_invlink
         self.has_data = has_data
 
     def make_node(self, *args):
         args = [pt.as_tensor_variable(a) for a in args]
-        return Apply(self, args, [_NULL()])
+        return Apply(self, args, [_LIK_TYPE()])
 
     def perform(self, node, inputs, outputs):
         outputs[0][0] = None
@@ -118,13 +172,28 @@ def _gauss_hermite_logspace(self, func, y, mu, var):
         return pt.logsumexp(log_vals, axis=1)
 
 
+def build(op_cls, params, x=None, n_points=20, invlink=None):
+    """Build a :class:`LikelihoodVariable` from an Op class and ordered params.
+
+    ``params`` is the list of parameter graphs in ``op_cls.param_names`` order
+    (empty for parameterless likelihoods). ``x`` (the design matrix) is prepended
+    as input 0 when given, marking the parameters as data-dependent. Used by the
+    family helpers (:func:`~ptgp.likelihoods.Gaussian`, etc.).
+    """
+    has_data = bool(op_cls.param_names) and x is not None
+    op = op_cls(n_points=n_points, invlink=invlink, has_data=has_data)
+    args = [pt.as_tensor_variable(p) for p in params]
+    if has_data:
+        args = [pt.as_tensor_variable(x), *args]
+    return op(*args)
+
+
 # --- Purely functional API ------------------------------------------------
 #
-# A likelihood is fully described by its node: ``owner.op`` is the family (with
-# behaviour), and the node's *inputs* are the optional design matrix followed by
-# the parameters. These free functions take such a node (a "lik" — the NoneType
-# handle) and dispatch through ``owner.op``, so a likelihood can be used with no
-# holder at all. The :class:`Likelihood` holder is a thin view built on these.
+# Operate on a likelihood node (a :class:`LikelihoodVariable`) and dispatch
+# through ``owner.op``. The variable's methods are thin wrappers over these, so
+# either style works; the free functions are handy when a dummy node's outputs
+# are plain TensorVariables rather than a LikelihoodVariable.
 
 
 def _params_of(node):
@@ -148,9 +217,9 @@ def param(lik, name):
 def at(lik, X):
     """Re-root a likelihood node onto design matrix ``X``.
 
-    A no-op for a likelihood with no design matrix (``op.has_data`` is False) —
-    there is no ``X`` input to swap. Otherwise returns a new handle whose
-    parameter inputs are re-expressed over ``X`` (RV-safe via ``graph_replace``).
+    A no-op for a likelihood with no design matrix (``op.has_data`` is False).
+    Otherwise returns a new handle whose parameter inputs are re-expressed over
+    ``X`` (RV-safe via ``graph_replace``).
     """
     node = lik.owner
     if not node.op.has_data:
@@ -171,64 +240,3 @@ def predict_mean_and_var(lik, mu, var):
 def predict_log_density(lik, y, mu, var):
     """Predictive log-density for a likelihood node."""
     return lik.owner.op.predict_log_density(_params_of(lik.owner), y, mu, var)
-
-
-def _param_property(name):
-    """Expose a named parameter as ``self.<name>`` (the parameter input)."""
-    return property(lambda self: self._params[type(self).param_names.index(name)])
-
-
-class Likelihood:
-    """Public likelihood object — a thin view over a :class:`LikelihoodOp` node.
-
-    Every likelihood is a node: construction builds ``op(x?, *params)`` whose
-    single ``NoneType`` output (``self._lik``) is the handle, and whose inputs
-    are the optional design matrix followed by the parameter graphs. The handle
-    keeps the likelihood in the PyTensor graph (carried by ``graph_replace`` /
-    ``clone``); the methods read the parameters off the node's inputs and
-    dispatch behaviour through ``owner.op`` — the same calls the free functions
-    above make.
-
-    The node is never consumed by the loss (the loss uses the parameter graphs
-    directly, via ``.sigma`` etc.), so it stays off the gradient path and out of
-    compiled graphs. :meth:`at` re-roots the parameters when there's a design
-    matrix; it is a no-op otherwise (homoskedastic or parameterless).
-
-    Subclasses set ``op_cls``, ``param_names``, and expose each parameter via
-    :func:`_param_property`.
-    """
-
-    op_cls = LikelihoodOp
-    param_names = ()
-    n_points = 20
-
-    def __init__(self, x=None, n_points=20, invlink=None, **params):
-        self.n_points = n_points
-        self.invlink = invlink
-        has_data = bool(self.param_names) and x is not None
-        self._op = self.op_cls(self.param_names, n_points, invlink, has_data=has_data)
-        ordered = [pt.as_tensor_variable(params[name]) for name in self.param_names]
-        args = ([pt.as_tensor_variable(x)] if has_data else []) + ordered
-        self._lik = self._op(*args)  # NoneType handle — every likelihood is a node
-
-    @property
-    def _params(self):
-        return _params_of(self._lik.owner)
-
-    def at(self, X):
-        """Return a copy with parameters re-rooted onto ``X`` (no-op without a design matrix)."""
-        new = copy.copy(self)
-        new._lik = at(self._lik, X)
-        return new
-
-    def variational_expectation(self, y, mu, var):
-        """E_{q(f)}[log p(y|f)] where q(f) = N(mu, var)."""
-        return self._op.variational_expectation(self._params, y, mu, var)
-
-    def predict_mean_and_var(self, mu, var):
-        """Predictive mean and variance under q(f) = N(mu, var)."""
-        return self._op.predict_mean_and_var(self._params, mu, var)
-
-    def predict_log_density(self, y, mu, var):
-        """Predictive log-density log E_{q(f)}[p(y|f)] at test points."""
-        return self._op.predict_log_density(self._params, y, mu, var)
diff --git a/ptgp/likelihoods/bernoulli.py b/ptgp/likelihoods/bernoulli.py
index 0c0b6f8..0aec3e2 100644
--- a/ptgp/likelihoods/bernoulli.py
+++ b/ptgp/likelihoods/bernoulli.py
@@ -1,6 +1,6 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood, LikelihoodOp
+from ptgp.likelihoods.base import LikelihoodOp, build
 
 
 def inv_probit(x):
@@ -12,6 +12,9 @@ def inv_probit(x):
 class BernoulliOp(LikelihoodOp):
     """Bernoulli likelihood Op. Closed-form predictive for the probit link."""
 
+    param_names = ()
+    default_invlink = staticmethod(inv_probit)
+
     def _log_prob(self, f, y):
         p = self.invlink(f)
         return y * pt.log(p) + (1.0 - y) * pt.log(1.0 - p)
@@ -30,21 +33,10 @@ def predict_mean_and_var(self, params, mu, var):
         return super().predict_mean_and_var(params, mu, var)
 
 
-class Bernoulli(Likelihood):
-    """Bernoulli likelihood: p(y=1|f) = invlink(f).
-
-    Default link is probit. Variational expectation via Gauss-Hermite quadrature.
+def Bernoulli(invlink=None, n_points=20):
+    """Build a Bernoulli likelihood p(y=1|f) = invlink(f).
 
-    Parameters
-    ----------
-    invlink : callable, optional
-        Inverse link function (default: probit). Use ``pt.sigmoid`` for logit link.
-    n_points : int
-        Number of Gauss-Hermite quadrature points (default 20).
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Default link is
+    probit (closed-form predictive); pass ``invlink=pt.sigmoid`` for logit.
     """
-
-    op_cls = BernoulliOp
-    param_names = ()
-
-    def __init__(self, invlink=None, n_points=20):
-        super().__init__(n_points=n_points, invlink=invlink or inv_probit)
+    return build(BernoulliOp, [], n_points=n_points, invlink=invlink)
diff --git a/ptgp/likelihoods/gaussian.py b/ptgp/likelihoods/gaussian.py
index 3e0119f..136f75d 100644
--- a/ptgp/likelihoods/gaussian.py
+++ b/ptgp/likelihoods/gaussian.py
@@ -1,6 +1,6 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood, LikelihoodOp, _param_property
+from ptgp.likelihoods.base import LikelihoodOp, build
 
 LOG2PI = pt.log(2.0 * pt.pi)
 
@@ -8,6 +8,8 @@
 class GaussianOp(LikelihoodOp):
     """Gaussian likelihood Op: closed-form expectations (no quadrature)."""
 
+    param_names = ("sigma",)
+
     def _log_prob(self, f, y, sigma):
         return -0.5 * (LOG2PI + pt.log(sigma**2) + (y - f) ** 2 / sigma**2)
 
@@ -31,25 +33,19 @@ def predict_log_density(self, params, y, mu, var):
         return -0.5 * (LOG2PI + pt.log(total_var) + (y - mu) ** 2 / total_var)
 
 
-class Gaussian(Likelihood):
-    """Gaussian likelihood p(y|f) = N(y; f, sigma^2).
+def Gaussian(sigma, x=None):
+    """Build a Gaussian likelihood p(y|f) = N(y; f, sigma^2).
 
-    All methods have closed-form expressions (no quadrature needed).
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable` — a graph node
+    exposing ``.sigma``, ``.at``, ``.predict_mean_and_var``, etc.
 
     Parameters
     ----------
     sigma : tensor
-        Observation noise standard deviation. May be a scalar (homoskedastic)
-        or a vector built against a design matrix ``X`` (heteroskedastic).
+        Observation noise standard deviation. Scalar (homoskedastic) or a vector
+        built against a design matrix ``X`` (heteroskedastic).
     x : tensor, optional
-        The design matrix ``sigma`` was built against. Pass it when ``sigma`` is
-        heteroskedastic so :meth:`Likelihood.at` can re-root
-        sigma onto the test inputs at predict time.
+        The design matrix ``sigma`` was built against; pass it for
+        heteroskedastic noise so ``.at`` can re-root sigma onto test inputs.
     """
-
-    op_cls = GaussianOp
-    param_names = ("sigma",)
-    sigma = _param_property("sigma")
-
-    def __init__(self, sigma, x=None):
-        super().__init__(x=x, sigma=sigma)
+    return build(GaussianOp, [sigma], x=x)
diff --git a/ptgp/likelihoods/negative_binomial.py b/ptgp/likelihoods/negative_binomial.py
index 90433fe..2c91c18 100644
--- a/ptgp/likelihoods/negative_binomial.py
+++ b/ptgp/likelihoods/negative_binomial.py
@@ -1,11 +1,14 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood, LikelihoodOp, _param_property
+from ptgp.likelihoods.base import LikelihoodOp, build
 
 
 class NegativeBinomialOp(LikelihoodOp):
     """Negative binomial likelihood Op. Expectations via quadrature."""
 
+    param_names = ("alpha",)
+    default_invlink = staticmethod(pt.exp)
+
     def _log_prob(self, f, y, alpha):
         mu = self.invlink(f)
         return (
@@ -24,12 +27,12 @@ def _conditional_variance(self, f, alpha):
         return mu + mu**2 / alpha
 
 
-class NegativeBinomial(Likelihood):
-    """Negative binomial likelihood: p(y|f) = NB(y; invlink(f), alpha).
+def NegativeBinomial(alpha, invlink=None, n_points=20, x=None):
+    """Build a negative binomial likelihood NB(y; invlink(f), alpha).
 
-    Parameterized as NB(y; mu, alpha) where mu = invlink(f) and alpha is the
-    overdispersion parameter.  Variance is mu + mu^2 / alpha.
-    Default link is log (invlink=exp).
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Variance is
+    ``mu + mu**2 / alpha`` with ``mu = invlink(f)``; default link is log
+    (``invlink=exp``).
 
     Parameters
     ----------
@@ -41,12 +44,6 @@ class NegativeBinomial(Likelihood):
         Number of Gauss-Hermite quadrature points (default 20).
     x : tensor, optional
         The design matrix ``alpha`` was built against, for a heteroskedastic
-        parameter re-rooted onto the test inputs at predict time.
+        parameter re-rooted onto test inputs via ``.at``.
     """
-
-    op_cls = NegativeBinomialOp
-    param_names = ("alpha",)
-    alpha = _param_property("alpha")
-
-    def __init__(self, alpha, invlink=None, n_points=20, x=None):
-        super().__init__(x=x, n_points=n_points, invlink=invlink or pt.exp, alpha=alpha)
+    return build(NegativeBinomialOp, [alpha], x=x, n_points=n_points, invlink=invlink)
diff --git a/ptgp/likelihoods/poisson.py b/ptgp/likelihoods/poisson.py
index b6d970c..d898b37 100644
--- a/ptgp/likelihoods/poisson.py
+++ b/ptgp/likelihoods/poisson.py
@@ -1,11 +1,14 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood, LikelihoodOp
+from ptgp.likelihoods.base import LikelihoodOp, build
 
 
 class PoissonOp(LikelihoodOp):
     """Poisson likelihood Op. Closed-form variational expectation for log link."""
 
+    param_names = ()
+    default_invlink = staticmethod(pt.exp)
+
     def _log_prob(self, f, y):
         lam = self.invlink(f)
         return y * pt.log(lam) - lam - pt.gammaln(y + 1.0)
@@ -22,22 +25,10 @@ def variational_expectation(self, params, y, mu, var):
         return super().variational_expectation(params, y, mu, var)
 
 
-class Poisson(Likelihood):
-    """Poisson likelihood: p(y|f) = Poisson(y; invlink(f)).
-
-    Default link is log (invlink=exp). Has a closed-form variational
-    expectation with the log link; falls back to quadrature for other links.
+def Poisson(invlink=None, n_points=20):
+    """Build a Poisson likelihood p(y|f) = Poisson(y; invlink(f)).
 
-    Parameters
-    ----------
-    invlink : callable, optional
-        Inverse link function (default: exp).
-    n_points : int
-        Number of Gauss-Hermite quadrature points (default 20).
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Default link is
+    log (closed-form variational expectation); other links fall back to quadrature.
     """
-
-    op_cls = PoissonOp
-    param_names = ()
-
-    def __init__(self, invlink=None, n_points=20):
-        super().__init__(n_points=n_points, invlink=invlink or pt.exp)
+    return build(PoissonOp, [], n_points=n_points, invlink=invlink)
diff --git a/ptgp/likelihoods/student_t.py b/ptgp/likelihoods/student_t.py
index 29145c1..e9c05e8 100644
--- a/ptgp/likelihoods/student_t.py
+++ b/ptgp/likelihoods/student_t.py
@@ -1,11 +1,13 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import Likelihood, LikelihoodOp, _param_property
+from ptgp.likelihoods.base import LikelihoodOp, build
 
 
 class StudentTOp(LikelihoodOp):
     """Student-T likelihood Op. Expectations via Gauss-Hermite quadrature."""
 
+    param_names = ("nu", "sigma")
+
     def _log_prob(self, f, y, nu, sigma):
         z = (y - f) / sigma
         return (
@@ -22,10 +24,11 @@ def _conditional_variance(self, f, nu, sigma):
         return pt.ones_like(f) * sigma**2 * nu / (nu - 2.0)
 
 
-class StudentT(Likelihood):
-    """Student-T likelihood p(y|f) = StudentT(y; f, sigma, nu).
+def StudentT(nu, sigma, n_points=20, x=None):
+    """Build a Student-T likelihood p(y|f) = StudentT(y; f, sigma, nu).
 
-    Variational expectation via Gauss-Hermite quadrature.
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Variational
+    expectation via Gauss-Hermite quadrature.
 
     Parameters
     ----------
@@ -37,13 +40,6 @@ class StudentT(Likelihood):
         Number of Gauss-Hermite quadrature points (default 20).
     x : tensor, optional
         The design matrix ``nu``/``sigma`` were built against, for
-        heteroskedastic parameters re-rooted onto the test inputs at predict.
+        heteroskedastic parameters re-rooted onto test inputs via ``.at``.
     """
-
-    op_cls = StudentTOp
-    param_names = ("nu", "sigma")
-    nu = _param_property("nu")
-    sigma = _param_property("sigma")
-
-    def __init__(self, nu, sigma, n_points=20, x=None):
-        super().__init__(x=x, n_points=n_points, nu=nu, sigma=sigma)
+    return build(StudentTOp, [nu, sigma], x=x, n_points=n_points)
diff --git a/tests/likelihoods/test_functional_api.py b/tests/likelihoods/test_functional_api.py
index 9956580..b9d8a6e 100644
--- a/tests/likelihoods/test_functional_api.py
+++ b/tests/likelihoods/test_functional_api.py
@@ -27,13 +27,9 @@ def _eval(*tensors):
 
 
 def test_likelihood_survives_clone_and_reroots_on_the_clone():
-    """Regression: the likelihood handle lives in the graph as a node, so cloning
-    a model graph carries it — we recover the likelihood from the *clone* and
-    re-root it onto new inputs. Only possible because it is a graph node, not
-    Python-side state on the holder."""
     X = pt.matrix("X", shape=(None, 1))
-    lik = Gaussian(0.1 + 0.05 * X[:, 0] ** 2, x=X)  # holder; the handle is lik._lik
-    assert isinstance(op_of(lik._lik), GaussianOp)
+    lik = Gaussian(0.1 + 0.05 * X[:, 0] ** 2, x=X)
+    assert isinstance(lik.owner.op, GaussianOp)
     X_new = pt.matrix("X_new", shape=(None, 1))
     X_new_test = np.array([[0.0], [1.0], [2.0]])
     mu = pt.as_tensor_variable(np.zeros(3))
@@ -42,10 +38,10 @@ def test_likelihood_survives_clone_and_reroots_on_the_clone():
     _, orig_var = lik.at(X_new).predict_mean_and_var(mu, var)
     orig_eval = orig_var.eval({X_new: X_new_test})
 
-    [cloned_handle] = clone_replace([lik._lik])
-    assert cloned_handle is not lik._lik
-    assert isinstance(op_of(cloned_handle), GaussianOp)
-    _, cloned_var = predict_mean_and_var(at(cloned_handle, X_new), mu, var)
+    [cloned_lik] = clone_replace([lik])
+    assert cloned_lik is not lik
+    assert isinstance(cloned_lik.owner.op, GaussianOp)
+    _, cloned_var = predict_mean_and_var(at(cloned_lik, X_new), mu, var)
     cloned_eval = cloned_var.eval({X_new: X_new_test})
 
     expected = 1.0 + (0.1 + 0.05 * X_new_test[:, 0] ** 2) ** 2  # var + sigma(X_new)**2
@@ -59,7 +55,7 @@ def test_node_built_without_holder_is_fully_usable():
     come from ``owner.op``."""
     X = pt.matrix("X", shape=(None, 1))
     sigma = 0.1 + 0.05 * X[:, 0] ** 2
-    lik = GaussianOp(("sigma",), has_data=True)(X, sigma)  # a node, no Likelihood object
+    lik = GaussianOp(has_data=True)(X, sigma)  # a node, via the Op directly
 
     assert isinstance(op_of(lik), GaussianOp)
     assert param(lik, "sigma") is sigma  # the parameter is the node's input
@@ -76,11 +72,10 @@ def test_functional_matches_holder():
     var = pt.as_tensor_variable(np.array([0.5, 0.2]))
     new = pt.as_tensor_variable(np.array([[0.5], [1.5]]))
 
-    holder = Gaussian(0.1 + 0.05 * X[:, 0] ** 2, x=X)
-    node = holder._lik  # the likelihood handle (NoneType node); .sigma is the param
+    lik = Gaussian(0.1 + 0.05 * X[:, 0] ** 2, x=X)  # a LikelihoodVariable
 
-    hm, hv = holder.at(new).predict_mean_and_var(mu, var)
-    fm, fv = predict_mean_and_var(at(node, new), mu, var)
+    hm, hv = lik.at(new).predict_mean_and_var(mu, var)  # method API
+    fm, fv = predict_mean_and_var(at(lik, new), mu, var)  # functional API
     np.testing.assert_allclose(_eval(hm), _eval(fm))
     np.testing.assert_allclose(_eval(hv), _eval(fv))
 
@@ -91,7 +86,7 @@ def test_functional_variational_expectation_preserves_rv_identity():
     with pm.Model():
         alpha = pm.HalfNormal("alpha")
         X = pt.matrix("X", shape=(None, 1))
-        lik = GaussianOp(("sigma",), has_data=True)(X, alpha + 0.05 * X[:, 0] ** 2)
+        lik = GaussianOp(has_data=True)(X, alpha + 0.05 * X[:, 0] ** 2)
         new = pt.as_tensor_variable(np.array([[0.0], [2.0]]))
         ve = variational_expectation(
             at(lik, new),
diff --git a/tests/likelihoods/test_negative_binomial.py b/tests/likelihoods/test_negative_binomial.py
index 4245c8b..3bf37ab 100644
--- a/tests/likelihoods/test_negative_binomial.py
+++ b/tests/likelihoods/test_negative_binomial.py
@@ -36,9 +36,10 @@ def test_converges_to_poisson(self):
 
         # Use quadrature for both so comparison is apples-to-apples
         poisson_lik = Poisson(n_points=50)
+        pop = poisson_lik.owner.op
         ve_poisson = _eval(
-            poisson_lik._op._gauss_hermite(
-                poisson_lik._op._log_prob,
+            pop._gauss_hermite(
+                pop._log_prob,
                 pt.as_tensor_variable(y),
                 pt.as_tensor_variable(mu),
                 pt.as_tensor_variable(var),
diff --git a/tests/likelihoods/test_poisson.py b/tests/likelihoods/test_poisson.py
index 22f56e0..534a019 100644
--- a/tests/likelihoods/test_poisson.py
+++ b/tests/likelihoods/test_poisson.py
@@ -25,9 +25,10 @@ def test_closed_form_matches_quadrature(self):
         )
         # Use base-op quadrature via _gauss_hermite directly
         lik = Poisson(n_points=50)
+        op = lik.owner.op
         ve_quad = _eval(
-            lik._op._gauss_hermite(
-                lik._op._log_prob,
+            op._gauss_hermite(
+                op._log_prob,
                 pt.as_tensor_variable(y),
                 pt.as_tensor_variable(mu),
                 pt.as_tensor_variable(var),

From f86f6a0d15aa2cbcc23bb16a1cfe70039045e68e Mon Sep 17 00:00:00 2001
From: Ricardo Vieira <ricardo.vieira1994@gmail.com>
Date: Fri, 29 May 2026 19:35:16 +0200
Subject: [PATCH 5/8] GP data barrier

---
 ptgp/likelihoods/__init__.py             |  4 ++
 ptgp/likelihoods/base.py                 | 73 ++++++++++++++++++++++--
 tests/likelihoods/test_functional_api.py | 25 ++++++++
 3 files changed, 96 insertions(+), 6 deletions(-)

diff --git a/ptgp/likelihoods/__init__.py b/ptgp/likelihoods/__init__.py
index 0b533f8..2f0cc9e 100644
--- a/ptgp/likelihoods/__init__.py
+++ b/ptgp/likelihoods/__init__.py
@@ -1,8 +1,10 @@
 from ptgp.likelihoods.base import (
+    GPData,
     LikelihoodOp,
     LikelihoodType,
     LikelihoodVariable,
     at,
+    gp_data,
     op_of,
     param,
     predict_log_density,
@@ -26,6 +28,8 @@
     "LikelihoodOp",
     "LikelihoodType",
     "LikelihoodVariable",
+    "GPData",
+    "gp_data",
     # Purely functional API — operate on a likelihood node via ``owner.op``.
     "op_of",
     "param",
diff --git a/ptgp/likelihoods/base.py b/ptgp/likelihoods/base.py
index 1d88521..46272db 100644
--- a/ptgp/likelihoods/base.py
+++ b/ptgp/likelihoods/base.py
@@ -4,7 +4,9 @@
 from pytensor.graph.basic import Apply, Variable
 from pytensor.graph.op import Op
 from pytensor.graph.replace import graph_replace
+from pytensor.graph.rewriting.basic import node_rewriter
 from pytensor.graph.type import Type
+from pytensor.tensor.rewriting.basic import register_specialize
 
 
 class LikelihoodVariable(Variable):
@@ -172,20 +174,79 @@ def _gauss_hermite_logspace(self, func, y, mu, var):
         return pt.logsumexp(log_vals, axis=1)
 
 
+class GPData(Op):
+    """Opaque identity barrier marking the design matrix in a parameter graph.
+
+    The design matrix is routed through this Op before it reaches the
+    parameters (``sigma = f(gp_data(X))``), which keeps a *stable* handle to it:
+    rewrites cannot see through the barrier, so an algebraic cancellation
+    (e.g. ``exp(log(x)) -> x``) cannot dissolve the path from the parameter back
+    to ``X`` and sever the handle. It is also the likelihood node's design-matrix
+    input (input 0), so it is structurally findable and clone-safe.
+
+    It carries no value of its own (identity passthrough) and is stripped at the
+    *specialize* stage (:func:`_strip_gp_data`) — i.e. only when building the
+    final executable, after re-rooting, never during the canonicalization passes
+    a model graph might go through while the handle still matters.
+    """
+
+    __props__ = ()
+    view_map = {0: [0]}
+
+    def make_node(self, x):
+        x = pt.as_tensor_variable(x)
+        return Apply(self, [x], [x.type()])
+
+    def perform(self, node, inputs, outputs):
+        outputs[0][0] = inputs[0]
+
+    def infer_shape(self, fgraph, node, input_shapes):
+        return input_shapes
+
+    def pullback(self, inputs, outputs, cotangents):
+        return list(cotangents)  # identity
+
+
+_gp_data_op = GPData()
+
+
+def gp_data(x):
+    """Wrap the design matrix ``x`` in an opaque :class:`GPData` barrier."""
+    return _gp_data_op(pt.as_tensor_variable(x))
+
+
+@register_specialize
+@node_rewriter([GPData])
+def _strip_gp_data(fgraph, node):
+    """Elide the design-matrix barrier when building the final executable.
+
+    Registered at the *specialize* stage (not canonicalize), so it does not fire
+    during the canonicalization passes the barrier is meant to survive — it only
+    drops the identity Op once all graph transforms (incl. re-rooting) are done.
+    """
+    return [node.inputs[0]]
+
+
 def build(op_cls, params, x=None, n_points=20, invlink=None):
     """Build a :class:`LikelihoodVariable` from an Op class and ordered params.
 
     ``params`` is the list of parameter graphs in ``op_cls.param_names`` order
-    (empty for parameterless likelihoods). ``x`` (the design matrix) is prepended
-    as input 0 when given, marking the parameters as data-dependent. Used by the
-    family helpers (:func:`~ptgp.likelihoods.Gaussian`, etc.).
+    (empty for parameterless likelihoods). When a design matrix ``x`` is given,
+    it is wrapped in a :class:`GPData` barrier, the parameters are re-expressed
+    over that barrier (so it sits between ``X`` and the parameters), and the
+    barrier becomes the node's input 0 — the data demarcator. Used by the family
+    helpers (:func:`~ptgp.likelihoods.Gaussian`, etc.).
     """
     has_data = bool(op_cls.param_names) and x is not None
     op = op_cls(n_points=n_points, invlink=invlink, has_data=has_data)
-    args = [pt.as_tensor_variable(p) for p in params]
+    params = [pt.as_tensor_variable(p) for p in params]
     if has_data:
-        args = [pt.as_tensor_variable(x), *args]
-    return op(*args)
+        x = pt.as_tensor_variable(x)
+        xd = gp_data(x)
+        # Route the parameters through the barrier so it sits between X and them.
+        params = list(graph_replace(params, {x: xd}, strict=False))
+        params = [xd, *params]
+    return op(*params)
 
 
 # --- Purely functional API ------------------------------------------------
diff --git a/tests/likelihoods/test_functional_api.py b/tests/likelihoods/test_functional_api.py
index b9d8a6e..8a364be 100644
--- a/tests/likelihoods/test_functional_api.py
+++ b/tests/likelihoods/test_functional_api.py
@@ -97,3 +97,28 @@ def test_functional_variational_expectation_preserves_rv_identity():
     # alpha is still a free input of the re-rooted expectation graph
     val = pytensor.function([alpha], ve)(0.3)
     assert np.isfinite(val).all()
+
+
+def test_design_matrix_handle_survives_rewrite():
+    """Regression — the "don't lose the handle" case: when the design matrix is an
+    intermediate node (X = log(x)) and a parameter exps it (sigma = exp(X)), an
+    algebraic rewrite would cancel exp(log(x)) -> x and sever our handle to X. The
+    design matrix is routed through an opaque gp_data barrier, so canonicalization
+    cannot see through it: X survives the rewrite and re-rooting still reaches it.
+    """
+    from pytensor.graph.rewriting.utils import rewrite_graph
+    from pytensor.graph.traversal import ancestors
+
+    x = pt.matrix("x", shape=(None, 1))
+    X = pt.log(x)  # design matrix is an intermediate node, not a leaf
+    lik = Gaussian(pt.exp(X[:, 0]), x=X)  # sigma = exp(log(x)) — would cancel without a barrier
+    sigma = lik.owner.inputs[1]  # the parameter, routed through the gp_data barrier
+
+    rewritten = rewrite_graph(sigma, include=("canonicalize",))
+    # The barrier blocked the exp(log(x)) -> x cancellation, so X (=log(x)) survives.
+    assert X in set(ancestors([rewritten]))
+
+    # And re-rooting the (held) likelihood still reaches the design matrix.
+    X_new = pt.matrix("X_new", shape=(None, 1))
+    got = lik.at(X_new).sigma.eval({X_new: np.array([[2.0], [3.0]])})
+    np.testing.assert_allclose(got, np.exp([2.0, 3.0]))

From 20c226e5878dba7df207b4816de01af05a7e5572 Mon Sep 17 00:00:00 2001
From: jessegrabowski <jessegrabowski@gmail.com>
Date: Fri, 29 May 2026 16:29:57 -0500
Subject: [PATCH 6/8] Remove section-divider comments from test modules

---
 tests/test_rewrites.py       | 21 ---------------------
 tests/test_svgp_scipy_ref.py |  9 ---------
 tests/test_utils.py          | 14 --------------
 3 files changed, 44 deletions(-)

diff --git a/tests/test_rewrites.py b/tests/test_rewrites.py
index 345dcc1..0035258 100644
--- a/tests/test_rewrites.py
+++ b/tests/test_rewrites.py
@@ -25,10 +25,6 @@
 # Install the rewrites under test (side-effect import).
 import ptgp.rewrites  # noqa: F401
 
-# ---------------------------------------------------------------------------
-# Helpers for the structural rewrites below.
-# ---------------------------------------------------------------------------
-
 
 def _has_op(graph, op_type):
     fg = FunctionGraph(outputs=[graph] if not isinstance(graph, list) else graph, clone=False)
@@ -40,11 +36,6 @@ def _has_op(graph, op_type):
     return False
 
 
-# ---------------------------------------------------------------------------
-# Det(L @ L.T) -> (prod(diag(L)))**2 rewrite (Rule H).
-# ---------------------------------------------------------------------------
-
-
 def test_det_of_LLT_takes_diag_product_when_L_lower_triangular():
     """``Det(L @ L.T)`` lowers to ``(prod(diag(L)))**2`` — no Det Apply remains."""
     from pytensor.tensor.linalg.summary import det
@@ -70,11 +61,6 @@ def test_det_of_LLT_diag_product_is_numerically_correct():
     np.testing.assert_allclose(f(L_val), expected, atol=1e-10)
 
 
-# ---------------------------------------------------------------------------
-# ExtractDiag(A @ A.T) -> sum(A**2, axis=-1) rewrite (Rule I).
-# ---------------------------------------------------------------------------
-
-
 def test_diag_of_AAT_to_row_norms_squared_eliminates_dot():
     """``ExtractDiag(A @ A.T)`` lowers to ``sum(A**2, axis=-1)`` — no ExtractDiag."""
     A = pt.dmatrix("A")
@@ -111,11 +97,6 @@ def test_diag_of_ATA_is_numerically_correct():
     np.testing.assert_allclose(f(A_val), np.diag(A_val.T @ A_val), atol=1e-12)
 
 
-# ---------------------------------------------------------------------------
-# MatrixInverse(PSD A) -> cho_solve(L, eye) rewrite (Rule J).
-# ---------------------------------------------------------------------------
-
-
 def test_matrix_inverse_of_LTL_uses_solve_triangular():
     """``MatrixInverse(L.T @ L)`` lowers to two solve_triangular calls — no fresh Cholesky."""
     L = pta.assume(pt.dmatrix("L"), lower_triangular=True)
@@ -181,12 +162,10 @@ def test_matrix_inverse_lowering_is_numerically_correct():
     np.testing.assert_allclose(f(K_val), np.linalg.inv(K_val), atol=1e-10)
 
 
-# ---------------------------------------------------------------------------
 # Joint-graph cubic-op floor pinning (was: Blocker A / merge_composites_*).
 # These pin the invariant the deleted composite-merge passes were claimed to
 # enforce; they pass without those passes because pytensor 3.0.3's fusion
 # pipeline already arrives at the floor.
-# ---------------------------------------------------------------------------
 
 
 def _gp_mll_like_joint():
diff --git a/tests/test_svgp_scipy_ref.py b/tests/test_svgp_scipy_ref.py
index 3c8cb58..52167c6 100644
--- a/tests/test_svgp_scipy_ref.py
+++ b/tests/test_svgp_scipy_ref.py
@@ -32,9 +32,6 @@
 ATOL = 1e-5
 
 
-# ---- Reference ELBO machinery (numpy + scipy, no PTGP internals) ----------
-
-
 def _matern52_numpy(X1, X2, ls, eta):
     """Matern52 kernel in numpy: k(r) = eta^2 (1 + sqrt(5)r + 5r^2/3) exp(-sqrt(5)r)."""
     sqd = np.sum(X1**2, axis=-1)[:, None] + np.sum(X2**2, axis=-1)[None, :] - 2.0 * X1 @ X2.T
@@ -98,9 +95,6 @@ def _fixed_config(rng, N=40, M=8, x_range=(-2.0, 2.0)):
     return X, Z, q_mu, L
 
 
-# ---- StudentT -------------------------------------------------------------
-
-
 def _studentt_logprob(nu, sigma):
     """Return a numpy log_prob(f, y) for StudentT(y; f, sigma, nu)."""
 
@@ -237,9 +231,6 @@ def test_elbo_match(self):
         np.testing.assert_allclose(e_ptgp, e_ref, atol=ATOL)
 
 
-# ---- NegativeBinomial -----------------------------------------------------
-
-
 def _negbinom_logprob(alpha):
     """Return a numpy log_prob(f, y) for NegativeBinomial(y; exp(f), alpha)."""
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index a549a62..f2696cf 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,10 +8,6 @@
 
 from ptgp.utils import _build_index_labels, check_init
 
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
 
 def _make_simple_model_and_objective():
     """1-D GP on synthetic data; returns (fun, theta0, X, y, model)."""
@@ -42,11 +38,6 @@ def _make_simple_model_and_objective():
     return fun, theta0, X, y, model
 
 
-# ---------------------------------------------------------------------------
-# check_init tests
-# ---------------------------------------------------------------------------
-
-
 class TestCheckInit:
     def test_returns_true_on_valid_init(self):
         fun, theta0, X, y, model = _make_simple_model_and_objective()
@@ -119,11 +110,6 @@ def test_top_k_capped_at_theta_size(self, capsys):
         assert len(table_lines) == theta0.size
 
 
-# ---------------------------------------------------------------------------
-# _build_index_labels unit tests
-# ---------------------------------------------------------------------------
-
-
 class TestBuildIndexLabels:
     def test_returns_none_without_model(self):
         result = _build_index_labels(5, model=None, extra_vars=None, extra_init=None)

From 503dabd97017f473796b655af8e04ea63bd7c819 Mon Sep 17 00:00:00 2001
From: jessegrabowski <jessegrabowski@gmail.com>
Date: Fri, 29 May 2026 16:32:08 -0500
Subject: [PATCH 7/8] Rework likelihood inverse links and node construction

---
 ptgp/likelihoods/base.py              | 98 +++++++++++++++------------
 ptgp/likelihoods/bernoulli.py         | 34 +++++-----
 ptgp/likelihoods/gaussian.py          |  5 +-
 ptgp/likelihoods/negative_binomial.py | 22 +++---
 ptgp/likelihoods/poisson.py           | 30 +++++---
 ptgp/likelihoods/student_t.py         |  7 +-
 tests/likelihoods/test_base.py        | 29 ++++----
 7 files changed, 122 insertions(+), 103 deletions(-)

diff --git a/ptgp/likelihoods/base.py b/ptgp/likelihoods/base.py
index 46272db..39776de 100644
--- a/ptgp/likelihoods/base.py
+++ b/ptgp/likelihoods/base.py
@@ -9,6 +9,26 @@
 from pytensor.tensor.rewriting.basic import register_specialize
 
 
+def inv_probit(x):
+    """Probit inverse link Phi(x), clamped to (jitter, 1 - jitter)."""
+    jitter = 1e-3
+    return 0.5 * (1.0 + pt.erf(x / pt.sqrt(2.0))) * (1.0 - 2.0 * jitter) + jitter
+
+
+def inv_cloglog(x):
+    """Complementary log-log inverse link, clamped to (jitter, 1 - jitter)."""
+    jitter = 1e-3
+    return (1.0 - pt.exp(-pt.exp(x))) * (1.0 - 2.0 * jitter) + jitter
+
+
+LINKS = {
+    "log": pt.exp,
+    "logit": pt.sigmoid,
+    "probit": inv_probit,
+    "cloglog": inv_cloglog,
+}
+
+
 class LikelihoodVariable(Variable):
     """A likelihood that lives in the PyTensor graph *and* carries the accessor API.
 
@@ -40,9 +60,8 @@ def predict_log_density(self, y, mu, var):
         return predict_log_density(self, y, mu, var)
 
     def __getattr__(self, name):
-        # Only fires for attributes not found normally. Expose the named
-        # parameters (node inputs) and a little Op config; everything else
-        # (incl. PyTensor internals) raises AttributeError as usual.
+        # The guard keeps __getattr__ from recursing on its own attribute access
+        # and lets dunders and PyTensor internals fall through as AttributeError.
         if name.startswith("_") or name in ("owner", "type", "index", "tag", "name", "auto_name"):
             raise AttributeError(name)
         owner = self.__dict__.get("owner")
@@ -50,9 +69,8 @@ def __getattr__(self, name):
             raise AttributeError(name)
         op = owner.op
         if name in op.param_names:
-            off = 1 if op.has_data else 0
-            return owner.inputs[off + op.param_names.index(name)]
-        if name in ("n_points", "invlink", "param_names", "has_data"):
+            return param(self, name)
+        if name in ("n_points", "link", "param_names", "has_data"):
             return getattr(op, name)
         raise AttributeError(name)
 
@@ -91,20 +109,34 @@ class LikelihoodOp(Op):
     by the loss, the node is never on the differentiation path nor in a compiled
     loss graph — no gradient, view, shape, or strip machinery is needed.
 
-    Subclasses set the class attributes ``param_names`` and (optionally)
-    ``default_invlink`` and implement the behaviour methods.
+    Subclasses set the class attributes ``param_names`` and ``allowed_links``
+    and implement the behaviour methods.
     """
 
-    __props__ = ("param_names", "n_points", "invlink", "has_data")
+    __props__ = ("param_names", "n_points", "link", "has_data")
     param_names = ()
-    default_invlink = None
+    allowed_links = ()
 
-    def __init__(self, n_points=20, invlink=None, has_data=False):
+    def __init__(self, n_points=20, link=None, has_data=False):
         self.param_names = type(self).param_names
         self.n_points = n_points
-        self.invlink = invlink if invlink is not None else type(self).default_invlink
+        self.link = link if link is not None else self.default_link
+        if self.link is not None and self.link not in self.allowed_links:
+            raise ValueError(
+                f"{type(self).__name__} does not support link {self.link!r}; "
+                f"choose from {self.allowed_links}."
+            )
         self.has_data = has_data
 
+    @property
+    def default_link(self):
+        """Default link name: the first of ``allowed_links`` (``None`` if empty)."""
+        return self.allowed_links[0] if self.allowed_links else None
+
+    def _invlink(self, f):
+        """Map the latent ``f`` to the natural parameter via the inverse link."""
+        return LINKS[self.link](f)
+
     def make_node(self, *args):
         args = [pt.as_tensor_variable(a) for a in args]
         return Apply(self, args, [_LIK_TYPE()])
@@ -112,8 +144,6 @@ def make_node(self, *args):
     def perform(self, node, inputs, outputs):
         outputs[0][0] = None
 
-    # --- symbolic behaviour: subclasses implement these three ---
-
     def _log_prob(self, f, y, *params):
         """Symbolic log p(y|f). Subclasses must implement."""
         raise NotImplementedError
@@ -126,8 +156,6 @@ def _conditional_variance(self, f, *params):
         """Var[y|f]. Subclasses must implement."""
         raise NotImplementedError
 
-    # --- expectations (defaults via quadrature; subclasses may override) ---
-
     def variational_expectation(self, params, y, mu, var):
         """E_{q(f)}[log p(y|f)] where q(f) = N(mu, var). Default: quadrature."""
         return self._gauss_hermite(lambda f, yy: self._log_prob(f, yy, *params), y, mu, var)
@@ -227,34 +255,19 @@ def _strip_gp_data(fgraph, node):
     return [node.inputs[0]]
 
 
-def build(op_cls, params, x=None, n_points=20, invlink=None):
-    """Build a :class:`LikelihoodVariable` from an Op class and ordered params.
+def to_inputs(params, x):
+    """Node inputs for ``params`` and an optional design matrix ``x``.
 
-    ``params`` is the list of parameter graphs in ``op_cls.param_names`` order
-    (empty for parameterless likelihoods). When a design matrix ``x`` is given,
-    it is wrapped in a :class:`GPData` barrier, the parameters are re-expressed
-    over that barrier (so it sits between ``X`` and the parameters), and the
-    barrier becomes the node's input 0 — the data demarcator. Used by the family
-    helpers (:func:`~ptgp.likelihoods.Gaussian`, etc.).
+    Without ``x`` the inputs are just the parameters. With ``x``, it is wrapped in
+    a :class:`GPData` barrier and the parameters are re-expressed over it, so the
+    barrier becomes input 0 — the data demarcator that :func:`at` re-roots.
     """
-    has_data = bool(op_cls.param_names) and x is not None
-    op = op_cls(n_points=n_points, invlink=invlink, has_data=has_data)
     params = [pt.as_tensor_variable(p) for p in params]
-    if has_data:
-        x = pt.as_tensor_variable(x)
-        xd = gp_data(x)
-        # Route the parameters through the barrier so it sits between X and them.
-        params = list(graph_replace(params, {x: xd}, strict=False))
-        params = [xd, *params]
-    return op(*params)
-
-
-# --- Purely functional API ------------------------------------------------
-#
-# Operate on a likelihood node (a :class:`LikelihoodVariable`) and dispatch
-# through ``owner.op``. The variable's methods are thin wrappers over these, so
-# either style works; the free functions are handy when a dummy node's outputs
-# are plain TensorVariables rather than a LikelihoodVariable.
+    if x is None:
+        return params
+    x = pt.as_tensor_variable(x)
+    xd = gp_data(x)
+    return [xd, *graph_replace(params, {x: xd}, strict=False)]
 
 
 def _params_of(node):
@@ -271,8 +284,7 @@ def op_of(lik):
 def param(lik, name):
     """The named parameter input of a likelihood node (e.g. ``"sigma"``)."""
     node = lik.owner
-    off = 1 if node.op.has_data else 0
-    return node.inputs[off + node.op.param_names.index(name)]
+    return _params_of(node)[node.op.param_names.index(name)]
 
 
 def at(lik, X):
diff --git a/ptgp/likelihoods/bernoulli.py b/ptgp/likelihoods/bernoulli.py
index 0aec3e2..4a3deb1 100644
--- a/ptgp/likelihoods/bernoulli.py
+++ b/ptgp/likelihoods/bernoulli.py
@@ -1,42 +1,44 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import LikelihoodOp, build
-
-
-def inv_probit(x):
-    """Probit link: Phi(x) = 0.5 * (1 + erf(x / sqrt(2))), clamped to (jitter, 1-jitter)."""
-    jitter = 1e-3
-    return 0.5 * (1.0 + pt.erf(x / pt.sqrt(2.0))) * (1.0 - 2.0 * jitter) + jitter
+from ptgp.likelihoods.base import LikelihoodOp, inv_probit
 
 
 class BernoulliOp(LikelihoodOp):
     """Bernoulli likelihood Op. Closed-form predictive for the probit link."""
 
     param_names = ()
-    default_invlink = staticmethod(inv_probit)
+    allowed_links = ("probit", "logit", "cloglog")
 
     def _log_prob(self, f, y):
-        p = self.invlink(f)
+        p = self._invlink(f)
         return y * pt.log(p) + (1.0 - y) * pt.log(1.0 - p)
 
     def _conditional_mean(self, f):
-        return self.invlink(f)
+        return self._invlink(f)
 
     def _conditional_variance(self, f):
-        p = self.invlink(f)
+        p = self._invlink(f)
         return p * (1.0 - p)
 
     def predict_mean_and_var(self, params, mu, var):
-        if self.invlink is inv_probit:
+        if self.link == "probit":
             p = inv_probit(mu / pt.sqrt(1.0 + var))
             return p, p - p**2
         return super().predict_mean_and_var(params, mu, var)
 
 
-def Bernoulli(invlink=None, n_points=20):
+def Bernoulli(link=None, n_points=20):
     """Build a Bernoulli likelihood p(y=1|f) = invlink(f).
 
-    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Default link is
-    probit (closed-form predictive); pass ``invlink=pt.sigmoid`` for logit.
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. The probit link
+    (default) has a closed-form predictive; other links fall back to quadrature.
+
+    Parameters
+    ----------
+    link : str, optional
+        Inverse link name, one of ``"probit"`` (default), ``"logit"``, or
+        ``"cloglog"``.
+    n_points : int
+        Number of Gauss-Hermite quadrature points (default 20).
     """
-    return build(BernoulliOp, [], n_points=n_points, invlink=invlink)
+    return BernoulliOp(n_points=n_points, link=link)()
diff --git a/ptgp/likelihoods/gaussian.py b/ptgp/likelihoods/gaussian.py
index 136f75d..6b74818 100644
--- a/ptgp/likelihoods/gaussian.py
+++ b/ptgp/likelihoods/gaussian.py
@@ -1,6 +1,6 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import LikelihoodOp, build
+from ptgp.likelihoods.base import LikelihoodOp, to_inputs
 
 LOG2PI = pt.log(2.0 * pt.pi)
 
@@ -48,4 +48,5 @@ def Gaussian(sigma, x=None):
         The design matrix ``sigma`` was built against; pass it for
         heteroskedastic noise so ``.at`` can re-root sigma onto test inputs.
     """
-    return build(GaussianOp, [sigma], x=x)
+    op = GaussianOp(has_data=x is not None)
+    return op(*to_inputs([sigma], x))
diff --git a/ptgp/likelihoods/negative_binomial.py b/ptgp/likelihoods/negative_binomial.py
index 2c91c18..8cfad3e 100644
--- a/ptgp/likelihoods/negative_binomial.py
+++ b/ptgp/likelihoods/negative_binomial.py
@@ -1,16 +1,16 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import LikelihoodOp, build
+from ptgp.likelihoods.base import LikelihoodOp, to_inputs
 
 
 class NegativeBinomialOp(LikelihoodOp):
     """Negative binomial likelihood Op. Expectations via quadrature."""
 
     param_names = ("alpha",)
-    default_invlink = staticmethod(pt.exp)
+    allowed_links = ("log", "cloglog")
 
     def _log_prob(self, f, y, alpha):
-        mu = self.invlink(f)
+        mu = self._invlink(f)
         return (
             pt.gammaln(y + alpha)
             - pt.gammaln(alpha)
@@ -20,30 +20,30 @@ def _log_prob(self, f, y, alpha):
         )
 
     def _conditional_mean(self, f, alpha):
-        return self.invlink(f)
+        return self._invlink(f)
 
     def _conditional_variance(self, f, alpha):
-        mu = self.invlink(f)
+        mu = self._invlink(f)
         return mu + mu**2 / alpha
 
 
-def NegativeBinomial(alpha, invlink=None, n_points=20, x=None):
+def NegativeBinomial(alpha, link=None, n_points=20, x=None):
     """Build a negative binomial likelihood NB(y; invlink(f), alpha).
 
     Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Variance is
-    ``mu + mu**2 / alpha`` with ``mu = invlink(f)``; default link is log
-    (``invlink=exp``).
+    ``mu + mu**2 / alpha`` with ``mu = invlink(f)``; all links use quadrature.
 
     Parameters
     ----------
     alpha : tensor or PyMC random variable
         Overdispersion parameter.
-    invlink : callable, optional
-        Inverse link function (default: exp).
+    link : str, optional
+        Inverse link name, one of ``"log"`` (default) or ``"cloglog"``.
     n_points : int
         Number of Gauss-Hermite quadrature points (default 20).
     x : tensor, optional
         The design matrix ``alpha`` was built against, for a heteroskedastic
         parameter re-rooted onto test inputs via ``.at``.
     """
-    return build(NegativeBinomialOp, [alpha], x=x, n_points=n_points, invlink=invlink)
+    op = NegativeBinomialOp(n_points=n_points, link=link, has_data=x is not None)
+    return op(*to_inputs([alpha], x))
diff --git a/ptgp/likelihoods/poisson.py b/ptgp/likelihoods/poisson.py
index d898b37..5466754 100644
--- a/ptgp/likelihoods/poisson.py
+++ b/ptgp/likelihoods/poisson.py
@@ -1,34 +1,42 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import LikelihoodOp, build
+from ptgp.likelihoods.base import LikelihoodOp
 
 
 class PoissonOp(LikelihoodOp):
     """Poisson likelihood Op. Closed-form variational expectation for log link."""
 
     param_names = ()
-    default_invlink = staticmethod(pt.exp)
+    allowed_links = ("log",)
 
     def _log_prob(self, f, y):
-        lam = self.invlink(f)
+        lam = self._invlink(f)
         return y * pt.log(lam) - lam - pt.gammaln(y + 1.0)
 
     def _conditional_mean(self, f):
-        return self.invlink(f)
+        return self._invlink(f)
 
     def _conditional_variance(self, f):
-        return self.invlink(f)
+        return self._invlink(f)
 
     def variational_expectation(self, params, y, mu, var):
-        if self.invlink is pt.exp:
+        if self.link == "log":
             return y * mu - pt.exp(mu + var / 2.0) - pt.gammaln(y + 1.0)
         return super().variational_expectation(params, y, mu, var)
 
 
-def Poisson(invlink=None, n_points=20):
-    """Build a Poisson likelihood p(y|f) = Poisson(y; invlink(f)).
+def Poisson(link=None, n_points=20):
+    """Build a Poisson likelihood p(y|f) with rate ``invlink(f)``.
 
-    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Default link is
-    log (closed-form variational expectation); other links fall back to quadrature.
+    Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. The log link
+    (the default and only supported link) has a closed-form variational
+    expectation.
+
+    Parameters
+    ----------
+    link : str, optional
+        Inverse link name; only ``"log"`` is supported (the default).
+    n_points : int
+        Number of Gauss-Hermite quadrature points (default 20).
     """
-    return build(PoissonOp, [], n_points=n_points, invlink=invlink)
+    return PoissonOp(n_points=n_points, link=link)()
diff --git a/ptgp/likelihoods/student_t.py b/ptgp/likelihoods/student_t.py
index e9c05e8..7f4974c 100644
--- a/ptgp/likelihoods/student_t.py
+++ b/ptgp/likelihoods/student_t.py
@@ -1,6 +1,6 @@
 import pytensor.tensor as pt
 
-from ptgp.likelihoods.base import LikelihoodOp, build
+from ptgp.likelihoods.base import LikelihoodOp, to_inputs
 
 
 class StudentTOp(LikelihoodOp):
@@ -25,7 +25,7 @@ def _conditional_variance(self, f, nu, sigma):
 
 
 def StudentT(nu, sigma, n_points=20, x=None):
-    """Build a Student-T likelihood p(y|f) = StudentT(y; f, sigma, nu).
+    """Build a Student-T likelihood p(y|f) with scale sigma and degrees of freedom nu.
 
     Returns a :class:`~ptgp.likelihoods.base.LikelihoodVariable`. Variational
     expectation via Gauss-Hermite quadrature.
@@ -42,4 +42,5 @@ def StudentT(nu, sigma, n_points=20, x=None):
         The design matrix ``nu``/``sigma`` were built against, for
         heteroskedastic parameters re-rooted onto test inputs via ``.at``.
     """
-    return build(StudentTOp, [nu, sigma], x=x, n_points=n_points)
+    op = StudentTOp(n_points=n_points, has_data=x is not None)
+    return op(*to_inputs([nu, sigma], x))
diff --git a/tests/likelihoods/test_base.py b/tests/likelihoods/test_base.py
index 3bca5cb..2d1d7e7 100644
--- a/tests/likelihoods/test_base.py
+++ b/tests/likelihoods/test_base.py
@@ -38,7 +38,7 @@ def test_bernoulli_logit_link(self):
             )
         )
         ve_logit = _eval(
-            Bernoulli(invlink=pt.sigmoid).variational_expectation(
+            Bernoulli(link="logit").variational_expectation(
                 pt.as_tensor_variable(y), pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
             )
         )
@@ -46,20 +46,19 @@ def test_bernoulli_logit_link(self):
         assert np.all(ve_probit < 0) and np.all(ve_logit < 0)
         assert not np.allclose(ve_probit, ve_logit)
 
-    def test_poisson_custom_link_uses_quadrature(self):
-        """Poisson with non-exp link should fall back to quadrature and still work."""
-        mu, var = np.array([1.0]), np.array([0.1])
-        y = np.array([2.0])
-
-        def softplus(f):
-            return pt.log1p(pt.exp(f))
-
+    def test_bernoulli_cloglog_link_uses_quadrature(self):
+        """cloglog has no closed-form predictive, so it routes through quadrature."""
+        mu, var, y = np.array([0.0, 1.0]), np.array([0.5, 0.5]), np.array([1.0, 0.0])
         ve = _eval(
-            Poisson(invlink=softplus).variational_expectation(
+            Bernoulli(link="cloglog").variational_expectation(
                 pt.as_tensor_variable(y), pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
             )
         )
-        assert np.isfinite(ve).all()
+        assert np.all(ve < 0)
+
+    def test_unsupported_link_raises(self):
+        with pytest.raises(ValueError, match="does not support link"):
+            Poisson(link="logit")
 
 
 class TestCloneReplaceData:
@@ -103,9 +102,7 @@ def test_reroot_matches_multidim_input(self):
     def test_reroots_all_tensor_params(self):
         X = pt.matrix("X", shape=(None, 1))
         new = np.array([[1.0], [2.0], [3.0]])
-        out = StudentT(nu=3.0 + X[:, 0] ** 2, sigma=_hetero(X), x=X).at(
-            pt.as_tensor_variable(new)
-        )
+        out = StudentT(nu=3.0 + X[:, 0] ** 2, sigma=_hetero(X), x=X).at(pt.as_tensor_variable(new))
         nu_val, sigma_val = _eval(out.nu, out.sigma)
         np.testing.assert_allclose(nu_val, 3.0 + new[:, 0] ** 2)
         np.testing.assert_allclose(sigma_val, _hetero(new))
@@ -113,9 +110,7 @@ def test_reroots_all_tensor_params(self):
     def test_negative_binomial_alpha(self):
         X = pt.matrix("X", shape=(None, 1))
         new = np.array([[0.0], [2.0]])
-        out = NegativeBinomial(alpha=_hetero(X), x=X).at(
-            pt.as_tensor_variable(new)
-        )
+        out = NegativeBinomial(alpha=_hetero(X), x=X).at(pt.as_tensor_variable(new))
         np.testing.assert_allclose(_eval(out.alpha), _hetero(new))
 
     def test_leaves_scalar_params_untouched(self):

From 096cf55ec295de3ef11ba6de0d0d914e5d675c73 Mon Sep 17 00:00:00 2001
From: jessegrabowski <jessegrabowski@gmail.com>
Date: Fri, 29 May 2026 16:32:13 -0500
Subject: [PATCH 8/8] Expand likelihood test coverage

---
 tests/likelihoods/test_bernoulli.py         | 20 +++++++++-----
 tests/likelihoods/test_gaussian.py          | 10 +++----
 tests/likelihoods/test_negative_binomial.py | 13 +++++++++
 tests/likelihoods/test_poisson.py           | 20 ++++++++++++++
 tests/likelihoods/test_student_t.py         | 30 +++++++++++++++++++++
 5 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/tests/likelihoods/test_bernoulli.py b/tests/likelihoods/test_bernoulli.py
index ca75fa8..9ffa2e5 100644
--- a/tests/likelihoods/test_bernoulli.py
+++ b/tests/likelihoods/test_bernoulli.py
@@ -9,6 +9,7 @@
 from gpjax.likelihoods import Bernoulli as GPJaxBernoulli
 
 from ptgp.likelihoods import Bernoulli
+from ptgp.likelihoods.base import LikelihoodOp
 
 ATOL = 1e-5
 
@@ -42,13 +43,20 @@ def test_ve_against_gpjax(self):
 
         np.testing.assert_allclose(ve, gpjax_ve, atol=ATOL)
 
-    def test_predict_mean_and_var_closed_form(self):
-        mu, var = np.array([0.0, 2.0, -2.0]), np.array([0.1, 0.5, 1.0])
-        pm_val, pv_val = _eval(
-            *Bernoulli().predict_mean_and_var(pt.as_tensor_variable(mu), pt.as_tensor_variable(var))
+    def test_probit_closed_form_predict_matches_quadrature(self):
+        """The probit closed-form predictive should agree with the base quadrature
+        path it overrides — a correctness check, not just bounds."""
+        mu, var = np.array([0.0, 0.7, -1.2]), np.array([0.2, 0.5, 1.0])
+        op = Bernoulli(n_points=64).owner.op
+        m_closed, v_closed = op.predict_mean_and_var(
+            [], pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
+        )
+        m_quad, v_quad = LikelihoodOp.predict_mean_and_var(
+            op, [], pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
         )
-        assert np.all(pm_val >= 0.0) and np.all(pm_val <= 1.0)
-        assert np.all(pv_val >= 0.0) and np.all(pv_val <= 0.25)
+        mc, vc, mq, vq = _eval(m_closed, v_closed, m_quad, v_quad)
+        np.testing.assert_allclose(mc, mq, atol=1e-4)
+        np.testing.assert_allclose(vc, vq, atol=1e-4)
 
     def test_ve_negative(self):
         mu, var = np.array([0.0, 2.0, -2.0]), np.array([0.1, 0.5, 1.0])
diff --git a/tests/likelihoods/test_gaussian.py b/tests/likelihoods/test_gaussian.py
index 3593739..24f4b11 100644
--- a/tests/likelihoods/test_gaussian.py
+++ b/tests/likelihoods/test_gaussian.py
@@ -43,18 +43,18 @@ def test_ve_against_gpjax(self):
 
         np.testing.assert_allclose(ve, gpjax_ve, atol=ATOL)
 
-    def test_zero_var_matches_log_prob(self):
+    def test_zero_var_collapses_to_log_likelihood(self):
+        """At zero latent variance the VE reduces to the pointwise log N(y; mu, sigma^2)."""
         mu, y, sigma = np.array([0.0, 1.0]), np.array([0.1, 0.9]), 0.3
-        lik = Gaussian(sigma)
         ve = _eval(
-            lik.variational_expectation(
+            Gaussian(sigma).variational_expectation(
                 pt.as_tensor_variable(y),
                 pt.as_tensor_variable(mu),
                 pt.as_tensor_variable(np.zeros(2)),
             )
         )
-        lp = _eval(lik._log_prob(pt.as_tensor_variable(mu), pt.as_tensor_variable(y)))
-        np.testing.assert_allclose(ve, lp, atol=1e-12)
+        expected = -0.5 * (np.log(2 * np.pi * sigma**2) + (y - mu) ** 2 / sigma**2)
+        np.testing.assert_allclose(ve, expected, atol=1e-12)
 
     def test_predict_mean_and_var(self):
         mu, var, sigma = np.array([1.0, 2.0]), np.array([0.5, 1.0]), 0.3
diff --git a/tests/likelihoods/test_negative_binomial.py b/tests/likelihoods/test_negative_binomial.py
index 3bf37ab..61d79ca 100644
--- a/tests/likelihoods/test_negative_binomial.py
+++ b/tests/likelihoods/test_negative_binomial.py
@@ -29,6 +29,19 @@ def test_quadrature_convergence(self):
         )
         np.testing.assert_allclose(ve_20, ve_50, atol=1e-6)
 
+    def test_predict_variance_includes_overdispersion(self):
+        """At zero latent variance the predictive variance is mu + mu**2 / alpha,
+        exceeding the Poisson mean-equals-variance."""
+        mu, var, alpha = np.array([0.5, 1.0]), np.zeros(2), 3.0
+        lam = np.exp(mu)
+        pm, pv = _eval(
+            *NegativeBinomial(alpha=alpha).predict_mean_and_var(
+                pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
+            )
+        )
+        np.testing.assert_allclose(pm, lam, atol=1e-10)
+        np.testing.assert_allclose(pv, lam + lam**2 / alpha, atol=1e-10)
+
     def test_converges_to_poisson(self):
         """NB with large alpha should approach Poisson."""
         mu, var = np.array([0.5, 1.0]), np.array([0.1, 0.3])
diff --git a/tests/likelihoods/test_poisson.py b/tests/likelihoods/test_poisson.py
index 534a019..1988c38 100644
--- a/tests/likelihoods/test_poisson.py
+++ b/tests/likelihoods/test_poisson.py
@@ -4,6 +4,8 @@
 import pytensor
 import pytensor.tensor as pt
 
+from scipy.special import gammaln
+
 from ptgp.likelihoods import Poisson
 
 
@@ -47,3 +49,21 @@ def test_ve_values(self):
             )
         )
         np.testing.assert_allclose(ve, np.array([-1.0]), atol=1e-12)
+
+    def test_quadrature_predict_collapses_at_zero_variance(self):
+        """At zero latent variance the base quadrature predictives reduce to the
+        conditional moments and the pointwise log-density at f = mu."""
+        mu, y, var = np.array([0.5, 1.0]), np.array([1.0, 3.0]), np.zeros(2)
+        lam = np.exp(mu)
+        lik = Poisson()
+        pm, pv = _eval(
+            *lik.predict_mean_and_var(pt.as_tensor_variable(mu), pt.as_tensor_variable(var))
+        )
+        pld = _eval(
+            lik.predict_log_density(
+                pt.as_tensor_variable(y), pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
+            )
+        )
+        np.testing.assert_allclose(pm, lam, atol=1e-10)
+        np.testing.assert_allclose(pv, lam, atol=1e-10)
+        np.testing.assert_allclose(pld, y * mu - lam - gammaln(y + 1.0), atol=1e-10)
diff --git a/tests/likelihoods/test_student_t.py b/tests/likelihoods/test_student_t.py
index 2a47a8e..3417257 100644
--- a/tests/likelihoods/test_student_t.py
+++ b/tests/likelihoods/test_student_t.py
@@ -43,3 +43,33 @@ def test_quadrature_convergence(self):
             )
         )
         np.testing.assert_allclose(ve_20, ve_50, atol=1e-6)
+
+    def test_predict_variance_uses_studentt_formula(self):
+        """At zero latent variance the predictive variance is the Student-T
+        variance sigma**2 * nu / (nu - 2)."""
+        mu, var = np.array([0.5, -1.0]), np.zeros(2)
+        nu, sigma = 5.0, 0.7
+        pm, pv = _eval(
+            *StudentT(nu=nu, sigma=sigma).predict_mean_and_var(
+                pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
+            )
+        )
+        np.testing.assert_allclose(pm, mu, atol=1e-6)
+        np.testing.assert_allclose(pv, sigma**2 * nu / (nu - 2.0), atol=1e-6)
+
+    def test_heavier_tails_than_gaussian_for_outlier(self):
+        """A far-from-mean observation is penalized less under Student-T than
+        Gaussian at matched scale — the heavy-tail behavior."""
+        mu, var, y = np.array([0.0]), np.array([0.1]), np.array([10.0])
+        sigma = 1.0
+        ve_studentt = _eval(
+            StudentT(nu=3.0, sigma=sigma).variational_expectation(
+                pt.as_tensor_variable(y), pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
+            )
+        )
+        ve_gaussian = _eval(
+            Gaussian(sigma).variational_expectation(
+                pt.as_tensor_variable(y), pt.as_tensor_variable(mu), pt.as_tensor_variable(var)
+            )
+        )
+        assert ve_studentt > ve_gaussian