From 5380b0daf2d097dd4c62827c7bda39ac0978ca8f Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 13:09:45 +0200
Subject: [PATCH 01/18] add mala sampler

---
 pypesto/sample/__init__.py            |   1 +
 pypesto/sample/adaptive_metropolis.py |   7 ++
 pypesto/sample/mala.py                | 139 ++++++++++++++++++++++++++
 pypesto/sample/metropolis.py          |  15 +++
 test/sample/test_sample.py            |  23 +++--
 test/sample/util.py                   |   4 +-
 6 files changed, 181 insertions(+), 8 deletions(-)
 create mode 100644 pypesto/sample/mala.py

diff --git a/pypesto/sample/__init__.py b/pypesto/sample/__init__.py
index 96529a319..c34539196 100644
--- a/pypesto/sample/__init__.py
+++ b/pypesto/sample/__init__.py
@@ -17,6 +17,7 @@
     laplace_approximation_log_evidence,
     parallel_tempering_log_evidence,
 )
+from .mala import Mala
 from .metropolis import MetropolisSampler
 from .parallel_tempering import ParallelTemperingSampler
 from .sample import sample
diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index c0b9e13dd..9901088bd 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -62,6 +62,7 @@ def __init__(self, options: dict = None):
         self._mean_hist = None
         self._cov_hist = None
         self._cov_scale = None
+        self._cov_chol = None
 
     @classmethod
     def default_options(cls):
@@ -86,6 +87,8 @@ def default_options(cls):
             "target_acceptance_rate": 0.234,
             # show progress
             "show_progress": None,
+            # compute cholesky decomposition of the covariance matrix
+            "compute_cholesky": False,
         }
 
     def initialize(self, problem: Problem, x0: np.ndarray):
@@ -102,6 +105,8 @@ def initialize(self, problem: Problem, x0: np.ndarray):
         self._mean_hist = self.trace_x[-1]
         self._cov_hist = self._cov
         self._cov_scale = 1.0
+        if self.options["compute_cholesky"]:
+            self._cov_chol = np.linalg.cholesky(self._cov)
 
     def _propose_parameter(self, x: np.ndarray):
         x_new = np.random.multivariate_normal(x, self._cov)
@@ -137,6 +142,8 @@ def _update_proposal(
 
         # regularize proposal covariance
         self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor)
+        if self.options["compute_cholesky"]:
+            self._cov_chol = np.linalg.cholesky(self._cov)
 
 
 def update_history_statistics(
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
new file mode 100644
index 000000000..cb4583a9f
--- /dev/null
+++ b/pypesto/sample/mala.py
@@ -0,0 +1,139 @@
+import numpy as np
+
+from ..problem import Problem
+from .adaptive_metropolis import AdaptiveMetropolisSampler
+
+
+class Mala(AdaptiveMetropolisSampler):
+    """Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.
+
+    MALA is a gradient-based MCMC method that uses the gradient of the
+    log-posterior to guide the proposal distribution. This allows for
+    more efficient exploration of the parameter space compared to
+    standard random-walk Metropolis-Hastings.
+
+    The proposal distribution is:
+    x_new = x + (step_size / 2) * M * grad_log_p(x) + sqrt(step_size) * sqrt(M) * noise
+
+    where grad_log_p(x) is the gradient of the log-posterior at x,
+    M is the preconditioning matrix, step_size is the discretization step size,
+    and noise is standard Gaussian noise.
+
+    Preconditioning can significantly improve convergence for poorly conditioned
+    problems by rescaling the parameter space. Here, we use an adaptive covariance
+    matrix as the preconditioning matrix M, which is updated during sampling.
+    We aim to converge to a fixed target acceptance rate of 0.574, as suggested
+    by theoretical results for MALA.
+
+    For reference, see:
+    * Roberts et al. 1996.
+      Exponential convergence of Langevin distributions and their
+      discrete approximations
+      (https://doi.org/10.2307/3318418)
+    * Girolami & Calderhead 2011.
+      Riemann manifold Langevin and Hamiltonian Monte Carlo methods
+      (https://doi.org/10.1111/j.1467-9868.2010.00765.x)
+    """
+
+    def __init__(self, options: dict = None):
+        super().__init__(options)
+
+    @classmethod
+    def default_options(cls):
+        """Return the default options for the sampler."""
+        return {
+            # step size for the Langevin dynamics
+            "step_size": 0.01,
+            # controls adaptation degeneration velocity of the proposals
+            # in [0, 1], with 0 -> no adaptation, i.e. classical
+            # Metropolis-Hastings
+            "decay_constant": 0.51,
+            # number of samples before adaptation decreases significantly.
+            # a higher value reduces the impact of early adaptation
+            "threshold_sample": 1,
+            # regularization factor for ill-conditioned cov matrices of
+            # the adapted proposal density. regularization might happen if the
+            # eigenvalues of the cov matrix strongly differ in order
+            # of magnitude. in this case, the algorithm adds a small
+            # diag matrix to the cov matrix with elements of this factor
+            "reg_factor": 1e-6,
+            # initial covariance matrix. defaults to a unit matrix
+            "cov0": None,
+            # target acceptance rate
+            "target_acceptance_rate": 0.574,
+            # show progress
+            "show_progress": None,
+            # compute cholesky decomposition of the covariance matrix
+            "compute_cholesky": True,
+        }
+
+    def initialize(self, problem: Problem, x0: np.ndarray):
+        """Initialize the sampler."""
+        # Check if gradient is available
+        if not problem.objective.has_grad:
+            raise ValueError("MALA sampler requires gradient information.")
+
+        super().initialize(problem, x0)
+
+    def _propose_parameter(self, x: np.ndarray):
+        """Propose a parameter using preconditioned MALA dynamics."""
+        step_size = self.options["step_size"]
+
+        # Get gradient of log-posterior at current position
+        grad = -self.neglogpost.get_grad(x)
+
+        # Apply preconditioning to gradient
+        precond_grad = self._cov @ grad
+
+        # Generate standard Gaussian noise
+        noise = np.random.randn(len(x))
+
+        # Apply preconditioning to noise (via Cholesky decomposition)
+        precond_noise = self._cov_chol @ noise
+
+        # Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise
+        drift = (step_size / 2.0) * precond_grad
+        diffusion = np.sqrt(step_size) * precond_noise
+
+        x_new: np.ndarray = x + drift + diffusion
+        return x_new
+
+    def _compute_transition_log_prob(
+        self, x_from: np.ndarray, x_to: np.ndarray
+    ):
+        """Compute the log probability of transitioning from x_from to x_to with preconditioning."""
+        step_size = self.options["step_size"]
+
+        # Get gradient at position
+        grad_from = -self.neglogpost.get_grad(x_from)
+
+        # Apply preconditioning to gradient
+        precond_grad_from = self._cov @ grad_from
+
+        # Mean of the preconditioned proposal distribution
+        mean = x_from + (step_size / 2.0) * precond_grad_from
+
+        # For preconditioned MALA, the covariance is step_size * M
+        # We need to compute the log probability under N(mean, step_size * M)
+        diff = x_to - mean
+
+        # Use Cholesky decomposition for efficient computation
+        # We need to solve L @ L^T @ z = diff, i.e., z = M^{-1} @ diff
+        # This is done by first solving L @ y = diff, then L^T @ z = y
+        try:
+            # Forward substitution: L @ y = diff
+            y = np.linalg.solve(self._cov_chol, diff)
+            # Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y
+            log_prob = -0.5 * np.dot(y, y) / step_size
+        except np.linalg.LinAlgError:
+            # If matrix is singular, return -inf
+            return -np.inf
+
+        # Add normalization constant: -0.5 * log|2π * step_size * M|
+        # = -0.5 * (d * log(2π * step_size) + log|M|)
+        # log|M| = 2 * log|L| = 2 * sum(log(diag(L)))
+        d = len(x_from)
+        log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol)))
+        log_prob -= 0.5 * (d * np.log(2 * np.pi * step_size) + log_det_cov)
+
+        return log_prob
diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py
index 7213dfcc3..86675e139 100644
--- a/pypesto/sample/metropolis.py
+++ b/pypesto/sample/metropolis.py
@@ -151,6 +151,12 @@ def _perform_step(
             # log acceptance probability (temper log posterior)
             log_p_acc = min(beta * (lpost_new - lpost), 0)
 
+        # This accounts for an asymmetric proposal distribution
+        log_q_forward = self._compute_transition_log_prob(x, x_new)
+        log_q_backward = self._compute_transition_log_prob(x_new, x)
+        proposal_correction = log_q_backward - log_q_forward
+        log_p_acc = min(log_p_acc + proposal_correction, 0)
+
         # flip a coin
         u = np.random.uniform(0, 1)
 
@@ -178,6 +184,15 @@ def _update_proposal(
     ):
         """Update the proposal density. Default: Do nothing."""
 
+    def _compute_transition_log_prob(
+        self, x_from: np.ndarray, x_to: np.ndarray
+    ):
+        """Compute the transition log probability for symmetric proposal.
+
+        For a symmetric proposal distribution, this is zero.
+        """
+        return 0.0
+
     def get_last_sample(self) -> InternalSample:
         """Get the last sample in the chain.
 
diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index 7d9f8e4dd..fb1cb91ff 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -41,12 +41,13 @@
 @pytest.fixture(
     params=[
         "Metropolis",
-        "AdaptiveMetropolis",
-        "ParallelTempering",
-        "AdaptiveParallelTempering",
-        "Pymc",
-        "Emcee",
-        "Dynesty",
+        # "AdaptiveMetropolis",
+        # "ParallelTempering",
+        # "AdaptiveParallelTempering",
+        # "Pymc",
+        # "Emcee",
+        # "Dynesty",
+        "Mala",
     ]
 )
 def sampler(request):
@@ -62,6 +63,13 @@ def sampler(request):
                 "show_progress": False,
             },
         )
+    elif request.param == "Mala":
+        return sample.Mala(
+            options={
+                "show_progress": False,
+                "step_size": 0.01,
+            },
+        )
     elif request.param == "ParallelTempering":
         return sample.ParallelTemperingSampler(
             internal_sampler=sample.MetropolisSampler(),
@@ -103,6 +111,9 @@ def problem(request):
 
 def test_pipeline(sampler, problem):
     """Check that a typical pipeline runs through."""
+    if isinstance(sampler, sample.Mala):
+        if not problem.objective.has_grad:
+            pytest.skip("MALA requires gradient information.")
     # optimization
     optimizer = optimize.ScipyOptimizer(options={"maxiter": 10})
     result = optimize.minimize(
diff --git a/test/sample/util.py b/test/sample/util.py
index b191962bd..6a93e028d 100644
--- a/test/sample/util.py
+++ b/test/sample/util.py
@@ -35,7 +35,7 @@ def gaussian_llh(x):
 
 def gaussian_nllh_grad(x):
     """Negative log-likelihood gradient for Gaussian."""
-    return np.array([((x - MU) / (SIGMA**2))])
+    return (x - MU) / (SIGMA**2)
 
 
 def gaussian_nllh_hess(x):
@@ -49,7 +49,7 @@ def gaussian_problem():
     def nllh(x):
         return -gaussian_llh(x)
 
-    objective = pypesto.Objective(fun=nllh)
+    objective = pypesto.Objective(fun=nllh, grad=gaussian_nllh_grad)
     problem = pypesto.Problem(
         objective=objective, lb=LB_GAUSSIAN, ub=UB_GAUSSIAN
     )

From 949bfcfa2579b3ca8334366f8058f162baddd31e Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 13:36:05 +0200
Subject: [PATCH 02/18] improve regularize_covariance

---
 pypesto/sample/adaptive_metropolis.py | 27 +++++++++++++++++++--------
 pypesto/sample/mala.py                |  4 +---
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index 9901088bd..92b5c7259 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -77,9 +77,7 @@ def default_options(cls):
             "threshold_sample": 1,
             # regularization factor for ill-conditioned cov matrices of
             # the adapted proposal density. regularization might happen if the
-            # eigenvalues of the cov matrix strongly differ in order
-            # of magnitude. in this case, the algorithm adds a small
-            # diag matrix to the cov matrix with elements of this factor
+            # eigenvalues of the cov matrix strongly differ in order of magnitude.
             "reg_factor": 1e-6,
             # initial covariance matrix. defaults to a unit matrix
             "cov0": None,
@@ -212,8 +210,21 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
     cov:
         Regularized estimate of the covariance matrix of the sample.
     """
-    eig = np.linalg.eigvals(cov)
-    eig_min = min(eig)
-    if eig_min <= 0:
-        cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
-    return cov
+    # eigh assumes exact symmetry, so we symmetrize cov first
+    cov = 0.5 * (cov + cov.T)
+    if not np.all(np.isfinite(cov)):
+        s = np.nanmean(np.diag(cov))
+        s = 1.0 if not np.isfinite(s) or s <= 0 else s
+        return s * np.eye(cov.shape[0])
+
+    # eigh guarantees real eigenvalues and orthonormal eigenvectors (Covariance is symmetric positive semidefinite)
+    w, Q = np.linalg.eigh(cov)
+    w = np.maximum(w, reg_factor)
+    # project to the set of positive definite matrices
+    cov_reg = Q @ (w[:, None] * Q.T)
+
+    # eig = np.linalg.eigvals(cov)
+    # eig_min = min(eig)
+    # if eig_min <= 0:
+    #     cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
+    return cov_reg
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index cb4583a9f..cc92609f3 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -53,9 +53,7 @@ def default_options(cls):
             "threshold_sample": 1,
             # regularization factor for ill-conditioned cov matrices of
             # the adapted proposal density. regularization might happen if the
-            # eigenvalues of the cov matrix strongly differ in order
-            # of magnitude. in this case, the algorithm adds a small
-            # diag matrix to the cov matrix with elements of this factor
+            # eigenvalues of the cov matrix strongly differ in order of magnitude.
             "reg_factor": 1e-6,
             # initial covariance matrix. defaults to a unit matrix
             "cov0": None,

From 8b683378c37ff748c9adf7777475a6e3f70ad08b Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 13:59:27 +0200
Subject: [PATCH 03/18] improve regularize_covariance

---
 pypesto/sample/adaptive_metropolis.py | 69 +++++++++++++++++----------
 pypesto/sample/mala.py                |  6 +--
 test/sample/test_sample.py            | 12 ++---
 3 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index 92b5c7259..c5c8d57f4 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -1,3 +1,4 @@
+import logging
 import numbers
 
 import numpy as np
@@ -78,15 +79,15 @@ def default_options(cls):
             # regularization factor for ill-conditioned cov matrices of
             # the adapted proposal density. regularization might happen if the
             # eigenvalues of the cov matrix strongly differ in order of magnitude.
-            "reg_factor": 1e-6,
+            "reg_factor": 1e-8,
+            # maximum number of attempts to regularize the covariance matrix
+            "max_tries": 10,
             # initial covariance matrix. defaults to a unit matrix
             "cov0": None,
             # target acceptance rate
             "target_acceptance_rate": 0.234,
             # show progress
             "show_progress": None,
-            # compute cholesky decomposition of the covariance matrix
-            "compute_cholesky": False,
         }
 
     def initialize(self, problem: Problem, x0: np.ndarray):
@@ -99,12 +100,12 @@ def initialize(self, problem: Problem, x0: np.ndarray):
                 cov0 = float(cov0) * np.eye(len(x0))
         else:
             cov0 = np.eye(len(x0))
-        self._cov = regularize_covariance(cov0, self.options["reg_factor"])
+        self._cov_chol, self._cov = regularize_covariance(
+            cov0, self.options["reg_factor"], self.options["max_tries"]
+        )
         self._mean_hist = self.trace_x[-1]
         self._cov_hist = self._cov
         self._cov_scale = 1.0
-        if self.options["compute_cholesky"]:
-            self._cov_chol = np.linalg.cholesky(self._cov)
 
     def _propose_parameter(self, x: np.ndarray):
         x_new = np.random.multivariate_normal(x, self._cov)
@@ -117,6 +118,7 @@ def _update_proposal(
         decay_constant = self.options["decay_constant"]
         threshold_sample = self.options["threshold_sample"]
         reg_factor = self.options["reg_factor"]
+        max_tries = self.options["max_tries"]
         target_acceptance_rate = self.options["target_acceptance_rate"]
 
         # compute historical mean and covariance
@@ -139,9 +141,9 @@ def _update_proposal(
         self._cov = self._cov_scale * self._cov_hist
 
         # regularize proposal covariance
-        self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor)
-        if self.options["compute_cholesky"]:
-            self._cov_chol = np.linalg.cholesky(self._cov)
+        self._cov_chol, self._cov = regularize_covariance(
+            cov=self._cov, reg_factor=reg_factor, max_tries=max_tries
+        )
 
 
 def update_history_statistics(
@@ -191,7 +193,11 @@ def update_history_statistics(
     return mean, cov
 
 
-def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
+def regularize_covariance(
+    cov: np.ndarray,
+    reg_factor: float,
+    max_tries: int,
+) -> tuple[np.ndarray, np.ndarray]:
     """
     Regularize the estimated covariance matrix of the sample.
 
@@ -204,27 +210,42 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
         Estimate of the covariance matrix of the sample.
     reg_factor:
         Regularization factor. Larger values result in stronger regularization.
+    max_tries:
+        Maximum number of attempts to regularize the covariance matrix.
 
     Returns
     -------
+    L:
+        Cholesky decomposition of the regularized covariance matrix.
     cov:
         Regularized estimate of the covariance matrix of the sample.
     """
-    # eigh assumes exact symmetry, so we symmetrize cov first
+    # we symmetrize cov first
     cov = 0.5 * (cov + cov.T)
+
+    d = cov.shape[0]
+    eye = np.eye(d)
     if not np.all(np.isfinite(cov)):
         s = np.nanmean(np.diag(cov))
         s = 1.0 if not np.isfinite(s) or s <= 0 else s
-        return s * np.eye(cov.shape[0])
-
-    # eigh guarantees real eigenvalues and orthonormal eigenvectors (Covariance is symmetric positive semidefinite)
-    w, Q = np.linalg.eigh(cov)
-    w = np.maximum(w, reg_factor)
-    # project to the set of positive definite matrices
-    cov_reg = Q @ (w[:, None] * Q.T)
-
-    # eig = np.linalg.eigvals(cov)
-    # eig_min = min(eig)
-    # if eig_min <= 0:
-    #     cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
-    return cov_reg
+        return np.sqrt(s) * eye, s * eye
+
+    # scale for the initial jitter
+    s = np.mean(np.diag(cov))
+    s = 1.0 if not np.isfinite(s) or s <= 0 else s
+
+    jitter = reg_factor * s
+    for _ in range(max_tries):
+        try:
+            new_cov = cov + jitter * eye
+            L = np.linalg.cholesky(new_cov)
+            return L, new_cov
+        except np.linalg.LinAlgError:
+            jitter *= 2.0
+    logging.warning(
+        "Could not regularize covariance matrix. Falling back to "
+        "scaled identity."
+    )
+
+    # final safe fallback
+    return np.sqrt(s) * eye, s * eye
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index cc92609f3..a889fd37d 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -54,15 +54,15 @@ def default_options(cls):
             # regularization factor for ill-conditioned cov matrices of
             # the adapted proposal density. regularization might happen if the
             # eigenvalues of the cov matrix strongly differ in order of magnitude.
-            "reg_factor": 1e-6,
+            "reg_factor": 1e-8,
+            # maximum number of attempts to regularize the covariance matrix
+            "max_tries": 10,
             # initial covariance matrix. defaults to a unit matrix
             "cov0": None,
             # target acceptance rate
             "target_acceptance_rate": 0.574,
             # show progress
             "show_progress": None,
-            # compute cholesky decomposition of the covariance matrix
-            "compute_cholesky": True,
         }
 
     def initialize(self, problem: Problem, x0: np.ndarray):
diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index fb1cb91ff..2e88ebe26 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -41,12 +41,12 @@
 @pytest.fixture(
     params=[
         "Metropolis",
-        # "AdaptiveMetropolis",
-        # "ParallelTempering",
-        # "AdaptiveParallelTempering",
-        # "Pymc",
-        # "Emcee",
-        # "Dynesty",
+        "AdaptiveMetropolis",
+        "ParallelTempering",
+        "AdaptiveParallelTempering",
+        "Pymc",
+        "Emcee",
+        "Dynesty",
         "Mala",
     ]
 )

From 802a5ed507b19e1b4fe9b2caa13fcd23ced7bb6d Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 14:10:16 +0200
Subject: [PATCH 04/18] improve regularize_covariance

---
 pypesto/sample/adaptive_metropolis.py | 3 +--
 pypesto/sample/mala.py                | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index c5c8d57f4..03104184d 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -77,8 +77,7 @@ def default_options(cls):
             # a higher value reduces the impact of early adaptation
             "threshold_sample": 1,
             # regularization factor for ill-conditioned cov matrices of
-            # the adapted proposal density. regularization might happen if the
-            # eigenvalues of the cov matrix strongly differ in order of magnitude.
+            # the adapted proposal density.
             "reg_factor": 1e-8,
             # maximum number of attempts to regularize the covariance matrix
             "max_tries": 10,
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index a889fd37d..171c462d9 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -52,8 +52,7 @@ def default_options(cls):
             # a higher value reduces the impact of early adaptation
             "threshold_sample": 1,
             # regularization factor for ill-conditioned cov matrices of
-            # the adapted proposal density. regularization might happen if the
-            # eigenvalues of the cov matrix strongly differ in order of magnitude.
+            # the adapted proposal density.
             "reg_factor": 1e-8,
             # maximum number of attempts to regularize the covariance matrix
             "max_tries": 10,

From 260f4c3eeb0876c4f340ad131110c6ba655149a3 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 14:12:55 +0200
Subject: [PATCH 05/18] improve regularize_covariance

---
 pypesto/sample/adaptive_metropolis.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index 03104184d..753952e37 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -229,6 +229,13 @@ def regularize_covariance(
         s = 1.0 if not np.isfinite(s) or s <= 0 else s
         return np.sqrt(s) * eye, s * eye
 
+    # Try Cholesky on original matrix first
+    try:
+        L = np.linalg.cholesky(cov)
+        return L, cov
+    except np.linalg.LinAlgError:
+        pass  # Need regularization
+
     # scale for the initial jitter
     s = np.mean(np.diag(cov))
     s = 1.0 if not np.isfinite(s) or s <= 0 else s

From 81b44dcfa6099bf27e86712c2eb4246188566d80 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 14:39:50 +0200
Subject: [PATCH 06/18] less gradient calls

---
 pypesto/sample/adaptive_metropolis.py |  8 +++++---
 pypesto/sample/mala.py                | 10 ++++------
 pypesto/sample/metropolis.py          | 19 ++++++++++++++-----
 test/sample/test_sample.py            |  4 +++-
 4 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index 753952e37..31bf5e5fa 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -100,15 +100,17 @@ def initialize(self, problem: Problem, x0: np.ndarray):
         else:
             cov0 = np.eye(len(x0))
         self._cov_chol, self._cov = regularize_covariance(
-            cov0, self.options["reg_factor"], self.options["max_tries"]
+            cov0,
+            reg_factor=self.options["reg_factor"],
+            max_tries=self.options["max_tries"],
         )
         self._mean_hist = self.trace_x[-1]
         self._cov_hist = self._cov
         self._cov_scale = 1.0
 
     def _propose_parameter(self, x: np.ndarray):
-        x_new = np.random.multivariate_normal(x, self._cov)
-        return x_new
+        x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
+        return x_new, None  # no gradient needed
 
     def _update_proposal(
         self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index 171c462d9..ee4cb29a8 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -71,6 +71,7 @@ def initialize(self, problem: Problem, x0: np.ndarray):
             raise ValueError("MALA sampler requires gradient information.")
 
         super().initialize(problem, x0)
+        self._current_x_grad = -self.neglogpost.get_grad(x0)
 
     def _propose_parameter(self, x: np.ndarray):
         """Propose a parameter using preconditioned MALA dynamics."""
@@ -93,19 +94,16 @@ def _propose_parameter(self, x: np.ndarray):
         diffusion = np.sqrt(step_size) * precond_noise
 
         x_new: np.ndarray = x + drift + diffusion
-        return x_new
+        return x_new, grad
 
     def _compute_transition_log_prob(
-        self, x_from: np.ndarray, x_to: np.ndarray
+        self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray
     ):
         """Compute the log probability of transitioning from x_from to x_to with preconditioning."""
         step_size = self.options["step_size"]
 
-        # Get gradient at position
-        grad_from = -self.neglogpost.get_grad(x_from)
-
         # Apply preconditioning to gradient
-        precond_grad_from = self._cov @ grad_from
+        precond_grad_from = self._cov @ x_from_grad
 
         # Mean of the preconditioned proposal distribution
         mean = x_from + (step_size / 2.0) * precond_grad_from
diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py
index 86675e139..605b1cb17 100644
--- a/pypesto/sample/metropolis.py
+++ b/pypesto/sample/metropolis.py
@@ -46,6 +46,7 @@ def __init__(self, options: dict = None):
         self.trace_neglogpost: Union[Sequence[float], None] = None
         self.trace_neglogprior: Union[Sequence[float], None] = None
         self.temper_lpost: bool = False
+        self._current_x_grad: Union[np.ndarray, None] = None
 
     @classmethod
     def default_options(cls):
@@ -111,7 +112,7 @@ def _perform_step(
         Propose new parameter, evaluate and check whether to accept.
         """
         # propose step
-        x_new: np.ndarray = self._propose_parameter(x)
+        x_new, x_new_grad = self._propose_parameter(x)
 
         # check if step lies within bounds
         if any(x_new < self.problem.lb) or any(x_new > self.problem.ub):
@@ -152,8 +153,12 @@ def _perform_step(
             log_p_acc = min(beta * (lpost_new - lpost), 0)
 
         # This accounts for an asymmetric proposal distribution
-        log_q_forward = self._compute_transition_log_prob(x, x_new)
-        log_q_backward = self._compute_transition_log_prob(x_new, x)
+        log_q_forward = self._compute_transition_log_prob(
+            x, x_new, self._current_x_grad
+        )
+        log_q_backward = self._compute_transition_log_prob(
+            x_new, x, x_new_grad
+        )
         proposal_correction = log_q_backward - log_q_forward
         log_p_acc = min(log_p_acc + proposal_correction, 0)
 
@@ -166,6 +171,7 @@ def _perform_step(
             x = x_new
             lpost = lpost_new
             lprior = lprior_new
+            self._current_x_grad = x_new_grad
 
         # update proposal
         self._update_proposal(
@@ -177,7 +183,7 @@ def _perform_step(
     def _propose_parameter(self, x: np.ndarray):
         """Propose a step."""
         x_new: np.ndarray = x + self.options["std"] * np.random.randn(len(x))
-        return x_new
+        return x_new, None  # no gradient needed
 
     def _update_proposal(
         self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int
@@ -185,7 +191,10 @@ def _update_proposal(
         """Update the proposal density. Default: Do nothing."""
 
     def _compute_transition_log_prob(
-        self, x_from: np.ndarray, x_to: np.ndarray
+        self,
+        x_from: np.ndarray,
+        x_to: np.ndarray,
+        x_from_grad: Union[np.ndarray, None],
     ):
         """Compute the transition log probability for symmetric proposal.
 
diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index 2e88ebe26..9fe46a0b5 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -213,7 +213,9 @@ def test_regularize_covariance():
     """
     matrix = np.array([[-1.0, -4.0], [-4.0, 1.0]])
     assert np.any(np.linalg.eigvals(matrix) < 0)
-    reg = sample.adaptive_metropolis.regularize_covariance(matrix, 1e-6)
+    reg = sample.adaptive_metropolis.regularize_covariance(
+        matrix, 1e-6, max_tries=1
+    )
     assert np.all(np.linalg.eigvals(reg) >= 0)
 
 

From 7934aabdf3c767c60d306e771efc4b537c7f3033 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 15:02:15 +0200
Subject: [PATCH 07/18] less gradient calls

---
 pypesto/sample/__init__.py | 2 +-
 pypesto/sample/mala.py     | 2 +-
 test/sample/test_sample.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pypesto/sample/__init__.py b/pypesto/sample/__init__.py
index c34539196..611c346cc 100644
--- a/pypesto/sample/__init__.py
+++ b/pypesto/sample/__init__.py
@@ -17,7 +17,7 @@
     laplace_approximation_log_evidence,
     parallel_tempering_log_evidence,
 )
-from .mala import Mala
+from .mala import MalaSampler
 from .metropolis import MetropolisSampler
 from .parallel_tempering import ParallelTemperingSampler
 from .sample import sample
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index ee4cb29a8..318792a24 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -4,7 +4,7 @@
 from .adaptive_metropolis import AdaptiveMetropolisSampler
 
 
-class Mala(AdaptiveMetropolisSampler):
+class MalaSampler(AdaptiveMetropolisSampler):
     """Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.
 
     MALA is a gradient-based MCMC method that uses the gradient of the
diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index 9fe46a0b5..8720c0700 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -64,7 +64,7 @@ def sampler(request):
             },
         )
     elif request.param == "Mala":
-        return sample.Mala(
+        return sample.MalaSampler(
             options={
                 "show_progress": False,
                 "step_size": 0.01,

From 8812281b01176891a37b1970472e24acdbe20a59 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 26 Sep 2025 15:30:43 +0200
Subject: [PATCH 08/18] add to sampler study

---
 doc/example/sampler_study.ipynb | 37 +++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/doc/example/sampler_study.ipynb b/doc/example/sampler_study.ipynb
index f5e0ccf95..e3946ff01 100644
--- a/doc/example/sampler_study.ipynb
+++ b/doc/example/sampler_study.ipynb
@@ -488,6 +488,43 @@
     "result.sample_result.betas"
    ]
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### Metropolis-Adjusted Langevin Algorithm\n"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "So far, we have used only the function values to guide the sampling. If gradients are available, they can be used to guide the sampling more effectively. The `pypesto.sample.MetropolisAdjustedLangevinAlgorithm` implements this idea. It can be used within the parallel tempering framework as well."
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "sampler = sample.AdaptiveParallelTemperingSampler(\n",
+    "    internal_sampler=sample.MalaSampler(), n_chains=3\n",
+    ")\n",
+    "result = sample.sample(\n",
+    "    problem, 1e3, sampler, x0=np.array([0.5]), filename=None\n",
+    ")"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "sample.geweke_test(result)\n",
+    "for i_chain in range(len(result.sample_result.betas)):\n",
+    "    visualize.sampling_1d_marginals(\n",
+    "        result, i_chain=i_chain, suptitle=f\"Chain: {i_chain}\"\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

From 5ee528a8db98d1ee9e662293ccdebd17509d0560 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Mon, 29 Sep 2025 10:06:39 +0200
Subject: [PATCH 09/18] improved step size

---
 pypesto/sample/mala.py | 58 ++++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index 318792a24..09be2cbd3 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -25,6 +25,12 @@ class MalaSampler(AdaptiveMetropolisSampler):
     We aim to converge to a fixed target acceptance rate of 0.574, as suggested
     by theoretical results for MALA.
 
+    By default, the step size is set to sqrt((1/dim)^(1/3)), where dim is the
+    dimension of the target distribution, following Boisvert-Beaudry and Bedard (2022).
+    This scaling helps maintain efficient exploration as the dimensionality increases,
+    and leads to an asymptotically performant sampler, in the sense that exploring the
+    parameter space requires O(dim^(1/3)) steps.
+
     For reference, see:
     * Roberts et al. 1996.
       Exponential convergence of Langevin distributions and their
@@ -33,17 +39,29 @@ class MalaSampler(AdaptiveMetropolisSampler):
     * Girolami & Calderhead 2011.
       Riemann manifold Langevin and Hamiltonian Monte Carlo methods
       (https://doi.org/10.1111/j.1467-9868.2010.00765.x)
+    * Boisvert-Beaudry and Bedard 2022.
+      MALA with annealed proposals: a generalization of locally and globally balanced proposal distributions
     """
 
-    def __init__(self, options: dict = None):
+    def __init__(self, step_size: float = None, options: dict = None):
+        """Initialize the sampler.
+
+        Parameters
+        ----------
+        step_size:
+            Step size for the Langevin dynamics. If None, it will be set to
+            sqrt((1/n_parameter)^(1/3)) during initialization.
+        options:
+            Options for the sampler.
+        """
         super().__init__(options)
+        self.step_size = step_size
+        self._dimension_of_target_weight = 1.0
 
     @classmethod
     def default_options(cls):
         """Return the default options for the sampler."""
         return {
-            # step size for the Langevin dynamics
-            "step_size": 0.01,
             # controls adaptation degeneration velocity of the proposals
             # in [0, 1], with 0 -> no adaptation, i.e. classical
             # Metropolis-Hastings
@@ -70,13 +88,20 @@ def initialize(self, problem: Problem, x0: np.ndarray):
         if not problem.objective.has_grad:
             raise ValueError("MALA sampler requires gradient information.")
 
+        # Boisvert-Beaudry and Bedard (2022)
+        # Set step size to problem dimension if not specified
+        self.step_size = (
+            np.sqrt((1 / problem.dim) ** (1 / 3))
+            if self.step_size is None
+            else self.step_size
+        )
+        self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3)
+
         super().initialize(problem, x0)
         self._current_x_grad = -self.neglogpost.get_grad(x0)
 
     def _propose_parameter(self, x: np.ndarray):
         """Propose a parameter using preconditioned MALA dynamics."""
-        step_size = self.options["step_size"]
-
         # Get gradient of log-posterior at current position
         grad = -self.neglogpost.get_grad(x)
 
@@ -90,23 +115,28 @@ def _propose_parameter(self, x: np.ndarray):
         precond_noise = self._cov_chol @ noise
 
         # Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise
-        drift = (step_size / 2.0) * precond_grad
-        diffusion = np.sqrt(step_size) * precond_noise
+        drift = (self.step_size / 2.0) * precond_grad
+        diffusion = np.sqrt(self.step_size) * precond_noise
 
-        x_new: np.ndarray = x + drift + diffusion
+        x_new: np.ndarray = (
+            x + self._dimension_of_target_weight * drift + diffusion
+        )
         return x_new, grad
 
     def _compute_transition_log_prob(
         self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray
     ):
         """Compute the log probability of transitioning from x_from to x_to with preconditioning."""
-        step_size = self.options["step_size"]
-
         # Apply preconditioning to gradient
         precond_grad_from = self._cov @ x_from_grad
 
         # Mean of the preconditioned proposal distribution
-        mean = x_from + (step_size / 2.0) * precond_grad_from
+        mean = (
+            x_from
+            + self._dimension_of_target_weight
+            * (self.step_size / 2.0)
+            * precond_grad_from
+        )
 
         # For preconditioned MALA, the covariance is step_size * M
         # We need to compute the log probability under N(mean, step_size * M)
@@ -119,7 +149,7 @@ def _compute_transition_log_prob(
             # Forward substitution: L @ y = diff
             y = np.linalg.solve(self._cov_chol, diff)
             # Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y
-            log_prob = -0.5 * np.dot(y, y) / step_size
+            log_prob = -0.5 * np.dot(y, y) / self.step_size
         except np.linalg.LinAlgError:
             # If matrix is singular, return -inf
             return -np.inf
@@ -129,6 +159,8 @@ def _compute_transition_log_prob(
         # log|M| = 2 * log|L| = 2 * sum(log(diag(L)))
         d = len(x_from)
         log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol)))
-        log_prob -= 0.5 * (d * np.log(2 * np.pi * step_size) + log_det_cov)
+        log_prob -= 0.5 * (
+            d * np.log(2 * np.pi * self.step_size) + log_det_cov
+        )
 
         return log_prob

From 0b0c02508d2f7ab6348a88de0c2ed39a84b0dfe3 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Mon, 29 Sep 2025 10:09:21 +0200
Subject: [PATCH 10/18] improved step size

---
 pypesto/sample/mala.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index 09be2cbd3..77e265c5e 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -1,8 +1,12 @@
+import logging
+
 import numpy as np
 
 from ..problem import Problem
 from .adaptive_metropolis import AdaptiveMetropolisSampler
 
+logger = logging.getLogger(__name__)
+
 
 class MalaSampler(AdaptiveMetropolisSampler):
     """Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.
@@ -90,11 +94,9 @@ def initialize(self, problem: Problem, x0: np.ndarray):
 
         # Boisvert-Beaudry and Bedard (2022)
         # Set step size to problem dimension if not specified
-        self.step_size = (
-            np.sqrt((1 / problem.dim) ** (1 / 3))
-            if self.step_size is None
-            else self.step_size
-        )
+        if self.step_size is None:
+            self.step_size = np.sqrt((1 / problem.dim) ** (1 / 3))
+            logger.info(f"Step size set to {self.step_size}")
         self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3)
 
         super().initialize(problem, x0)

From 7a6778bd077068e7bdbf2029eac91a0293eacc11 Mon Sep 17 00:00:00 2001
From: Jonas Arruda <69197639+arrjon@users.noreply.github.com>
Date: Mon, 6 Oct 2025 18:52:45 +0100
Subject: [PATCH 11/18] Update test/sample/test_sample.py

Co-authored-by: Daniel Weindl <dweindl@users.noreply.github.com>
---
 test/sample/test_sample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index 8720c0700..e695027bc 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -111,7 +111,7 @@ def problem(request):
 
 def test_pipeline(sampler, problem):
     """Check that a typical pipeline runs through."""
-    if isinstance(sampler, sample.Mala):
+    if isinstance(sampler, sample.MalaSampler):
         if not problem.objective.has_grad:
             pytest.skip("MALA requires gradient information.")
     # optimization

From 2c954f07cc3f0719293221bacd762004f335bd6c Mon Sep 17 00:00:00 2001
From: Jonas Arruda <69197639+arrjon@users.noreply.github.com>
Date: Mon, 6 Oct 2025 18:52:53 +0100
Subject: [PATCH 12/18] Update pypesto/sample/mala.py

Co-authored-by: Daniel Weindl <dweindl@users.noreply.github.com>
---
 pypesto/sample/mala.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index 77e265c5e..31b66a5b6 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -36,6 +36,7 @@ class MalaSampler(AdaptiveMetropolisSampler):
     parameter space requires O(dim^(1/3)) steps.
 
     For reference, see:
+
     * Roberts et al. 1996.
       Exponential convergence of Langevin distributions and their
       discrete approximations

From f081b9fba3e2435e680d870b359221c25376aa40 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Mon, 6 Oct 2025 19:02:10 +0100
Subject: [PATCH 13/18] fix mala for parallel tempering

---
 pypesto/sample/adaptive_metropolis.py |  2 +-
 pypesto/sample/mala.py                |  4 ++--
 pypesto/sample/metropolis.py          |  4 ++--
 test/sample/test_sample.py            | 11 +++++------
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index 31bf5e5fa..64b63a3f1 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -108,7 +108,7 @@ def initialize(self, problem: Problem, x0: np.ndarray):
         self._cov_hist = self._cov
         self._cov_scale = 1.0
 
-    def _propose_parameter(self, x: np.ndarray):
+    def _propose_parameter(self, x: np.ndarray, beta: float):
         x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
         return x_new, None  # no gradient needed
 
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index 31b66a5b6..56a887723 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -103,10 +103,10 @@ def initialize(self, problem: Problem, x0: np.ndarray):
         super().initialize(problem, x0)
         self._current_x_grad = -self.neglogpost.get_grad(x0)
 
-    def _propose_parameter(self, x: np.ndarray):
+    def _propose_parameter(self, x: np.ndarray, beta: float):
         """Propose a parameter using preconditioned MALA dynamics."""
         # Get gradient of log-posterior at current position
-        grad = -self.neglogpost.get_grad(x)
+        grad = -beta * self.neglogpost.get_grad(x)
 
         # Apply preconditioning to gradient
         precond_grad = self._cov @ grad
diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py
index 605b1cb17..cb4c34890 100644
--- a/pypesto/sample/metropolis.py
+++ b/pypesto/sample/metropolis.py
@@ -112,7 +112,7 @@ def _perform_step(
         Propose new parameter, evaluate and check whether to accept.
         """
         # propose step
-        x_new, x_new_grad = self._propose_parameter(x)
+        x_new, x_new_grad = self._propose_parameter(x, beta=beta)
 
         # check if step lies within bounds
         if any(x_new < self.problem.lb) or any(x_new > self.problem.ub):
@@ -180,7 +180,7 @@ def _perform_step(
 
         return x, lpost, lprior
 
-    def _propose_parameter(self, x: np.ndarray):
+    def _propose_parameter(self, x: np.ndarray, beta: float):
         """Propose a step."""
         x_new: np.ndarray = x + self.options["std"] * np.random.randn(len(x))
         return x_new, None  # no gradient needed
diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index e695027bc..d880268cd 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -42,11 +42,11 @@
     params=[
         "Metropolis",
         "AdaptiveMetropolis",
-        "ParallelTempering",
-        "AdaptiveParallelTempering",
-        "Pymc",
-        "Emcee",
-        "Dynesty",
+        # "ParallelTempering",
+        # "AdaptiveParallelTempering",
+        # "Pymc",
+        # "Emcee",
+        # "Dynesty",
         "Mala",
     ]
 )
@@ -67,7 +67,6 @@ def sampler(request):
         return sample.MalaSampler(
             options={
                 "show_progress": False,
-                "step_size": 0.01,
             },
         )
     elif request.param == "ParallelTempering":

From 565a312e0c596214158a8e5ebf14c9302daef792 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Wed, 8 Oct 2025 10:53:51 +0100
Subject: [PATCH 14/18] beta in mala

---
 pypesto/sample/mala.py       | 5 +++--
 pypesto/sample/metropolis.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
index 56a887723..a74ebbd43 100644
--- a/pypesto/sample/mala.py
+++ b/pypesto/sample/mala.py
@@ -106,10 +106,11 @@ def initialize(self, problem: Problem, x0: np.ndarray):
     def _propose_parameter(self, x: np.ndarray, beta: float):
         """Propose a parameter using preconditioned MALA dynamics."""
         # Get gradient of log-posterior at current position
-        grad = -beta * self.neglogpost.get_grad(x)
+        grad = -self.neglogpost.get_grad(x)
+        # todo: in parallel tempering, this should be the tempered gradient, but only the likelihood is tempered
 
         # Apply preconditioning to gradient
-        precond_grad = self._cov @ grad
+        precond_grad = self._cov @ (beta * grad)
 
         # Generate standard Gaussian noise
         noise = np.random.randn(len(x))
diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py
index cb4c34890..60e30514c 100644
--- a/pypesto/sample/metropolis.py
+++ b/pypesto/sample/metropolis.py
@@ -154,10 +154,10 @@ def _perform_step(
 
         # This accounts for an asymmetric proposal distribution
         log_q_forward = self._compute_transition_log_prob(
-            x, x_new, self._current_x_grad
+            x, x_new, beta * self._current_x_grad
         )
         log_q_backward = self._compute_transition_log_prob(
-            x_new, x, x_new_grad
+            x_new, x, beta * x_new_grad
         )
         proposal_correction = log_q_backward - log_q_forward
         log_p_acc = min(log_p_acc + proposal_correction, 0)

From bf4a4742a016d4bbfab67d42952aeef724cacdf1 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Fri, 10 Oct 2025 11:35:39 +0200
Subject: [PATCH 15/18] add test

---
 test/sample/test_sample.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index d880268cd..0a4c7ede5 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -42,11 +42,11 @@
     params=[
         "Metropolis",
         "AdaptiveMetropolis",
-        # "ParallelTempering",
-        # "AdaptiveParallelTempering",
-        # "Pymc",
-        # "Emcee",
-        # "Dynesty",
+        "ParallelTempering",
+        "AdaptiveParallelTempering",
+        "Pymc",
+        "Emcee",
+        "Dynesty",
         "Mala",
     ]
 )

From c57946bc900eb23919bc5897ec6e139d95f83341 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Wed, 15 Oct 2025 09:33:32 +0200
Subject: [PATCH 16/18] fix no gradient

---
 pypesto/sample/adaptive_metropolis.py | 2 +-
 pypesto/sample/metropolis.py          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index 64b63a3f1..00e73c473 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -110,7 +110,7 @@ def initialize(self, problem: Problem, x0: np.ndarray):
 
     def _propose_parameter(self, x: np.ndarray, beta: float):
         x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
-        return x_new, None  # no gradient needed
+        return x_new, np.nan  # no gradient needed
 
     def _update_proposal(
         self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int
diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py
index 60e30514c..0fc13ffb5 100644
--- a/pypesto/sample/metropolis.py
+++ b/pypesto/sample/metropolis.py
@@ -46,7 +46,7 @@ def __init__(self, options: dict = None):
         self.trace_neglogpost: Union[Sequence[float], None] = None
         self.trace_neglogprior: Union[Sequence[float], None] = None
         self.temper_lpost: bool = False
-        self._current_x_grad: Union[np.ndarray, None] = None
+        self._current_x_grad: Union[np.ndarray] = np.nan
 
     @classmethod
     def default_options(cls):
@@ -183,7 +183,7 @@ def _perform_step(
     def _propose_parameter(self, x: np.ndarray, beta: float):
         """Propose a step."""
         x_new: np.ndarray = x + self.options["std"] * np.random.randn(len(x))
-        return x_new, None  # no gradient needed
+        return x_new, np.nan  # no gradient needed
 
     def _update_proposal(
         self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int

From b775a8dc5483303d17144ae536de0c67beac9da5 Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Tue, 21 Oct 2025 20:49:13 +0200
Subject: [PATCH 17/18] fix Emcee test

---
 test/sample/test_sample.py | 14 ++++++++++++--
 test/sample/util.py        | 13 +++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index 0a4c7ede5..6d78e7651 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -32,6 +32,7 @@
     gaussian_nllh_grad,
     gaussian_nllh_hess,
     gaussian_problem,
+    gaussian_problem_with_grad,
     negative_log_posterior,
     negative_log_prior,
     rosenbrock_problem,
@@ -98,11 +99,15 @@ def sampler(request):
         )
 
 
-@pytest.fixture(params=["gaussian", "gaussian_mixture", "rosenbrock"])
+@pytest.fixture(
+    params=["gaussian", "gaussian_with_grad", "gaussian_mixture", "rosenbrock"]
+)
 def problem(request):
     if request.param == "gaussian":
         return gaussian_problem()
-    if request.param == "gaussian_mixture":
+    elif request.param == "gaussian_with_grad":
+        return gaussian_problem_with_grad()
+    elif request.param == "gaussian_mixture":
         return gaussian_mixture_problem()
     elif request.param == "rosenbrock":
         return rosenbrock_problem()
@@ -113,6 +118,11 @@ def test_pipeline(sampler, problem):
     if isinstance(sampler, sample.MalaSampler):
         if not problem.objective.has_grad:
             pytest.skip("MALA requires gradient information.")
+    if isinstance(sampler, sample.EmceeSampler):
+        if problem.objective.has_grad:
+            pytest.skip(
+                "EMCEE fails to converge when started in optimal point with gradient."
+            )
     # optimization
     optimizer = optimize.ScipyOptimizer(options={"maxiter": 10})
     result = optimize.minimize(
diff --git a/test/sample/util.py b/test/sample/util.py
index 6a93e028d..268f472bc 100644
--- a/test/sample/util.py
+++ b/test/sample/util.py
@@ -46,6 +46,19 @@ def gaussian_nllh_hess(x):
 def gaussian_problem():
     """Defines a simple Gaussian problem."""
 
+    def nllh(x):
+        return -gaussian_llh(x)
+
+    objective = pypesto.Objective(fun=nllh)
+    problem = pypesto.Problem(
+        objective=objective, lb=LB_GAUSSIAN, ub=UB_GAUSSIAN
+    )
+    return problem
+
+
+def gaussian_problem_with_grad():
+    """Defines a simple Gaussian problem."""
+
     def nllh(x):
         return -gaussian_llh(x)
 

From 2af917ce670601f287f319f12c37721c1c63f8cd Mon Sep 17 00:00:00 2001
From: arrjon <jonas.arruda@uni-bonn.de>
Date: Tue, 21 Oct 2025 21:11:51 +0200
Subject: [PATCH 18/18] add grad to sampler study

---
 doc/example/sampler_study.ipynb | 35 +++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/doc/example/sampler_study.ipynb b/doc/example/sampler_study.ipynb
index e3946ff01..452989799 100644
--- a/doc/example/sampler_study.ipynb
+++ b/doc/example/sampler_study.ipynb
@@ -239,7 +239,7 @@
    "outputs": [],
    "source": [
     "import seaborn as sns\n",
-    "from scipy.stats import multivariate_normal\n",
+    "from scipy.stats import multivariate_normal, norm\n",
     "\n",
     "import pypesto\n",
     "import pypesto.sample as sample\n",
@@ -256,7 +256,26 @@
     "    return -np.log(density(x))\n",
     "\n",
     "\n",
-    "objective = pypesto.Objective(fun=nllh)\n",
+    "def nllh_grad(x):\n",
+    "    w = np.array([0.3, 0.7])\n",
+    "    mus = np.array([-1.5, 2.5])\n",
+    "    sigmas2 = np.array([0.1, 0.2])\n",
+    "    sigmas = np.sqrt(sigmas2)\n",
+    "\n",
+    "    pdfs = np.array([w[i] * norm.pdf(x, mus[i], sigmas[i]) for i in range(2)])\n",
+    "    dpdx = np.sum(\n",
+    "        [\n",
+    "            w[i] * norm.pdf(x, mus[i], sigmas[i]) * (mus[i] - x) / sigmas2[i]\n",
+    "            for i in range(2)\n",
+    "        ],\n",
+    "        axis=0,\n",
+    "    )\n",
+    "\n",
+    "    grad = -dpdx / np.sum(pdfs, axis=0)\n",
+    "    return grad\n",
+    "\n",
+    "\n",
+    "objective = pypesto.Objective(fun=nllh, grad=nllh_grad)\n",
     "problem = pypesto.Problem(objective=objective, lb=-4, ub=5, x_names=[\"x\"])"
    ]
   },
@@ -489,20 +508,20 @@
    ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "metadata": {},
    "source": "### Metropolis-Adjusted Langevin Algorithm\n"
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "metadata": {},
    "source": "So far, we have used only the function values to guide the sampling. If gradients are available, they can be used to guide the sampling more effectively. The `pypesto.sample.MetropolisAdjustedLangevinAlgorithm` implements this idea. It can be used within the parallel tempering framework as well."
   },
   {
-   "metadata": {},
    "cell_type": "code",
-   "outputs": [],
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "sampler = sample.AdaptiveParallelTemperingSampler(\n",
     "    internal_sampler=sample.MalaSampler(), n_chains=3\n",
@@ -513,10 +532,10 @@
    ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
-   "outputs": [],
    "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "sample.geweke_test(result)\n",
     "for i_chain in range(len(result.sample_result.betas)):\n",