From 364044efdb9c8f17875e2e178a2a3ecd236f4e43 Mon Sep 17 00:00:00 2001
From: Thomas Schmelzer <thomas.schmelzer@gmail.com>
Date: Fri, 3 Jul 2026 19:27:35 +0400
Subject: [PATCH] fix: slice the system operator to the active set once per
 outer step

The operator adoption (#71) built full-universe operators and called
apply_free(active_idx, v) inside the CG matvec, which re-gathers the
columns of X (and the target blocks) on every iteration: ~10 MB of
memcpy per iteration against ~50 us of useful GEMV, an ~8x wall-clock
regression at identical iteration counts (16x without shrinkage).

_system_operator now takes the active mask and builds the SumOperator
on pre-sliced contiguous factors (X[:, active], U_k[active, :], the
dense target block), restoring the pre-#71 hoisting; the per-iteration
matvec is a plain operator product. Fixes #72.

Benchmark (S&P 500, n=494, T=1213, LW alpha=0.5, best of 3):
  before  0.0475 s / 82 iters
  after   0.0063 s / 83 iters
  parent of #71: 0.0058 s / 82 iters

Full suite: 379 passed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/fast_minimum_variance/minvar_problem.py | 38 ++++++++++++---------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/fast_minimum_variance/minvar_problem.py b/src/fast_minimum_variance/minvar_problem.py
index c50c654..9df396a 100644
--- a/src/fast_minimum_variance/minvar_problem.py
+++ b/src/fast_minimum_variance/minvar_problem.py
@@ -197,30 +197,38 @@ def _cvxpy_constraints(self, w: Any, cp: Any) -> list[Any]:
         """Return budget-equality and long-only inequality constraints for CVXPY."""
         return [cp.sum(w) == 1, w >= 0]
 
-    def _system_operator(self) -> SumOperator:
-        """Build ``Sigma = (1-alpha)/T * X^T X + alpha * T0`` as a cvx-linalg operator.
+    def _system_operator(self, active: np.ndarray) -> SumOperator:
+        """Build the active-set system ``Sigma_a`` as a cvx-linalg operator.
 
         A :class:`~cvx.linalg.SumOperator` of the data Gram term and, when present,
         the target term (a :class:`~cvx.linalg.FactorOperator` for a low-rank RMT
-        target, else a :class:`~cvx.linalg.DenseOperator`). The full-universe
-        operators are sliced to the active set via ``apply_free``; nothing is
-        formed at ``n x n``. Without a target the data term carries the full weight.
+        target, else a :class:`~cvx.linalg.DenseOperator`). The factors are sliced
+        to the active set HERE, once per outer step, so the per-iteration
+        ``matvec`` runs on contiguous pre-sliced arrays; slicing inside the CG
+        loop (``apply_free`` on full-universe operators) re-gathers the columns
+        of ``X`` on every iteration and is an order of magnitude slower.
+        Nothing is formed at ``n_a x n_a``; without a target the data term
+        carries the full weight.
+
+        Args:
+            active: Boolean mask selecting the active asset subset.
         """
         has_target = self.target_lr is not None or self.target is not None
         c_data = (1.0 - self.alpha) if has_target else 1.0
-        terms: list[tuple[float, Any]] = [(c_data / self.t, GramOperator(self.X))]
+        terms: list[tuple[float, Any]] = [(c_data / self.t, GramOperator(np.ascontiguousarray(self.X[:, active])))]
         if self.target_lr is not None:
             bar_lam, u_k, delta_k = self.target_lr
-            terms.append((self.alpha, FactorOperator(np.full(u_k.shape[0], bar_lam), u_k, np.diag(delta_k))))
+            u_a = u_k[active, :]
+            terms.append((self.alpha, FactorOperator(np.full(u_a.shape[0], bar_lam), u_a, np.diag(delta_k))))
         elif self.target is not None:
-            terms.append((self.alpha, DenseOperator(self.target)))
+            terms.append((self.alpha, DenseOperator(self.target[np.ix_(active, active)])))
         return SumOperator(terms)
 
     def _cg_step(self, active: np.ndarray, x0: np.ndarray | None = None) -> tuple[np.ndarray, int]:
         """Solve the reduced SPD system via matrix-free CG; return ``(w_a, iters)``.
 
         Runs conjugate gradients over the active-set system operator
-        (:meth:`_system_operator`, applied through ``apply_free``) without ever
+        (:meth:`_system_operator`, sliced once per outer step) without ever
         forming ``Sigma_a`` explicitly. Low-rank and dense targets share one path.
 
         Args:
@@ -229,14 +237,13 @@ def _cg_step(self, active: np.ndarray, x0: np.ndarray | None = None) -> tuple[np
                 When provided it must have shape ``(active.sum(),)``.
         """
         n_a = int(active.sum())
-        sigma = self._system_operator()
-        active_idx = np.flatnonzero(active)
+        sigma = self._system_operator(active)
         count = [0]
 
         def matvec(v: np.ndarray) -> np.ndarray:
-            """Apply Sigma_a to v via the operator's free-block product."""
+            """Apply Sigma_a to v via the pre-sliced operator."""
             count[0] += 1
-            return sigma.apply_free(active_idx, v)
+            return sigma.matvec(v)
 
         op = LinearOperator((n_a, n_a), matvec=matvec, dtype=np.float64)  # ty:ignore[missing-argument, parameter-already-assigned, unknown-argument]
 
@@ -261,14 +268,13 @@ def _pcg_step(self, active: np.ndarray, x0: np.ndarray | None = None) -> tuple[n
         n_a = int(active.sum())
 
         # System matvec — the same active-set operator as _cg_step.
-        sigma = self._system_operator()
-        active_idx = np.flatnonzero(active)
+        sigma = self._system_operator(active)
         count = [0]
 
         def matvec(v: np.ndarray) -> np.ndarray:
             """Apply the active-set system matrix Sigma_a to v."""
             count[0] += 1
-            return sigma.apply_free(active_idx, v)
+            return sigma.matvec(v)
 
         op = LinearOperator((n_a, n_a), matvec=matvec, dtype=np.float64)  # ty:ignore[missing-argument, parameter-already-assigned, unknown-argument]