Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/turboquant-recommendations.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ These configurations showed promising results but have less validation depth:
| Q8_0 weights | `-ctk q8_0 -ctv turbo2` | phi-4 +3.1% |
| Q4_K_M, Qwen2.5-7B (AMD) | `-ctk q8_0 -ctv turbo3` | NaN on HIP (Metal gets +2.0%). HIP-specific, under investigation |

> ⚠️ **WARNING: q8_0/turbo3 produces NaN on HIP/AMD with models that have large K norms**
> (e.g. Qwen2.5-7B where K norms can reach 274). This is under active investigation.
> **Safe AMD alternative: q8_0/turbo4.**

### Boundary V (auto-enabled for turbo2-V)

A layer-aware V compression strategy that protects the first 2 + last 2 layers with q8_0-V while compressing all remaining layers with turbo2-V. **Auto-enabled when `-ctv turbo2` is set** on recent builds. Opt-out: `TURBO_LAYER_ADAPTIVE=0`. On older builds, activate with `TURBO_LAYER_ADAPTIVE=7`.
Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,11 @@ build-backend = "setuptools.build_meta"
[tool.pytest.ini_options]
# Refract's tests are the ones gating wheel releases.
testpaths = ["refract/tests"]

[tool.ruff]
line-length = 100
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "W", "I"]
ignore = ["E501"]
51 changes: 44 additions & 7 deletions tests/test_distortion.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,19 @@ def test_mse_decreases_with_bits(self):
)

def test_turboquant_improves_over_polarquant(self):
"""TurboQuant at b bits should have better IP than PolarQuant at b bits."""
"""TurboQuant (QJL variant) raw IP error should not exceed PolarQuant 1-bit alone.

NOTE on known behaviour: QJL is documented to be harmful for *softmax attention*
quality on some models (e.g. Qwen2.5-7B with large K norms), because the sign-based
residual correction introduces directional noise that the softmax nonlinearity
amplifies. See docs/papers/turbo4-resurrection.md for the full analysis.

In theory, for raw inner product distortion (before softmax), QJL should not make
things worse than PolarQuant alone at the same total bit-width, because the QJL
stage adds an unbiased correction term to the residual. In practice, measurements
show the opposite: QJL inflates raw IP error. This test therefore asserts that
PolarQuant 2-bit error is ≤ TurboQuant 2-bit error (documenting the regression),
and logs the PolarQuant 1-bit baseline for reference.
"""
d = 256
rng = np.random.default_rng(111)

Expand All @@ -167,7 +179,7 @@ def test_turboquant_improves_over_polarquant(self):
x, y = pairs[i]
pairs[i] = (x / np.linalg.norm(x), y / np.linalg.norm(y))

# PolarQuant 2-bit (MSE-only)
# PolarQuant 2-bit (MSE-only, same total bit-width as TurboQuant below)
pq = PolarQuant(d=d, bit_width=2, seed=42)
pq_errors = []
for x, y in pairs:
Expand All @@ -177,6 +189,16 @@ def test_turboquant_improves_over_polarquant(self):
y_hat = pq.dequantize(idx_y, n_y)
pq_errors.append(abs(np.dot(x, y) - np.dot(x_hat, y_hat)))

# PolarQuant 1-bit (same number of PolarQuant bits as TurboQuant's first stage)
pq_1bit = PolarQuant(d=d, bit_width=1, seed=42)
pq_1bit_errors = []
for x, y in pairs:
idx_x, n_x = pq_1bit.quantize(x)
idx_y, n_y = pq_1bit.quantize(y)
x_hat = pq_1bit.dequantize(idx_x, n_x)
y_hat = pq_1bit.dequantize(idx_y, n_y)
pq_1bit_errors.append(abs(np.dot(x, y) - np.dot(x_hat, y_hat)))

# TurboQuant 2-bit (PolarQuant 1-bit + QJL 1-bit)
tq = TurboQuant(d=d, bit_width=2, seed=42)
tq_errors = []
Expand All @@ -185,10 +207,25 @@ def test_turboquant_improves_over_polarquant(self):
y_hat = tq.dequantize(tq.quantize(y))
tq_errors.append(abs(np.dot(x, y) - np.dot(x_hat, y_hat)))

# TurboQuant should have lower IP distortion (that's the whole point of QJL)
# Not asserting strictly — just that TurboQuant is competitive
tq_avg = np.mean(tq_errors)
pq_avg = np.mean(pq_errors)
# Log for review
print(f"PolarQuant 2-bit avg IP error: {pq_avg:.6f}")
print(f"TurboQuant 2-bit avg IP error: {tq_avg:.6f}")
pq_1bit_avg = np.mean(pq_1bit_errors)

# Known finding (see docs/papers/turbo4-resurrection.md, issue #45):
# QJL is actively harmful for attention quality. This test documents the
# regression: TurboQuant 2-bit (PolarQuant 1-bit + QJL 1-bit) should be
# BETTER than PolarQuant at the same total bit budget (2-bit), but in
# practice QJL inflates distortion. The production path (TurboQuantMSE)
# omits QJL entirely and uses MSE-only PolarQuant.
#
# Assert that PolarQuant 2-bit (MSE-only) beats TurboQuant 2-bit (QJL):
# this is the regression we want to detect if QJL is ever "fixed".
assert pq_avg <= tq_avg, (
f"Unexpected: TurboQuant 2-bit ({tq_avg:.6f}) now beats PolarQuant 2-bit "
f"({pq_avg:.6f}) — QJL may have been fixed. Re-evaluate whether QJL "
f"should be re-enabled in the production path."
)

print(f"PolarQuant 1-bit avg IP error: {pq_1bit_avg:.6f}")
print(f"PolarQuant 2-bit avg IP error: {pq_avg:.6f} ← production path")
print(f"TurboQuant 2-bit avg IP error: {tq_avg:.6f} ← QJL adds noise")
191 changes: 188 additions & 3 deletions tests/test_kv_cache.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Tests for KV cache integration layer."""

import tempfile
import numpy as np
import pytest

from turboquant.kv_cache import KVCacheCompressor
from turboquant.kv_cache import KVCacheCompressor, CompressedKVCache
from turboquant.turboquant import CompressedVector


class TestKVCacheCompressor:
Expand Down Expand Up @@ -102,8 +104,9 @@ def test_memory_stats(self):
compressor = KVCacheCompressor(head_dim=128, k_bits=3, v_bits=3)
stats = compressor.memory_stats(seq_len=1024, num_layers=32, num_heads=32)

# K: 3 bits/val + norm overhead, V: 3 bits/val
# Ratio vs fp16 (16 bits): 16 / ((3+3)/2 + overhead) ≈ 2.5-3x
# K: 3 bits/val + 32-bit norm, V: 3 bits/val + 32-bit norm
# Both K and V include per-vector norm (float32) for rescaling.
# Ratio vs fp16 (16 bits/val): roughly 16 / (3 + norm overhead) per value; the
# reported ratio here is ≈ 2.46x — see memory_stats() for the exact accounting.
assert stats["compression_ratio"] > 2.0
assert stats["compressed_mb"] < stats["original_mb"]

Expand All @@ -125,6 +128,188 @@ def test_metadata_stored(self):
assert compressed.v_bit_width == 3


class TestCompressedVectorSerialization:
    """Round-trip and error-path tests for CompressedVector.to_bytes() / from_bytes()."""

    @staticmethod
    def _check_round_trip(cv):
        """Serialize *cv*, deserialize it, and verify every field survives intact."""
        restored = CompressedVector.from_bytes(cv.to_bytes())
        assert restored.bit_width == cv.bit_width
        np.testing.assert_array_equal(restored.mse_indices, cv.mse_indices)
        np.testing.assert_allclose(restored.vector_norms, cv.vector_norms)
        np.testing.assert_array_equal(restored.qjl_signs, cv.qjl_signs)
        np.testing.assert_allclose(restored.residual_norms, cv.residual_norms)

    def test_round_trip_single_vector(self):
        """A single quantized vector survives a serialization round trip."""
        from turboquant.turboquant import TurboQuant

        dim = 64
        quantizer = TurboQuant(d=dim, bit_width=3, seed=42)
        sample = np.random.default_rng(1).standard_normal(dim)
        self._check_round_trip(quantizer.quantize(sample))

    def test_round_trip_batch(self):
        """A batch of quantized vectors survives a serialization round trip."""
        from turboquant.turboquant import TurboQuant

        dim, batch_size = 64, 8
        quantizer = TurboQuant(d=dim, bit_width=2, seed=7)
        samples = np.random.default_rng(2).standard_normal((batch_size, dim))
        self._check_round_trip(quantizer.quantize(samples))

    def test_invalid_magic_raises(self):
        """from_bytes() should raise ValueError on corrupt/wrong data."""
        bad_data = b"XXXX" + b"\x00" * 20
        with pytest.raises(ValueError, match="Invalid magic bytes"):
            CompressedVector.from_bytes(bad_data)


class TestCompressedKVCacheSaveLoad:
    """Tests for CompressedKVCache.save() / load()."""

    def test_save_load_round_trip(self):
        """Save and load should produce a cache that decompresses to the same result.

        Uses tempfile.TemporaryDirectory instead of NamedTemporaryFile(delete=False):
        the directory is cleaned up automatically even if save()/load() raises, and it
        avoids the Windows restriction on re-opening a path whose temp-file handle is
        still open.
        """
        import os

        head_dim = 64
        num_layers, num_heads, seq_len = 2, 2, 8

        compressor = KVCacheCompressor(head_dim=head_dim, k_bits=3, v_bits=3, seed=42)
        rng = np.random.default_rng(99)
        k = rng.standard_normal((num_layers, num_heads, seq_len, head_dim))
        v = rng.standard_normal((num_layers, num_heads, seq_len, head_dim))

        original_cache = compressor.compress(k, v)
        k_orig, v_orig = compressor.decompress(original_cache)

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cache.npz")
            original_cache.save(path)
            loaded_cache = CompressedKVCache.load(path)

        # Metadata must survive the round trip.
        assert loaded_cache.num_layers == num_layers
        assert loaded_cache.num_heads == num_heads
        assert loaded_cache.seq_len == seq_len
        assert loaded_cache.head_dim == head_dim
        assert loaded_cache.k_bit_width == 3
        assert loaded_cache.v_bit_width == 3

        # Decompressed tensors must match the pre-save decompression bit-for-bit
        # (up to float tolerance introduced by on-disk float32 storage, if any).
        k_loaded, v_loaded = compressor.decompress(loaded_cache)
        np.testing.assert_allclose(k_loaded, k_orig, atol=1e-6,
                                   err_msg="K cache changed after save/load")
        np.testing.assert_allclose(v_loaded, v_orig, atol=1e-6,
                                   err_msg="V cache changed after save/load")


class TestStreamingAPI:
    """Tests for the compress_token() / get_compressed_cache() streaming API.

    Note: CompressedKVCache is imported once at module level; the previous
    per-test `from turboquant.kv_cache import CompressedKVCache` re-imports
    were redundant and have been removed.
    """

    def test_streaming_produces_same_result_as_batch(self):
        """Token-by-token streaming should produce the same compressed output as batch compress.

        Both compressors are built with the same seed, hence the same rotation
        matrices and codebooks, so per-token compression must match the
        batch-compressed result exactly.
        """
        head_dim = 64
        num_layers, num_heads, seq_len = 2, 2, 8

        rng = np.random.default_rng(42)
        k_cache = rng.standard_normal((num_layers, num_heads, seq_len, head_dim))
        v_cache = rng.standard_normal((num_layers, num_heads, seq_len, head_dim))

        # Batch compress
        compressor_batch = KVCacheCompressor(head_dim=head_dim, k_bits=3, v_bits=3, seed=42)
        batch_compressed = compressor_batch.compress(k_cache, v_cache)

        # Stream token-by-token (same seed → same quantizer state)
        compressor_stream = KVCacheCompressor(head_dim=head_dim, k_bits=3, v_bits=3, seed=42)
        for t in range(seq_len):
            for layer in range(num_layers):
                for head in range(num_heads):
                    compressor_stream.compress_token(
                        k_cache[layer, head, t, :],
                        v_cache[layer, head, t, :],
                        layer=layer, head=head,
                    )

        stream_compressed = compressor_stream.get_compressed_cache()

        # Check metadata
        assert stream_compressed.num_layers == num_layers
        assert stream_compressed.num_heads == num_heads
        assert stream_compressed.seq_len == seq_len

        # Check that decompressed results match
        k_batch, v_batch = compressor_batch.decompress(batch_compressed)
        k_stream, v_stream = compressor_stream.decompress(stream_compressed)

        np.testing.assert_allclose(k_stream, k_batch, atol=1e-10,
                                   err_msg="Streaming K cache differs from batch K cache")
        np.testing.assert_allclose(v_stream, v_batch, atol=1e-10,
                                   err_msg="Streaming V cache differs from batch V cache")

    def test_get_compressed_cache_returns_valid_cache(self):
        """get_compressed_cache() returns a CompressedKVCache that decompresses without error."""
        head_dim = 64
        compressor = KVCacheCompressor(head_dim=head_dim, k_bits=3, v_bits=3, seed=7)
        rng = np.random.default_rng(55)

        num_layers, num_heads, seq_len = 1, 2, 4
        for t in range(seq_len):
            for layer in range(num_layers):
                for head in range(num_heads):
                    compressor.compress_token(
                        rng.standard_normal(head_dim),
                        rng.standard_normal(head_dim),
                        layer=layer, head=head,
                    )

        cache = compressor.get_compressed_cache()

        assert isinstance(cache, CompressedKVCache)
        assert cache.num_layers == num_layers
        assert cache.num_heads == num_heads
        assert cache.seq_len == seq_len
        assert cache.head_dim == head_dim
        assert cache.k_bit_width == 3
        assert cache.v_bit_width == 3

        # Should decompress without error
        k_hat, v_hat = compressor.decompress(cache)
        assert k_hat.shape == (num_layers, num_heads, seq_len, head_dim)
        assert v_hat.shape == (num_layers, num_heads, seq_len, head_dim)

    def test_get_compressed_cache_empty(self):
        """get_compressed_cache() on a fresh compressor returns an empty cache."""
        compressor = KVCacheCompressor(head_dim=64, k_bits=3, v_bits=3)
        cache = compressor.get_compressed_cache()

        assert isinstance(cache, CompressedKVCache)
        assert cache.num_layers == 0
        assert cache.num_heads == 0
        assert cache.seq_len == 0


def _softmax(x):
"""Simple softmax for testing."""
e = np.exp(x - np.max(x, axis=-1, keepdims=True))
Expand Down
Loading