From 5933c610cc84b65d6329eebb3b92fb11644579e9 Mon Sep 17 00:00:00 2001
From: Mykhailo Korobkov <m.korobkov@mil.ua>
Date: Sun, 24 May 2026 15:00:05 +0300
Subject: [PATCH] fix(capabilities): accept MLA architectures when full
 geometry is exposed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #96 wired MLA absorption (`mla_absorb::absorb`) into the f32 weight
writer: when an architecture reports `uses_mla() == true` AND exposes
all three of `mla_qk_nope_head_dim` / `mla_qk_rope_head_dim` /
`mla_v_head_dim`, the writer fuses the four low-rank tensors
(q_a / q_b / kv_a / kv_b) into standard dense Q/K/V/O at write time.
The on-disk manifest after that is a standard Q/K/V/O vindex —
exactly the layout `ensure_standard_attention_supported` requires.

But the gate still hard-rejected every `uses_mla()` arch, so the
absorption path was unreachable from `larql extract --level
inference/attention/all` for any DeepSeek-V2/V3/Kimi-K2 model. The CLI
failed before the writer could even try:

  $ larql extract --level inference DS-V2-Lite-Chat.Q4_K.gguf
  Error: unsupported architecture 'deepseek' for extract pipeline:
  multi-head latent attention (MLA) is not implemented

This commit narrows the gate to keep rejecting *only* MLA archs whose
geometry fields are missing (where absorption can't safely guess a
qk_head_dim split). Complete-geometry MLA archs pass through, and
`write_f32` runs the absorption path that PR #96 already shipped.

End-to-end verification: built locally on top of #135 (which surfaces
the MLA fields from GGUF metadata), extracted DeepSeek-V2-Lite-Chat
Q4_K (10.4 GB, 27 layers, kv_lora=512, qk_nope=128, qk_rope=64,
v_head=128) → 1.13 GB inference-level vindex with `attn_weights.bin`
sized at 216 MB (post-absorption standard QKVO).

Tests:
- `mla_with_full_geometry_is_accepted_so_absorption_can_run` — proves
  the lower-level `ensure_standard_attention_supported` accepts the
  complete-geometry case.
- `extract_level_inference_accepts_mla_with_full_geometry` — drives
  the CLI-facing gate at Inference AND All levels.
- All existing rejection tests still pass: incomplete-geometry MLA
  (the existing `mla_arch()` fixture has none of qk_nope/qk_rope/v_head)
  remains rejected at every level above Browse. 9/9 pass.

Combined with #133 / #135 / #136 this completes `larql extract` for
DeepSeek-V2 family GGUFs (and clears the last gate for Kimi K2 once
#135 + #136 merge).
---
 .../src/format/weights/capabilities.rs        | 78 +++++++++++++++++--
 1 file changed, 73 insertions(+), 5 deletions(-)

diff --git a/crates/larql-vindex/src/format/weights/capabilities.rs b/crates/larql-vindex/src/format/weights/capabilities.rs
index c8633da3f..efa19e8bb 100644
--- a/crates/larql-vindex/src/format/weights/capabilities.rs
+++ b/crates/larql-vindex/src/format/weights/capabilities.rs
@@ -14,16 +14,32 @@ const FEATURE_MLA: &str = "multi-head latent attention (MLA)";
 /// Q/K/V/O tensors. Architectures such as DeepSeek MLA expose a different
 /// tensor contract (`mla_*`) and must be implemented explicitly before the
 /// writer accepts them.
+///
+/// As of #96, the f32 writer absorbs MLA Q_a/Q_b/KV_a/KV_b into standard
+/// dense Q/K/V tensors at write time when full MLA geometry is known
+/// (`qk_nope_head_dim` / `qk_rope_head_dim` / `v_head_dim` all present).
+/// In that case MLA is accepted because the absorbed output is a standard
+/// Q/K/V/O manifest. MLA architectures without complete geometry still
+/// fail here — there's no defensible default split for `qk_head_dim`.
 pub(super) fn ensure_standard_attention_supported(
     arch: &dyn larql_models::ModelArchitecture,
     surface: &'static str,
 ) -> Result<(), VindexError> {
     if arch.uses_mla() {
-        return Err(VindexError::UnsupportedArchitecture {
-            family: arch.family().to_string(),
-            feature: FEATURE_MLA.into(),
-            surface: surface.into(),
-        });
+        // MLA absorption (#96) needs all three head-dim fields to recover
+        // the pre-absorption split. When any is missing we cannot run
+        // absorption safely, so the standard writer still has no way to
+        // represent the attention block — reject up front.
+        let has_geom = arch.mla_qk_nope_head_dim().is_some()
+            && arch.mla_qk_rope_head_dim().is_some()
+            && arch.mla_v_head_dim().is_some();
+        if !has_geom {
+            return Err(VindexError::UnsupportedArchitecture {
+                family: arch.family().to_string(),
+                feature: FEATURE_MLA.into(),
+                surface: surface.into(),
+            });
+        }
     }
 
     Ok(())
@@ -66,6 +82,9 @@ mod tests {
     const HEAD_DIM_TEST: usize = 128;
     const KV_LORA_RANK_TEST: usize = 512;
     const Q_LORA_RANK_TEST: usize = 1536;
+    const QK_NOPE_HEAD_DIM_TEST: usize = 128;
+    const QK_ROPE_HEAD_DIM_TEST: usize = 64;
+    const V_HEAD_DIM_TEST: usize = 128;
 
     #[test]
     fn standard_attention_accepts_llama() {
@@ -101,6 +120,8 @@ mod tests {
         assert!(msg.contains(TEST_Q4K_SURFACE), "{msg}");
     }
 
+    /// MLA arch without the qk_nope/qk_rope/v_head_dim fields — absorption
+    /// cannot run, so the standard writer must still reject it.
     fn mla_arch() -> Box<dyn larql_models::ModelArchitecture> {
         larql_models::detect_from_json(&serde_json::json!({
             "model_type": MODEL_TYPE_DEEPSEEK_V2,
@@ -115,6 +136,26 @@ mod tests {
         }))
     }
 
+    /// MLA arch with the full pre-absorption geometry exposed — `write_f32`
+    /// can absorb into a standard Q/K/V/O manifest, so the gate must
+    /// accept it.
+    fn mla_arch_with_geometry() -> Box<dyn larql_models::ModelArchitecture> {
+        larql_models::detect_from_json(&serde_json::json!({
+            "model_type": MODEL_TYPE_DEEPSEEK_V2,
+            "hidden_size": HIDDEN_SIZE_TEST,
+            "intermediate_size": INTERMEDIATE_SIZE_TEST,
+            "num_hidden_layers": NUM_LAYERS_TEST,
+            "num_attention_heads": NUM_ATTENTION_HEADS_TEST,
+            "num_key_value_heads": NUM_KV_HEADS_TEST,
+            "head_dim": HEAD_DIM_TEST,
+            "kv_lora_rank": KV_LORA_RANK_TEST,
+            "q_lora_rank": Q_LORA_RANK_TEST,
+            "qk_nope_head_dim": QK_NOPE_HEAD_DIM_TEST,
+            "qk_rope_head_dim": QK_ROPE_HEAD_DIM_TEST,
+            "v_head_dim": V_HEAD_DIM_TEST
+        }))
+    }
+
     fn llama_arch() -> Box<dyn larql_models::ModelArchitecture> {
         larql_models::detect_from_json(&serde_json::json!({
             "model_type": MODEL_TYPE_LLAMA,
@@ -166,4 +207,31 @@ mod tests {
             "Llama models with standard Q/K/V/O attention must pass at every level"
         );
     }
+
+    #[test]
+    fn mla_with_full_geometry_is_accepted_so_absorption_can_run() {
+        // PR #96 wires MLA absorption into the f32 writer. The gate must
+        // not block MLA archs whose qk_nope/qk_rope/v_head dims are exposed;
+        // the writer will fuse Q_a/Q_b/KV_a/KV_b into standard Q/K/V/O.
+        assert!(
+            ensure_standard_attention_supported(&*mla_arch_with_geometry(), TEST_SURFACE).is_ok(),
+            "MLA with full geometry must be accepted (post-#96 absorption path)"
+        );
+    }
+
+    #[test]
+    fn extract_level_inference_accepts_mla_with_full_geometry() {
+        // Gate check: DS-V2/V3/Kimi K2 GGUFs that expose qk_nope/qk_rope/v_head
+        // (PR #135 wired this through from `attention.key_length[_mla]` etc.)
+        // must pass the extract-level gate so the absorption path can run.
+        assert!(
+            ensure_extract_level_supported(&*mla_arch_with_geometry(), ExtractLevel::Inference)
+                .is_ok(),
+            "Inference extract should accept MLA when geometry is complete"
+        );
+        assert!(
+            ensure_extract_level_supported(&*mla_arch_with_geometry(), ExtractLevel::All).is_ok(),
+            "All-level extract should also accept MLA when geometry is complete"
+        );
+    }
 }