From 5933c610cc84b65d6329eebb3b92fb11644579e9 Mon Sep 17 00:00:00 2001 From: Mykhailo Korobkov Date: Sun, 24 May 2026 15:00:05 +0300 Subject: [PATCH] fix(capabilities): accept MLA architectures when full geometry is exposed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #96 wired MLA absorption (`mla_absorb::absorb`) into the f32 weight writer: when an architecture reports `uses_mla() == true` AND exposes all three of `mla_qk_nope_head_dim` / `mla_qk_rope_head_dim` / `mla_v_head_dim`, the writer fuses the four low-rank tensors (q_a / q_b / kv_a / kv_b) into standard dense Q/K/V/O at write time. The on-disk manifest after that is a standard Q/K/V/O vindex — exactly what `ensure_standard_attention_supported` is gating against. But the gate still hard-rejected every `uses_mla()` arch, so the absorption path was unreachable from `larql extract --level inference/attention/all` for any DeepSeek-V2/V3/Kimi-K2 model. The CLI failed before the writer could even try: $ larql extract --level inference DS-V2-Lite-Chat.Q4_K.gguf Error: unsupported architecture 'deepseek' for extract pipeline: multi-head latent attention (MLA) is not implemented This commit narrows the gate to keep rejecting *only* MLA archs whose geometry fields are missing (where absorption can't safely guess a qk_head_dim split). Complete-geometry MLA archs pass through, and `write_f32` runs the absorption path that PR #96 already shipped. End-to-end verification: built locally on top of #135 (which surfaces the MLA fields from GGUF metadata), extracted DeepSeek-V2-Lite-Chat Q4_K (10.4 GB, 27 layers, kv_lora=512, qk_nope=128, qk_rope=64, v_head=128) → 1.13 GB inference-level vindex with `attn_weights.bin` sized at 216 MB (post-absorption standard QKVO). Tests: - `mla_with_full_geometry_is_accepted_so_absorption_can_run` — proves the lower-level `ensure_standard_attention_supported` accepts the complete-geometry case. 
- `extract_level_inference_accepts_mla_with_full_geometry` — drives the CLI-facing gate at Inference AND All levels. - All existing rejection tests still pass: incomplete-geometry MLA (the existing `mla_arch()` fixture has none of qk_nope/qk_rope/v_head) remains rejected at every level above Browse. 9/9 pass. Combined with #133 / #135 / #136 this completes `larql extract` for DeepSeek-V2 family GGUFs (and clears the last gate for Kimi K2 once #135 + #136 merge). --- .../src/format/weights/capabilities.rs | 78 +++++++++++++++++-- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/crates/larql-vindex/src/format/weights/capabilities.rs b/crates/larql-vindex/src/format/weights/capabilities.rs index c8633da3f..efa19e8bb 100644 --- a/crates/larql-vindex/src/format/weights/capabilities.rs +++ b/crates/larql-vindex/src/format/weights/capabilities.rs @@ -14,16 +14,32 @@ const FEATURE_MLA: &str = "multi-head latent attention (MLA)"; /// Q/K/V/O tensors. Architectures such as DeepSeek MLA expose a different /// tensor contract (`mla_*`) and must be implemented explicitly before the /// writer accepts them. +/// +/// As of #96, the f32 writer absorbs MLA Q_a/Q_b/KV_a/KV_b into standard +/// dense Q/K/V tensors at write time when full MLA geometry is known +/// (`qk_nope_head_dim` / `qk_rope_head_dim` / `v_head_dim` all present). +/// In that case MLA is accepted because the absorbed output is a standard +/// Q/K/V/O manifest. MLA architectures without complete geometry still +/// fail here — there's no defensible default split for `qk_head_dim`. pub(super) fn ensure_standard_attention_supported( arch: &dyn larql_models::ModelArchitecture, surface: &'static str, ) -> Result<(), VindexError> { if arch.uses_mla() { - return Err(VindexError::UnsupportedArchitecture { - family: arch.family().to_string(), - feature: FEATURE_MLA.into(), - surface: surface.into(), - }); + // MLA absorption (#96) needs all three head-dim fields to recover + // the pre-absorption split. 
When any is missing we cannot run + // absorption safely, so the standard writer still has no way to + // represent the attention block — reject up front. + let has_geom = arch.mla_qk_nope_head_dim().is_some() + && arch.mla_qk_rope_head_dim().is_some() + && arch.mla_v_head_dim().is_some(); + if !has_geom { + return Err(VindexError::UnsupportedArchitecture { + family: arch.family().to_string(), + feature: FEATURE_MLA.into(), + surface: surface.into(), + }); + } } Ok(()) @@ -66,6 +82,9 @@ mod tests { const HEAD_DIM_TEST: usize = 128; const KV_LORA_RANK_TEST: usize = 512; const Q_LORA_RANK_TEST: usize = 1536; + const QK_NOPE_HEAD_DIM_TEST: usize = 128; + const QK_ROPE_HEAD_DIM_TEST: usize = 64; + const V_HEAD_DIM_TEST: usize = 128; #[test] fn standard_attention_accepts_llama() { @@ -101,6 +120,8 @@ mod tests { assert!(msg.contains(TEST_Q4K_SURFACE), "{msg}"); } + /// MLA arch without the qk_nope/qk_rope/v_head_dim fields — absorption + /// cannot run, so the standard writer must still reject it. fn mla_arch() -> Box<dyn larql_models::ModelArchitecture> { larql_models::detect_from_json(&serde_json::json!({ "model_type": MODEL_TYPE_DEEPSEEK_V2, @@ -115,6 +136,26 @@ mod tests { })) } + /// MLA arch with the full pre-absorption geometry exposed — `write_f32` + /// can absorb into a standard Q/K/V/O manifest, so the gate must + /// accept it. 
+ fn mla_arch_with_geometry() -> Box<dyn larql_models::ModelArchitecture> { + larql_models::detect_from_json(&serde_json::json!({ + "model_type": MODEL_TYPE_DEEPSEEK_V2, + "hidden_size": HIDDEN_SIZE_TEST, + "intermediate_size": INTERMEDIATE_SIZE_TEST, + "num_hidden_layers": NUM_LAYERS_TEST, + "num_attention_heads": NUM_ATTENTION_HEADS_TEST, + "num_key_value_heads": NUM_KV_HEADS_TEST, + "head_dim": HEAD_DIM_TEST, + "kv_lora_rank": KV_LORA_RANK_TEST, + "q_lora_rank": Q_LORA_RANK_TEST, + "qk_nope_head_dim": QK_NOPE_HEAD_DIM_TEST, + "qk_rope_head_dim": QK_ROPE_HEAD_DIM_TEST, + "v_head_dim": V_HEAD_DIM_TEST + })) + } + fn llama_arch() -> Box<dyn larql_models::ModelArchitecture> { larql_models::detect_from_json(&serde_json::json!({ "model_type": MODEL_TYPE_LLAMA, @@ -166,4 +207,31 @@ mod tests { "Llama models with standard Q/K/V/O attention must pass at every level" ); } + + #[test] + fn mla_with_full_geometry_is_accepted_so_absorption_can_run() { + // PR #96 wires MLA absorption into the f32 writer. The gate must + // not block MLA archs whose qk_nope/qk_rope/v_head dims are exposed; + // the writer will fuse Q_a/Q_b/KV_a/KV_b into standard Q/K/V/O. + assert!( + ensure_standard_attention_supported(&*mla_arch_with_geometry(), TEST_SURFACE).is_ok(), + "MLA with full geometry must be accepted (post-#96 absorption path)" + ); + } + + #[test] + fn extract_level_inference_accepts_mla_with_full_geometry() { + // End-to-end: DS-V2/V3/Kimi K2 GGUFs that expose qk_nope/qk_rope/v_head + // (PR #135 wired this through from `attention.key_length[_mla]` etc.) + // must extract to inference level via the absorption path. + assert!( + ensure_extract_level_supported(&*mla_arch_with_geometry(), ExtractLevel::Inference) + .is_ok(), + "Inference extract should accept MLA when geometry is complete" + ); + assert!( + ensure_extract_level_supported(&*mla_arch_with_geometry(), ExtractLevel::All).is_ok(), + "All-level extract should also accept MLA when geometry is complete" + ); + } }