diff --git a/crates/larql-vindex/src/format/weights/capabilities.rs b/crates/larql-vindex/src/format/weights/capabilities.rs
index c8633da3f..efa19e8bb 100644
--- a/crates/larql-vindex/src/format/weights/capabilities.rs
+++ b/crates/larql-vindex/src/format/weights/capabilities.rs
@@ -14,16 +14,32 @@ const FEATURE_MLA: &str = "multi-head latent attention (MLA)";
 /// Q/K/V/O tensors. Architectures such as DeepSeek MLA expose a different
 /// tensor contract (`mla_*`) and must be implemented explicitly before the
 /// writer accepts them.
+///
+/// As of #96, the f32 writer absorbs MLA Q_a/Q_b/KV_a/KV_b into standard
+/// dense Q/K/V tensors at write time when full MLA geometry is known
+/// (`qk_nope_head_dim` / `qk_rope_head_dim` / `v_head_dim` all present).
+/// In that case MLA is accepted because the absorbed output is a standard
+/// Q/K/V/O manifest. MLA architectures without complete geometry still
+/// fail here — there's no defensible default split for `qk_head_dim`.
 pub(super) fn ensure_standard_attention_supported(
     arch: &dyn larql_models::ModelArchitecture,
     surface: &'static str,
 ) -> Result<(), VindexError> {
     if arch.uses_mla() {
-        return Err(VindexError::UnsupportedArchitecture {
-            family: arch.family().to_string(),
-            feature: FEATURE_MLA.into(),
-            surface: surface.into(),
-        });
+        // MLA absorption (#96) needs all three head-dim fields to recover
+        // the pre-absorption split. When any is missing we cannot run
+        // absorption safely, so the standard writer still has no way to
+        // represent the attention block — reject up front.
+        let has_geom = arch.mla_qk_nope_head_dim().is_some()
+            && arch.mla_qk_rope_head_dim().is_some()
+            && arch.mla_v_head_dim().is_some();
+        if !has_geom {
+            return Err(VindexError::UnsupportedArchitecture {
+                family: arch.family().to_string(),
+                feature: FEATURE_MLA.into(),
+                surface: surface.into(),
+            });
+        }
     }
 
     Ok(())
@@ -66,6 +82,9 @@ mod tests {
     const HEAD_DIM_TEST: usize = 128;
     const KV_LORA_RANK_TEST: usize = 512;
     const Q_LORA_RANK_TEST: usize = 1536;
+    const QK_NOPE_HEAD_DIM_TEST: usize = 128;
+    const QK_ROPE_HEAD_DIM_TEST: usize = 64;
+    const V_HEAD_DIM_TEST: usize = 128;
 
     #[test]
     fn standard_attention_accepts_llama() {
@@ -101,6 +120,8 @@ mod tests {
         assert!(msg.contains(TEST_Q4K_SURFACE), "{msg}");
     }
 
+    /// MLA arch without the qk_nope/qk_rope/v_head_dim fields — absorption
+    /// cannot run, so the standard writer must still reject it.
     fn mla_arch() -> Box {
         larql_models::detect_from_json(&serde_json::json!({
             "model_type": MODEL_TYPE_DEEPSEEK_V2,
@@ -115,6 +136,26 @@ mod tests {
         }))
     }
 
+    /// MLA arch with the full pre-absorption geometry exposed — `write_f32`
+    /// can absorb into a standard Q/K/V/O manifest, so the gate must
+    /// accept it.
+    fn mla_arch_with_geometry() -> Box {
+        larql_models::detect_from_json(&serde_json::json!({
+            "model_type": MODEL_TYPE_DEEPSEEK_V2,
+            "hidden_size": HIDDEN_SIZE_TEST,
+            "intermediate_size": INTERMEDIATE_SIZE_TEST,
+            "num_hidden_layers": NUM_LAYERS_TEST,
+            "num_attention_heads": NUM_ATTENTION_HEADS_TEST,
+            "num_key_value_heads": NUM_KV_HEADS_TEST,
+            "head_dim": HEAD_DIM_TEST,
+            "kv_lora_rank": KV_LORA_RANK_TEST,
+            "q_lora_rank": Q_LORA_RANK_TEST,
+            "qk_nope_head_dim": QK_NOPE_HEAD_DIM_TEST,
+            "qk_rope_head_dim": QK_ROPE_HEAD_DIM_TEST,
+            "v_head_dim": V_HEAD_DIM_TEST
+        }))
+    }
+
     fn llama_arch() -> Box {
         larql_models::detect_from_json(&serde_json::json!({
             "model_type": MODEL_TYPE_LLAMA,
@@ -166,4 +207,31 @@ mod tests {
             "Llama models with standard Q/K/V/O attention must pass at every level"
         );
     }
+
+    #[test]
+    fn mla_with_full_geometry_is_accepted_so_absorption_can_run() {
+        // PR #96 wires MLA absorption into the f32 writer. The gate must
+        // not block MLA archs whose qk_nope/qk_rope/v_head dims are exposed;
+        // the writer will fuse Q_a/Q_b/KV_a/KV_b into standard Q/K/V/O.
+        assert!(
+            ensure_standard_attention_supported(&*mla_arch_with_geometry(), TEST_SURFACE).is_ok(),
+            "MLA with full geometry must be accepted (post-#96 absorption path)"
+        );
+    }
+
+    #[test]
+    fn extract_level_inference_accepts_mla_with_full_geometry() {
+        // End-to-end: DS-V2/V3/Kimi K2 GGUFs that expose qk_nope/qk_rope/v_head
+        // (PR #135 wired this through from `attention.key_length[_mla]` etc.)
+        // must extract to inference level via the absorption path.
+        assert!(
+            ensure_extract_level_supported(&*mla_arch_with_geometry(), ExtractLevel::Inference)
+                .is_ok(),
+            "Inference extract should accept MLA when geometry is complete"
+        );
+        assert!(
+            ensure_extract_level_supported(&*mla_arch_with_geometry(), ExtractLevel::All).is_ok(),
+            "All-level extract should also accept MLA when geometry is complete"
+        );
+    }
 }