Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 73 additions & 5 deletions crates/larql-vindex/src/format/weights/capabilities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,32 @@ const FEATURE_MLA: &str = "multi-head latent attention (MLA)";
/// Q/K/V/O tensors. Architectures such as DeepSeek MLA expose a different
/// tensor contract (`mla_*`) and must be implemented explicitly before the
/// writer accepts them.
///
/// As of #96, the f32 writer absorbs MLA Q_a/Q_b/KV_a/KV_b into standard
/// dense Q/K/V tensors at write time when full MLA geometry is known
/// (`qk_nope_head_dim` / `qk_rope_head_dim` / `v_head_dim` all present).
/// In that case MLA is accepted because the absorbed output is a standard
/// Q/K/V/O manifest. MLA architectures without complete geometry still
/// fail here — there's no defensible default split for `qk_head_dim`.
pub(super) fn ensure_standard_attention_supported(
arch: &dyn larql_models::ModelArchitecture,
surface: &'static str,
) -> Result<(), VindexError> {
if arch.uses_mla() {
return Err(VindexError::UnsupportedArchitecture {
family: arch.family().to_string(),
feature: FEATURE_MLA.into(),
surface: surface.into(),
});
// MLA absorption (#96) needs all three head-dim fields to recover
// the pre-absorption split. When any is missing we cannot run
// absorption safely, so the standard writer still has no way to
// represent the attention block — reject up front.
let has_geom = arch.mla_qk_nope_head_dim().is_some()
&& arch.mla_qk_rope_head_dim().is_some()
&& arch.mla_v_head_dim().is_some();
if !has_geom {
return Err(VindexError::UnsupportedArchitecture {
family: arch.family().to_string(),
feature: FEATURE_MLA.into(),
surface: surface.into(),
});
}
}

Ok(())
Expand Down Expand Up @@ -66,6 +82,9 @@ mod tests {
// Synthetic model dimensions plugged into the JSON configs built by the test
// helpers (`head_dim`, `kv_lora_rank`, `q_lora_rank`).
const HEAD_DIM_TEST: usize = 128;
const KV_LORA_RANK_TEST: usize = 512;
const Q_LORA_RANK_TEST: usize = 1536;
// MLA head geometry (`qk_nope_head_dim`, `qk_rope_head_dim`, `v_head_dim`)
// supplied by `mla_arch_with_geometry`; the attention gate requires all three
// to be present before it accepts an MLA architecture.
const QK_NOPE_HEAD_DIM_TEST: usize = 128;
const QK_ROPE_HEAD_DIM_TEST: usize = 64;
const V_HEAD_DIM_TEST: usize = 128;

#[test]
fn standard_attention_accepts_llama() {
Expand Down Expand Up @@ -101,6 +120,8 @@ mod tests {
assert!(msg.contains(TEST_Q4K_SURFACE), "{msg}");
}

/// MLA arch without the qk_nope/qk_rope/v_head_dim fields — absorption
/// cannot run, so the standard writer must still reject it.
fn mla_arch() -> Box<dyn larql_models::ModelArchitecture> {
larql_models::detect_from_json(&serde_json::json!({
"model_type": MODEL_TYPE_DEEPSEEK_V2,
Expand All @@ -115,6 +136,26 @@ mod tests {
}))
}

/// Builds a DeepSeek-V2 architecture whose config carries the complete
/// pre-absorption MLA geometry (`qk_nope_head_dim`, `qk_rope_head_dim`,
/// `v_head_dim`) alongside the LoRA ranks. With all three dims exposed,
/// `write_f32` can absorb MLA into a standard Q/K/V/O manifest, so the gate
/// must let this architecture through.
fn mla_arch_with_geometry() -> Box<dyn larql_models::ModelArchitecture> {
    // Baseline transformer shape first, then the MLA-specific fields.
    let config = serde_json::json!({
        "model_type": MODEL_TYPE_DEEPSEEK_V2,
        "hidden_size": HIDDEN_SIZE_TEST,
        "intermediate_size": INTERMEDIATE_SIZE_TEST,
        "num_hidden_layers": NUM_LAYERS_TEST,
        "num_attention_heads": NUM_ATTENTION_HEADS_TEST,
        "num_key_value_heads": NUM_KV_HEADS_TEST,
        "head_dim": HEAD_DIM_TEST,
        "kv_lora_rank": KV_LORA_RANK_TEST,
        "q_lora_rank": Q_LORA_RANK_TEST,
        "qk_nope_head_dim": QK_NOPE_HEAD_DIM_TEST,
        "qk_rope_head_dim": QK_ROPE_HEAD_DIM_TEST,
        "v_head_dim": V_HEAD_DIM_TEST
    });
    larql_models::detect_from_json(&config)
}

fn llama_arch() -> Box<dyn larql_models::ModelArchitecture> {
larql_models::detect_from_json(&serde_json::json!({
"model_type": MODEL_TYPE_LLAMA,
Expand Down Expand Up @@ -166,4 +207,31 @@ mod tests {
"Llama models with standard Q/K/V/O attention must pass at every level"
);
}

#[test]
fn mla_with_full_geometry_is_accepted_so_absorption_can_run() {
    // Since PR #96 the f32 writer fuses Q_a/Q_b/KV_a/KV_b into standard
    // Q/K/V/O, so an MLA arch that exposes its qk_nope/qk_rope/v_head dims
    // must pass the standard-attention gate rather than be rejected.
    let arch = mla_arch_with_geometry();
    let verdict = ensure_standard_attention_supported(&*arch, TEST_SURFACE);
    assert!(
        verdict.is_ok(),
        "MLA with full geometry must be accepted (post-#96 absorption path)"
    );
}

#[test]
fn extract_level_inference_accepts_mla_with_full_geometry() {
    // End-to-end check: DS-V2/V3/Kimi K2 GGUFs that expose
    // qk_nope/qk_rope/v_head (wired through from
    // `attention.key_length[_mla]` etc. in PR #135) must extract via the
    // absorption path at both the inference level and the all level.
    let cases = [
        (
            ExtractLevel::Inference,
            "Inference extract should accept MLA when geometry is complete",
        ),
        (
            ExtractLevel::All,
            "All-level extract should also accept MLA when geometry is complete",
        ),
    ];

    for (level, failure_msg) in cases {
        // Build a fresh architecture per level, as each check is independent.
        let arch = mla_arch_with_geometry();
        assert!(
            ensure_extract_level_supported(&*arch, level).is_ok(),
            "{failure_msg}"
        );
    }
}
}
Loading