From 2bb2a95e27b443a2e2c24a90e3f089cbf527adcb Mon Sep 17 00:00:00 2001 From: Mykhailo Korobkov Date: Sun, 24 May 2026 15:18:52 +0300 Subject: [PATCH] fix(gguf): fall back to expert_feed_forward_length for MoE-only configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepSeek-V4 family (and other MoE-only GGUFs) emit only `{arch}.expert_feed_forward_length` — they never set the global `{arch}.feed_forward_length` because there is no dense FFN layer above the per-expert size. The current loader reads only the global key, so `intermediate_size` came back as `0` and config validation rejected the model with: Error: failed to load GGUF model: config validation failed: [ConfigValidationError { field: "intermediate_size", message: "must be greater than 0" }] repro: `larql extract --level browse \ DeepSeek-V4-Flash-Q3_K_M-00001-of-00003.gguf` The HF config exposes `intermediate_size` as a single number, and in every llama.cpp-supported architecture the per-expert FFN inner dim matches what a non-MoE variant would call `intermediate_size` (DS-V2-Lite-Chat happens to emit both at the same value when MoE is active). Falling back to the per-expert key on absence of the global key is the correct semantic. Tests: - `test_gguf_to_config_json_falls_back_to_expert_feed_forward_length_on_moe`: synthesises DS-V4-Flash-shaped metadata (no `feed_forward_length`, `expert_feed_forward_length: 2048`); verifies `intermediate_size: 2048` and that the validated detection path now accepts the config. - `test_gguf_to_config_json_prefers_global_feed_forward_length_when_both_present`: when both keys are emitted (some hybrid configs do), the global key still wins — no behaviour change for non-MoE-only models. 281/281 larql-models tests pass. This unblocks DeepSeek-V4-Flash extraction. Combined with #135 / #136 / #137, the only remaining piece for full Kimi-K2/DS-V4-Flash inference extraction is the per-tensor streaming GGUF reader (still in the in-memory load path; tracks separately). --- crates/larql-models/src/loading/gguf.rs | 118 +++++++++++++++++++++++- 1 file changed, 117 insertions(+), 1 deletion(-) diff --git a/crates/larql-models/src/loading/gguf.rs b/crates/larql-models/src/loading/gguf.rs index cfcd124b4..6afe9f216 100644 --- a/crates/larql-models/src/loading/gguf.rs +++ b/crates/larql-models/src/loading/gguf.rs @@ -38,6 +38,11 @@ const GGUF_GENERAL_ARCHITECTURE: &str = "general.architecture"; const GGUF_EMBEDDING_LENGTH: &str = "embedding_length"; const GGUF_BLOCK_COUNT: &str = "block_count"; const GGUF_FEED_FORWARD_LENGTH: &str = "feed_forward_length"; +// MoE-only architectures (DeepSeek-V4, etc.) omit the global +// `feed_forward_length` and emit only the per-expert size. We fall back +// to it so config validation doesn't reject the model with +// `intermediate_size: must be greater than 0`. +const GGUF_EXPERT_FEED_FORWARD_LENGTH: &str = "expert_feed_forward_length"; const GGUF_ATTENTION_HEAD_COUNT: &str = "attention.head_count"; const GGUF_ATTENTION_HEAD_COUNT_KV: &str = "attention.head_count_kv"; const GGUF_ATTENTION_KEY_LENGTH: &str = "attention.key_length"; @@ -395,11 +400,25 @@ impl GgufFile { num_heads }; + // intermediate_size: prefer the global `feed_forward_length`. For + // MoE-only models (DeepSeek-V4 family) the global key is omitted, + // so we fall back to the per-expert size. The HF config exposes + // `intermediate_size` as a single number even on MoE archs because + // it represents the FFN inner dim — per-expert and per-layer FFNs + // share that dim in every llama.cpp-supported architecture. + let intermediate_size = { + let global = get_arch_u32(GGUF_FEED_FORWARD_LENGTH); + if global > 0 { + global + } else { + get_arch_u32(GGUF_EXPERT_FEED_FORWARD_LENGTH) + } + }; let mut config = serde_json::json!({ HF_MODEL_TYPE: model_type, HF_HIDDEN_SIZE: hidden_size, HF_NUM_HIDDEN_LAYERS: get_arch_u32(GGUF_BLOCK_COUNT), - HF_INTERMEDIATE_SIZE: get_arch_u32(GGUF_FEED_FORWARD_LENGTH), + HF_INTERMEDIATE_SIZE: intermediate_size, HF_NUM_ATTENTION_HEADS: num_heads, HF_NUM_KEY_VALUE_HEADS: num_kv_heads, HF_HEAD_DIM: head_dim, @@ -1358,6 +1377,103 @@ mod tests { assert_eq!(arch.config().rope_base, 10_000.0); } + #[test] + fn test_gguf_to_config_json_falls_back_to_expert_feed_forward_length_on_moe() { + // DeepSeek-V4 family (and other MoE-only GGUFs) omit the global + // `feed_forward_length` and only emit the per-expert size. The + // HF config exposes `intermediate_size` as a single number, so + // the loader must surface the per-expert key into that field — + // otherwise downstream config validation rejects with + // `intermediate_size: must be greater than 0`. + let mut metadata = HashMap::new(); + metadata.insert( + "general.architecture".to_string(), + GgufValue::String("deepseek2".to_string()), + ); + metadata.insert( + "deepseek2.embedding_length".to_string(), + GgufValue::U32(4096), + ); + metadata.insert("deepseek2.block_count".to_string(), GgufValue::U32(43)); + metadata.insert( + "deepseek2.attention.head_count".to_string(), + GgufValue::U32(64), + ); + metadata.insert( + "deepseek2.attention.head_count_kv".to_string(), + GgufValue::U32(1), + ); + metadata.insert( + "deepseek2.attention.key_length".to_string(), + GgufValue::U32(128), + ); + // No `deepseek2.feed_forward_length` — MoE-only family. + metadata.insert( + "deepseek2.expert_feed_forward_length".to_string(), + GgufValue::U32(2048), + ); + metadata.insert("deepseek2.vocab_size".to_string(), GgufValue::U32(129280)); + + let gguf = GgufFile { + metadata, + tensor_infos: Vec::new(), + data_offset: 0, + path: std::path::PathBuf::from(""), + }; + let cfg = gguf.to_config_json(); + assert_eq!(cfg["intermediate_size"], 2048); + // And: this must now pass the validated detection path that + // previously rejected the model. + crate::detect_from_json_validated(&cfg) + .expect("MoE-only GGUF config should pass validation after fallback"); + } + + #[test] + fn test_gguf_to_config_json_prefers_global_feed_forward_length_when_both_present() { + // Some non-MoE / hybrid configs emit BOTH. We must keep using the + // global key in that case — the per-expert size is meaningful + // only when there are routed experts. + let mut metadata = HashMap::new(); + metadata.insert( + "general.architecture".to_string(), + GgufValue::String("deepseek2".to_string()), + ); + metadata.insert( + "deepseek2.embedding_length".to_string(), + GgufValue::U32(2048), + ); + metadata.insert("deepseek2.block_count".to_string(), GgufValue::U32(27)); + metadata.insert( + "deepseek2.attention.head_count".to_string(), + GgufValue::U32(16), + ); + metadata.insert( + "deepseek2.attention.head_count_kv".to_string(), + GgufValue::U32(16), + ); + metadata.insert( + "deepseek2.attention.key_length".to_string(), + GgufValue::U32(192), + ); + metadata.insert( + "deepseek2.feed_forward_length".to_string(), + GgufValue::U32(10944), + ); + metadata.insert( + "deepseek2.expert_feed_forward_length".to_string(), + GgufValue::U32(1408), + ); + + let gguf = GgufFile { + metadata, + tensor_infos: Vec::new(), + data_offset: 0, + path: std::path::PathBuf::from(""), + }; + let cfg = gguf.to_config_json(); + assert_eq!(cfg["intermediate_size"], 10944); + } + /// Build a minimal GGUF file with one 2-D F32 tensor, but truncate the /// tensor data region so that `offset + size > file len`. Loader must /// reject this cleanly, not panic on a slice OOB.