From 2bb2a95e27b443a2e2c24a90e3f089cbf527adcb Mon Sep 17 00:00:00 2001
From: Mykhailo Korobkov <m.korobkov@mil.ua>
Date: Sun, 24 May 2026 15:18:52 +0300
Subject: [PATCH] fix(gguf): fall back to expert_feed_forward_length for
 MoE-only configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeepSeek-V4 family (and other MoE-only GGUFs) emit only
`{arch}.expert_feed_forward_length` — they never set the global
`{arch}.feed_forward_length` because there is no dense FFN layer
above the per-expert size. The current loader reads only the global
key, so `intermediate_size` came back as `0` and config validation
rejected the model with:

    Error: failed to load GGUF model: config validation failed:
    [ConfigValidationError { field: "intermediate_size",
     message: "must be greater than 0" }]

repro: `larql extract --level browse \
            DeepSeek-V4-Flash-Q3_K_M-00001-of-00003.gguf`

The HF config exposes `intermediate_size` as a single number, and
in every llama.cpp-supported architecture the per-expert FFN inner
dim matches what a non-MoE variant would call `intermediate_size`
(DS-V2-Lite-Chat happens to emit both at the same value when MoE
is active). Falling back to the per-expert key on absence of the
global key is the correct semantic.

Tests:
- `test_gguf_to_config_json_falls_back_to_expert_feed_forward_length_on_moe`:
  synthesises DS-V4-Flash-shaped metadata (no `feed_forward_length`,
  `expert_feed_forward_length: 2048`); verifies `intermediate_size: 2048`
  and that the validated detection path now accepts the config.
- `test_gguf_to_config_json_prefers_global_feed_forward_length_when_both_present`:
  when both keys are emitted (some hybrid configs do), the global key
  still wins — no behaviour change for non-MoE-only models.

281/281 larql-models tests pass.

This unblocks DeepSeek-V4-Flash extraction. Combined with #135 / #136 /
#137, the only remaining piece for full Kimi-K2/DS-V4-Flash inference
extraction is the per-tensor streaming GGUF reader (still in the
in-memory load path; tracks separately).
---
 crates/larql-models/src/loading/gguf.rs | 118 +++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 1 deletion(-)

diff --git a/crates/larql-models/src/loading/gguf.rs b/crates/larql-models/src/loading/gguf.rs
index cfcd124b4..6afe9f216 100644
--- a/crates/larql-models/src/loading/gguf.rs
+++ b/crates/larql-models/src/loading/gguf.rs
@@ -38,6 +38,11 @@ const GGUF_GENERAL_ARCHITECTURE: &str = "general.architecture";
 const GGUF_EMBEDDING_LENGTH: &str = "embedding_length";
 const GGUF_BLOCK_COUNT: &str = "block_count";
 const GGUF_FEED_FORWARD_LENGTH: &str = "feed_forward_length";
+// MoE-only architectures (DeepSeek-V4, etc.) omit the global
+// `feed_forward_length` and emit only the per-expert size. We fall back
+// to it so config validation doesn't reject the model with
+// `intermediate_size: must be greater than 0`.
+const GGUF_EXPERT_FEED_FORWARD_LENGTH: &str = "expert_feed_forward_length";
 const GGUF_ATTENTION_HEAD_COUNT: &str = "attention.head_count";
 const GGUF_ATTENTION_HEAD_COUNT_KV: &str = "attention.head_count_kv";
 const GGUF_ATTENTION_KEY_LENGTH: &str = "attention.key_length";
@@ -395,11 +400,25 @@ impl GgufFile {
             num_heads
         };
 
+        // intermediate_size: prefer the global `feed_forward_length`. For
+        // MoE-only models (DeepSeek-V4 family) the global key is omitted,
+        // so we fall back to the per-expert size. The HF config exposes
+        // `intermediate_size` as a single number even on MoE archs because
+        // it represents the FFN inner dim — per-expert and per-layer FFNs
+        // share that dim in every llama.cpp-supported architecture.
+        let intermediate_size = {
+            let global = get_arch_u32(GGUF_FEED_FORWARD_LENGTH);
+            if global > 0 {
+                global
+            } else {
+                get_arch_u32(GGUF_EXPERT_FEED_FORWARD_LENGTH)
+            }
+        };
         let mut config = serde_json::json!({
             HF_MODEL_TYPE: model_type,
             HF_HIDDEN_SIZE: hidden_size,
             HF_NUM_HIDDEN_LAYERS: get_arch_u32(GGUF_BLOCK_COUNT),
-            HF_INTERMEDIATE_SIZE: get_arch_u32(GGUF_FEED_FORWARD_LENGTH),
+            HF_INTERMEDIATE_SIZE: intermediate_size,
             HF_NUM_ATTENTION_HEADS: num_heads,
             HF_NUM_KEY_VALUE_HEADS: num_kv_heads,
             HF_HEAD_DIM: head_dim,
@@ -1358,6 +1377,103 @@ mod tests {
         assert_eq!(arch.config().rope_base, 10_000.0);
     }
 
+    #[test]
+    fn test_gguf_to_config_json_falls_back_to_expert_feed_forward_length_on_moe() {
+        // DeepSeek-V4 family (and other MoE-only GGUFs) omit the global
+        // `feed_forward_length` and only emit the per-expert size. The
+        // HF config exposes `intermediate_size` as a single number, so
+        // the loader must surface the per-expert key into that field —
+        // otherwise downstream config validation rejects with
+        // `intermediate_size: must be greater than 0`.
+        let mut metadata = HashMap::new();
+        metadata.insert(
+            "general.architecture".to_string(),
+            GgufValue::String("deepseek2".to_string()),
+        );
+        metadata.insert(
+            "deepseek2.embedding_length".to_string(),
+            GgufValue::U32(4096),
+        );
+        metadata.insert("deepseek2.block_count".to_string(), GgufValue::U32(43));
+        metadata.insert(
+            "deepseek2.attention.head_count".to_string(),
+            GgufValue::U32(64),
+        );
+        metadata.insert(
+            "deepseek2.attention.head_count_kv".to_string(),
+            GgufValue::U32(1),
+        );
+        metadata.insert(
+            "deepseek2.attention.key_length".to_string(),
+            GgufValue::U32(128),
+        );
+        // No `deepseek2.feed_forward_length` — MoE-only family.
+        metadata.insert(
+            "deepseek2.expert_feed_forward_length".to_string(),
+            GgufValue::U32(2048),
+        );
+        metadata.insert("deepseek2.vocab_size".to_string(), GgufValue::U32(129280));
+
+        let gguf = GgufFile {
+            metadata,
+            tensor_infos: Vec::new(),
+            data_offset: 0,
+            path: std::path::PathBuf::from("<no-file>"),
+        };
+        let cfg = gguf.to_config_json();
+        assert_eq!(cfg["intermediate_size"], 2048);
+        // And: this must now pass the validated detection path that
+        // previously rejected the model.
+        crate::detect_from_json_validated(&cfg)
+            .expect("MoE-only GGUF config should pass validation after fallback");
+    }
+
+    #[test]
+    fn test_gguf_to_config_json_prefers_global_feed_forward_length_when_both_present() {
+        // Some non-MoE / hybrid configs emit BOTH. We must keep using the
+        // global key in that case — the per-expert size is meaningful
+        // only when there are routed experts.
+        let mut metadata = HashMap::new();
+        metadata.insert(
+            "general.architecture".to_string(),
+            GgufValue::String("deepseek2".to_string()),
+        );
+        metadata.insert(
+            "deepseek2.embedding_length".to_string(),
+            GgufValue::U32(2048),
+        );
+        metadata.insert("deepseek2.block_count".to_string(), GgufValue::U32(27));
+        metadata.insert(
+            "deepseek2.attention.head_count".to_string(),
+            GgufValue::U32(16),
+        );
+        metadata.insert(
+            "deepseek2.attention.head_count_kv".to_string(),
+            GgufValue::U32(16),
+        );
+        metadata.insert(
+            "deepseek2.attention.key_length".to_string(),
+            GgufValue::U32(192),
+        );
+        metadata.insert(
+            "deepseek2.feed_forward_length".to_string(),
+            GgufValue::U32(10944),
+        );
+        metadata.insert(
+            "deepseek2.expert_feed_forward_length".to_string(),
+            GgufValue::U32(1408),
+        );
+
+        let gguf = GgufFile {
+            metadata,
+            tensor_infos: Vec::new(),
+            data_offset: 0,
+            path: std::path::PathBuf::from("<no-file>"),
+        };
+        let cfg = gguf.to_config_json();
+        assert_eq!(cfg["intermediate_size"], 10944);
+    }
+
     /// Build a minimal GGUF file with one 2-D F32 tensor, but truncate the
     /// tensor data region so that `offset + size > file len`. Loader must
     /// reject this cleanly, not panic on a slice OOB.