Skip to content

Commit 18f685f

Browse files
committed
work on qwen3.5-moe (in progress); upload models
1 parent 00700c9 commit 18f685f

11 files changed

Lines changed: 339 additions & 97 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3535

3636
**What's New:**
3737

38+
* 2026-03-06: Qwen3.5
3839
* 2026-03-03: GLM-OCR
3940
* 2026-02-22: Youtu-VL
4041
* 2026-02-18: Youtu-LLM

convert.py

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5531,33 +5531,18 @@ class Qwen3VLConverter(BaseConverter):
55315531
MODEL_TYPE = ModelType.Qwen3_VL
55325532

55335533
@classmethod
5534-
def state_dict_pp(cls, config, state_dict):
5534+
def vis_state_dict_pp(cls, config, state_dict):
55355535
r = {}
55365536
for k in state_dict: # name: str
55375537
name: str = k
55385538
tensor: torch.Tensor = state_dict[name]
5539-
if name.startswith('model.language_model.'):
5540-
name = name.replace('model.language_model.', 'model.')
5541-
if name.endswith('experts.down_proj'):
5542-
shape = tensor.shape
5543-
for j in range(shape[0]):
5544-
kkk = name.replace('mlp.experts.down_proj', f'mlp.experts.{j}.down_proj.weight')
5545-
r[kkk] = tensor[j].T.contiguous()
5546-
elif name.endswith('experts.gate_up_proj'):
5547-
shape = tensor.shape
5548-
gate = tensor[:, :, : shape[2] // 2]
5549-
up = tensor[:, :, shape[2] // 2 : ]
5550-
for j in range(shape[0]):
5551-
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.gate_proj.weight')
5552-
r[kkk] = gate[j].T.contiguous()
5553-
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.up_proj.weight')
5554-
r[kkk] = up[j].T.contiguous()
5555-
else:
5556-
r[name] = tensor
5557-
continue
55585539

55595540
name = name.replace('model.visual.', 'visual.')
55605541

5542+
if name.startswith('model'):
5543+
r[name] = tensor
5544+
continue
5545+
55615546
if name == 'visual.patch_embed.proj.weight':
55625547
shape = tensor.shape
55635548
assert len(shape) == 5
@@ -5580,6 +5565,37 @@ def state_dict_pp(cls, config, state_dict):
55805565

55815566
return r
55825567

5568+
@classmethod
5569+
def state_dict_pp(cls, config, state_dict):
5570+
r = {}
5571+
for k in state_dict: # name: str
5572+
name: str = k
5573+
tensor: torch.Tensor = state_dict[name]
5574+
if name.startswith('model.language_model.'):
5575+
name = name.replace('model.language_model.', 'model.')
5576+
if name.endswith('experts.down_proj'):
5577+
shape = tensor.shape
5578+
for j in range(shape[0]):
5579+
kkk = name.replace('mlp.experts.down_proj', f'mlp.experts.{j}.down_proj.weight')
5580+
r[kkk] = tensor[j].T.contiguous()
5581+
elif name.endswith('experts.gate_up_proj'):
5582+
shape = tensor.shape
5583+
gate = tensor[:, :, : shape[2] // 2]
5584+
up = tensor[:, :, shape[2] // 2 : ]
5585+
for j in range(shape[0]):
5586+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.gate_proj.weight')
5587+
r[kkk] = gate[j].T.contiguous()
5588+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.up_proj.weight')
5589+
r[kkk] = up[j].T.contiguous()
5590+
else:
5591+
r[name] = tensor
5592+
continue
5593+
5594+
r[name] = tensor
5595+
5596+
r = Qwen3VLConverter.vis_state_dict_pp(config, r)
5597+
return r
5598+
55835599
@staticmethod
55845600
def dump_config(f, config, ggml_type):
55855601
MROPE_SECTION_MAX = 4
@@ -5651,20 +5667,39 @@ class QWen3_5Converter(BaseConverter):
56515667

56525668
@classmethod
56535669
def state_dict_pp(cls, config, state_dict):
5654-
state_dict = Qwen3VLConverter.state_dict_pp(config, state_dict)
5670+
state_dict = Qwen3VLConverter.vis_state_dict_pp(config, state_dict)
56555671
r = {}
56565672
for k in state_dict:
56575673
name: str = k
56585674
tensor: torch.Tensor = state_dict[name]
5659-
if name.endswith('.self_attn.q_proj.weight'):
5675+
if name.startswith('model.language_model.'):
5676+
name = name.replace('model.language_model.', 'model.')
5677+
if name.endswith('experts.down_proj'):
5678+
shape = tensor.shape
5679+
for j in range(shape[0]):
5680+
kkk = name.replace('mlp.experts.down_proj', f'mlp.experts.{j}.down_proj.weight')
5681+
r[kkk] = tensor[j].contiguous()
5682+
elif name.endswith('experts.gate_up_proj'):
5683+
shape = tensor.shape
5684+
gate = tensor[:, : shape[1] // 2, :]
5685+
up = tensor[:, shape[1] // 2 :, :]
5686+
for j in range(shape[0]):
5687+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.gate_proj.weight')
5688+
r[kkk] = gate[j].contiguous()
5689+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.up_proj.weight')
5690+
r[kkk] = up[j].contiguous()
5691+
elif name.endswith('.self_attn.q_proj.weight'):
5692+
head_dim = QWen3_5Converter.txt_config.head_dim
5693+
q, g = torch.chunk(tensor.view(-1, head_dim * 2, tensor.shape[1]), 2, dim=1)
5694+
r[name] = q.contiguous().view(-1, tensor.shape[1])
5695+
r[name.replace('.q_proj.', '.gate_proj.')] = g.contiguous().view(-1, tensor.shape[1])
5696+
else:
5697+
r[name] = tensor
5698+
elif name.startswith("mtp.") and name.endswith('.self_attn.q_proj.weight'):
56605699
head_dim = QWen3_5Converter.txt_config.head_dim
56615700
q, g = torch.chunk(tensor.view(-1, head_dim * 2, tensor.shape[1]), 2, dim=1)
56625701
r[name] = q.contiguous().view(-1, tensor.shape[1])
56635702
r[name.replace('.q_proj.', '.gate_proj.')] = g.contiguous().view(-1, tensor.shape[1])
5664-
elif (name == "model.norm.weight") or \
5665-
name.endswith('.input_layernorm.weight') or name.endswith('.post_attention_layernorm.weight') or \
5666-
name.endswith('.self_attn.q_norm.weight') or name.endswith('.self_attn.k_norm.weight'):
5667-
r[name] = 1.0 + tensor
56685703
else:
56695704
r[name] = tensor
56705705
return r
@@ -9817,7 +9852,7 @@ def main():
98179852
if config['thinker_config']['model_type'] == 'qwen3_forced_aligner':
98189853
Qwen3ASRConverter.MODEL_TYPE = ModelType.Qwen3ForcedAligner
98199854
Qwen3ASRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
9820-
elif arch == 'Qwen3_5ForConditionalGeneration':
9855+
elif arch in ['Qwen3_5ForConditionalGeneration', 'Qwen3_5MoeForConditionalGeneration']:
98219856
QWen3_5Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
98229857
elif arch == 'KimiVLForConditionalGeneration':
98239858
KimiVLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)

docs/models.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ Please use `--format completion` for these models.
424424
[8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512/tree/f511871f6402ba68dadfb42a94a7a7e13499fd65)
425425
* [x] Devstral-Small-2: [24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512/tree/8d27a0d2120f1563c11dc91d494e99f9678ecf79)
426426

427-
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`)
427+
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`, `Qwen3_5ForConditionalGeneration`, `Qwen3_5MoeForConditionalGeneration`)
428428
* [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
429429
* [x] Qwen2-VL: [2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/tree/895c3a49bc3fa70a340399125c650a463535e71c), [7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/tree/eed13092ef92e448dd6875b2a00151bd3f7db0ac)
430430
* [x] Qwen2.5-VL: [3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/tree/66285546d2b821cf421d4f5eb2576359d3770cd3), [7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/tree/cc594898137f460bfe9f0759e9844b3ce807cfb5)
@@ -433,6 +433,10 @@ Please use `--format completion` for these models.
433433
* [x] Qwen3-VL: [2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct/tree/89644892e4d85e24eaac8bacfd4f463576704203),
434434
[4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct/tree/ebb281ec70b05090aa6165b016eac8ec08e71b17),
435435
[A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct/tree/9c4b90e1e4ba969fd3b5378b57d966d725f1b86c), etc
436+
* [x] Qwen3.5: [0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B/tree/2fc06364715b967f1860aea9cf38778875588b17),
437+
[2B](https://huggingface.co/Qwen/Qwen3.5-2B/tree/15852e8c16360a2fea060d615a32b45270f8a8fc),
438+
[4B](https://huggingface.co/Qwen/Qwen3.5-4B/tree/851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a),
439+
[9B](https://huggingface.co/Qwen/Qwen3.5-9B/tree/c202236235762e1c871ad0ccb60c8ee5ba337b9a)
436440

437441
* SmolVLM2 (`SmolVLMForConditionalGeneration`)
438442
* [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
@@ -461,7 +465,7 @@ Please use `--format completion` for these models.
461465
* [x] OCR2: [3B](https://huggingface.co/nanonets/Nanonets-OCR2-3B/tree/d0368059ad151ce9e38f526890cfd4f27b28be65), [1.5B](https://huggingface.co/nanonets/Nanonets-OCR2-1.5B-exp/tree/306a9b2a65672a3dbebd9bce9a9373a9a18674a2)
462466

463467
* GLM-OCR (`GlmOcrForConditionalGeneration`)
464-
* [x] [0.7B](https://huggingface.co/zai-org/GLM-OCR/tree/677c6baa60442a451f8a8c7eabdfab32d9801a0b)
468+
* [x] [0.9B](https://huggingface.co/zai-org/GLM-OCR/tree/677c6baa60442a451f8a8c7eabdfab32d9801a0b)
465469

466470
## ASR Models
467471

models/qwen3_5.cpp

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,10 @@ namespace chatllm::qwen::v3_5
8989
bool vit_loaded = false;
9090
};
9191

92-
class QwenGatedAttention : public QKNormedRoPEAttention<RMSNorm, BaseAttention>
92+
class QwenGatedAttention : public QKNormedRoPEAttention<RMSNormWeightPlus1, BaseAttention>
9393
{
9494
public:
95-
typedef QKNormedRoPEAttention<RMSNorm, BaseAttention> Base;
95+
typedef QKNormedRoPEAttention<RMSNormWeightPlus1, BaseAttention> Base;
9696
QwenGatedAttention(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int head_dim, int max_length);
9797
int64_t get_param_num(bool effective_only) const override;
9898
ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int n_past) override;
@@ -485,7 +485,7 @@ namespace chatllm::qwen::v3_5
485485

486486
transformer = new ModelClass(&w_ctx_, config.num_hidden_layers, config.hidden_size,
487487
vocab_size <= 0 ? create_embedding<Embedding>(&w_ctx_, config) : create_embedding<Embedding>(&w_ctx_, vocab_size, config.hidden_size),
488-
create_final_norm<RMSNorm>(&w_ctx_, config),
488+
create_final_norm<RMSNormWeightPlus1>(&w_ctx_, config),
489489
config.tie_word_embeddings ? (Block *)nullptr : create_lm_head(&w_ctx_, config, false),
490490
[&](InitContext *ctx, int layer_index) {
491491
return create_layer(ctx, layer_index);
@@ -509,26 +509,79 @@ namespace chatllm::qwen::v3_5
509509
return r;
510510
}
511511

512-
Block *ConditionalGeneration::create_layer(InitContext *ctx, int layer_index)
512+
template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class QWenSparseMoE : public BaseSparseMLP
513513
{
514-
CHATLLM_CHECK(config.num_experts < 0) << "TODO: MoE";
514+
public:
515+
QWenSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
516+
: BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false)
517+
{
518+
}
519+
};
515520

516-
if (config.layer_is_la[layer_index])
521+
Block *ConditionalGeneration::create_layer(InitContext *ctx, int layer_index)
522+
{
523+
if (config.num_experts <= 0)
517524
{
518-
typedef LMBlock1<RMSNorm, QwenGatedDeltaNet, RMSNorm, SiLUMLP> Layer;
519-
auto layer = new Layer(ctx, TypeLinearAttention(), config.hidden_size, config.intermediate_size,
520-
config.linear_conv_kernel_dim, config.linear_num_key_heads, config.linear_num_value_heads, config.linear_key_head_dim, config.linear_value_head_dim);
521-
return layer;
525+
if (config.layer_is_la[layer_index])
526+
{
527+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedDeltaNet, RMSNormWeightPlus1, SiLUMLP> Layer;
528+
auto layer = new Layer(ctx, TypeLinearAttention(), config.hidden_size, config.intermediate_size,
529+
config.linear_conv_kernel_dim, config.linear_num_key_heads, config.linear_num_value_heads, config.linear_key_head_dim, config.linear_value_head_dim);
530+
return layer;
531+
}
532+
else
533+
{
534+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedAttention, RMSNormWeightPlus1, SiLUMLP> Layer;
535+
auto layer = new Layer(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size, config.num_key_value_heads, config.head_dim, config.max_length);
536+
layer->attention.mrope_sections = config.mrope_sections;
537+
layer->attention.rope_mode = RoPEMode::IMROPE;
538+
layer->attention.rope_dim = config.rope_dim;
539+
layer->attention.freq_base = config.rope_theta;
540+
return layer;
541+
}
522542
}
523543
else
524544
{
525-
typedef LMBlock1<RMSNorm, QwenGatedAttention, RMSNorm, SiLUMLP> Layer;
526-
auto layer = new Layer(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size, config.num_key_value_heads, config.head_dim, config.max_length);
527-
layer->attention.mrope_sections = config.mrope_sections;
528-
layer->attention.rope_mode = RoPEMode::IMROPE;
529-
layer->attention.rope_dim = config.rope_dim;
530-
layer->attention.freq_base = config.rope_theta;
531-
return layer;
545+
typedef GatedMLP<SiLUMLP> QWenGatedMLP;
546+
547+
if (config.layer_is_la[layer_index])
548+
{
549+
if ((config.num_experts == 256) && (config.num_experts_per_tok == 8))
550+
{
551+
typedef CombinedMLP<QWenSparseMoE<256, 8>, QWenGatedMLP> QWenMoEMLP;
552+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedDeltaNet, RMSNormWeightPlus1, QWenMoEMLP> Layer;
553+
auto layer = new Layer(ctx, TypeLinearAttention(), config.hidden_size, config.intermediate_size,
554+
config.moe_intermediate_size, config.shared_expert_intermediate_size,
555+
config.linear_conv_kernel_dim, config.linear_num_key_heads, config.linear_num_value_heads, config.linear_key_head_dim, config.linear_value_head_dim);
556+
return layer;
557+
}
558+
else
559+
{
560+
CHATLLM_CHECK(false) << "unsupported MoE param: " << config.num_experts << ", " << config.num_experts_per_tok;
561+
return nullptr;
562+
}
563+
}
564+
else
565+
{
566+
if ((config.num_experts == 256) && (config.num_experts_per_tok == 8))
567+
{
568+
typedef CombinedMLP<QWenSparseMoE<256, 8>, QWenGatedMLP> QWenMoEMLP;
569+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedAttention, RMSNormWeightPlus1, QWenMoEMLP> Layer;
570+
auto layer = new Layer(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
571+
config.moe_intermediate_size, config.shared_expert_intermediate_size,
572+
config.num_key_value_heads, config.head_dim, config.max_length);
573+
layer->attention.mrope_sections = config.mrope_sections;
574+
layer->attention.rope_mode = RoPEMode::IMROPE;
575+
layer->attention.rope_dim = config.rope_dim;
576+
layer->attention.freq_base = config.rope_theta;
577+
return layer;
578+
}
579+
else
580+
{
581+
CHATLLM_CHECK(false) << "unsupported MoE param: " << config.num_experts << ", " << config.num_experts_per_tok;
582+
return nullptr;
583+
}
584+
}
532585
}
533586
}
534587

@@ -544,7 +597,11 @@ namespace chatllm::qwen::v3_5
544597
{".self_attn.in_proj_qkv.", ".linear_attn.in_proj_qkv."},
545598
{".self_attn.norm.", ".linear_attn.norm."},
546599
{".self_attn.out_proj.", ".linear_attn.out_proj."},
600+
{".mlp.mlp1.", ".mlp."},
601+
{".mlp.mlp2.gate.", ".mlp.shared_expert_gate."},
602+
{".mlp.mlp2.", ".mlp.shared_expert."},
547603
});
604+
548605
BaseModelForConditionalGeneration::load(loader);
549606

550607
loader.clear_tensor_name_translations();

scripts/models.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4152,5 +4152,65 @@
41524152
}
41534153
}
41544154
}
4155+
},
4156+
"glm-ocr": {
4157+
"brief": "GLM-OCR is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture.",
4158+
"default": "0.9b",
4159+
"license": "MIT",
4160+
"variants": {
4161+
"0.9b": {
4162+
"default": "q8",
4163+
"quantized": {
4164+
"q8": {
4165+
"size": 1187511024,
4166+
"url": "chatllm_quantized_glm/glm-ocr.bin"
4167+
}
4168+
}
4169+
}
4170+
}
4171+
},
4172+
"qwen3.5": {
4173+
"brief": "Qwen 3.5 is a family of open-source multimodal models that deliver exceptional utility and performance.",
4174+
"default": "0.8b",
4175+
"license": "Apache License 2.0",
4176+
"variants": {
4177+
"0.8b": {
4178+
"default": "q8",
4179+
"quantized": {
4180+
"q8": {
4181+
"size": 938351648,
4182+
"url": "chatllm_quantized_qwen3.5/qwen3.5-0.8b.bin"
4183+
}
4184+
}
4185+
},
4186+
"2b": {
4187+
"default": "q8",
4188+
"quantized": {
4189+
"q8": {
4190+
"size": 2427658976,
4191+
"url": "chatllm_quantized_qwen3.5/qwen3.5-2b.bin"
4192+
}
4193+
}
4194+
},
4195+
"4b": {
4196+
"default": "q8",
4197+
"quantized": {
4198+
"q8": {
4199+
"size": 4963107408,
4200+
"url": "chatllm_quantized_qwen3.5/qwen3.5-4b.bin"
4201+
}
4202+
}
4203+
},
4204+
"9b": {
4205+
"default": "q8",
4206+
"quantized": {
4207+
"q8": {
4208+
"size": 10394664000,
4209+
"url": "chatllm_quantized_qwen3.5/qwen3.5-9b.bin"
4210+
}
4211+
}
4212+
}
4213+
}
41554214
}
4215+
41564216
}

src/backend.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <memory>
55
#include <map>
66
#include <string>
7+
#include <functional>
78

89
#include "ggml-backend.h"
910

@@ -80,6 +81,7 @@ namespace chatllm
8081
ggml::tensor *tensor) = 0;
8182
virtual void read_scaler(const std::string &name, float *value) = 0;
8283
virtual bool has_tensor(const std::string &name) const = 0;
84+
static void map_tensor_element(ggml::tensor *tensor, std::function<float (float)> f);
8385
};
8486

8587
// Is `ggml_backend_buffer_type_t` a good name?

0 commit comments

Comments (0)