Skip to content

Commit 18f685f

Browse files
committed
work on qwen3.5-moe (in progress); upload models
1 parent 00700c9 commit 18f685f

11 files changed

Lines changed: 339 additions & 97 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3535

3636
**What's New:**
3737

38+
* 2026-03-06: Qwen3.5
3839
* 2026-03-03: GLM-OCR
3940
* 2026-02-22: Youtu-VL
4041
* 2026-02-18: Youtu-LLM

convert.py

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5531,33 +5531,18 @@ class Qwen3VLConverter(BaseConverter):
55315531
MODEL_TYPE = ModelType.Qwen3_VL
55325532

55335533
@classmethod
5534-
def state_dict_pp(cls, config, state_dict):
5534+
def vis_state_dict_pp(cls, config, state_dict):
55355535
r = {}
55365536
for k in state_dict: # name: str
55375537
name: str = k
55385538
tensor: torch.Tensor = state_dict[name]
5539-
if name.startswith('model.language_model.'):
5540-
name = name.replace('model.language_model.', 'model.')
5541-
if name.endswith('experts.down_proj'):
5542-
shape = tensor.shape
5543-
for j in range(shape[0]):
5544-
kkk = name.replace('mlp.experts.down_proj', f'mlp.experts.{j}.down_proj.weight')
5545-
r[kkk] = tensor[j].T.contiguous()
5546-
elif name.endswith('experts.gate_up_proj'):
5547-
shape = tensor.shape
5548-
gate = tensor[:, :, : shape[2] // 2]
5549-
up = tensor[:, :, shape[2] // 2 : ]
5550-
for j in range(shape[0]):
5551-
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.gate_proj.weight')
5552-
r[kkk] = gate[j].T.contiguous()
5553-
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.up_proj.weight')
5554-
r[kkk] = up[j].T.contiguous()
5555-
else:
5556-
r[name] = tensor
5557-
continue
55585539

55595540
name = name.replace('model.visual.', 'visual.')
55605541

5542+
if name.startswith('model'):
5543+
r[name] = tensor
5544+
continue
5545+
55615546
if name == 'visual.patch_embed.proj.weight':
55625547
shape = tensor.shape
55635548
assert len(shape) == 5
@@ -5580,6 +5565,37 @@ def state_dict_pp(cls, config, state_dict):
55805565

55815566
return r
55825567

5568+
@classmethod
5569+
def state_dict_pp(cls, config, state_dict):
5570+
r = {}
5571+
for k in state_dict: # name: str
5572+
name: str = k
5573+
tensor: torch.Tensor = state_dict[name]
5574+
if name.startswith('model.language_model.'):
5575+
name = name.replace('model.language_model.', 'model.')
5576+
if name.endswith('experts.down_proj'):
5577+
shape = tensor.shape
5578+
for j in range(shape[0]):
5579+
kkk = name.replace('mlp.experts.down_proj', f'mlp.experts.{j}.down_proj.weight')
5580+
r[kkk] = tensor[j].T.contiguous()
5581+
elif name.endswith('experts.gate_up_proj'):
5582+
shape = tensor.shape
5583+
gate = tensor[:, :, : shape[2] // 2]
5584+
up = tensor[:, :, shape[2] // 2 : ]
5585+
for j in range(shape[0]):
5586+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.gate_proj.weight')
5587+
r[kkk] = gate[j].T.contiguous()
5588+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.up_proj.weight')
5589+
r[kkk] = up[j].T.contiguous()
5590+
else:
5591+
r[name] = tensor
5592+
continue
5593+
5594+
r[name] = tensor
5595+
5596+
r = Qwen3VLConverter.vis_state_dict_pp(config, r)
5597+
return r
5598+
55835599
@staticmethod
55845600
def dump_config(f, config, ggml_type):
55855601
MROPE_SECTION_MAX = 4
@@ -5651,20 +5667,39 @@ class QWen3_5Converter(BaseConverter):
56515667

56525668
@classmethod
56535669
def state_dict_pp(cls, config, state_dict):
5654-
state_dict = Qwen3VLConverter.state_dict_pp(config, state_dict)
5670+
state_dict = Qwen3VLConverter.vis_state_dict_pp(config, state_dict)
56555671
r = {}
56565672
for k in state_dict:
56575673
name: str = k
56585674
tensor: torch.Tensor = state_dict[name]
5659-
if name.endswith('.self_attn.q_proj.weight'):
5675+
if name.startswith('model.language_model.'):
5676+
name = name.replace('model.language_model.', 'model.')
5677+
if name.endswith('experts.down_proj'):
5678+
shape = tensor.shape
5679+
for j in range(shape[0]):
5680+
kkk = name.replace('mlp.experts.down_proj', f'mlp.experts.{j}.down_proj.weight')
5681+
r[kkk] = tensor[j].contiguous()
5682+
elif name.endswith('experts.gate_up_proj'):
5683+
shape = tensor.shape
5684+
gate = tensor[:, : shape[1] // 2, :]
5685+
up = tensor[:, shape[1] // 2 :, :]
5686+
for j in range(shape[0]):
5687+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.gate_proj.weight')
5688+
r[kkk] = gate[j].contiguous()
5689+
kkk = name.replace('experts.gate_up_proj', f'experts.{j}.up_proj.weight')
5690+
r[kkk] = up[j].contiguous()
5691+
elif name.endswith('.self_attn.q_proj.weight'):
5692+
head_dim = QWen3_5Converter.txt_config.head_dim
5693+
q, g = torch.chunk(tensor.view(-1, head_dim * 2, tensor.shape[1]), 2, dim=1)
5694+
r[name] = q.contiguous().view(-1, tensor.shape[1])
5695+
r[name.replace('.q_proj.', '.gate_proj.')] = g.contiguous().view(-1, tensor.shape[1])
5696+
else:
5697+
r[name] = tensor
5698+
elif name.startswith("mtp.") and name.endswith('.self_attn.q_proj.weight'):
56605699
head_dim = QWen3_5Converter.txt_config.head_dim
56615700
q, g = torch.chunk(tensor.view(-1, head_dim * 2, tensor.shape[1]), 2, dim=1)
56625701
r[name] = q.contiguous().view(-1, tensor.shape[1])
56635702
r[name.replace('.q_proj.', '.gate_proj.')] = g.contiguous().view(-1, tensor.shape[1])
5664-
elif (name == "model.norm.weight") or \
5665-
name.endswith('.input_layernorm.weight') or name.endswith('.post_attention_layernorm.weight') or \
5666-
name.endswith('.self_attn.q_norm.weight') or name.endswith('.self_attn.k_norm.weight'):
5667-
r[name] = 1.0 + tensor
56685703
else:
56695704
r[name] = tensor
56705705
return r
@@ -9817,7 +9852,7 @@ def main():
98179852
if config['thinker_config']['model_type'] == 'qwen3_forced_aligner':
98189853
Qwen3ASRConverter.MODEL_TYPE = ModelType.Qwen3ForcedAligner
98199854
Qwen3ASRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
9820-
elif arch == 'Qwen3_5ForConditionalGeneration':
9855+
elif arch in ['Qwen3_5ForConditionalGeneration', 'Qwen3_5MoeForConditionalGeneration']:
98219856
QWen3_5Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
98229857
elif arch == 'KimiVLForConditionalGeneration':
98239858
KimiVLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)

docs/models.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ Please use `--format completion` for these models.
424424
[8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512/tree/f511871f6402ba68dadfb42a94a7a7e13499fd65)
425425
* [x] Devstral-Small-2: [24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512/tree/8d27a0d2120f1563c11dc91d494e99f9678ecf79)
426426

427-
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`)
427+
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`, `Qwen3_5ForConditionalGeneration`, `Qwen3_5MoeForConditionalGeneration`)
428428
* [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
429429
* [x] Qwen2-VL: [2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/tree/895c3a49bc3fa70a340399125c650a463535e71c), [7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/tree/eed13092ef92e448dd6875b2a00151bd3f7db0ac)
430430
* [x] Qwen2.5-VL: [3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/tree/66285546d2b821cf421d4f5eb2576359d3770cd3), [7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/tree/cc594898137f460bfe9f0759e9844b3ce807cfb5)
@@ -433,6 +433,10 @@ Please use `--format completion` for these models.
433433
* [x] Qwen3-VL: [2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct/tree/89644892e4d85e24eaac8bacfd4f463576704203),
434434
[4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct/tree/ebb281ec70b05090aa6165b016eac8ec08e71b17),
435435
[A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct/tree/9c4b90e1e4ba969fd3b5378b57d966d725f1b86c), etc
436+
* [x] Qwen3.5: [0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B/tree/2fc06364715b967f1860aea9cf38778875588b17),
437+
[2B](https://huggingface.co/Qwen/Qwen3.5-2B/tree/15852e8c16360a2fea060d615a32b45270f8a8fc),
438+
[4B](https://huggingface.co/Qwen/Qwen3.5-4B/tree/851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a),
439+
[9B](https://huggingface.co/Qwen/Qwen3.5-9B/tree/c202236235762e1c871ad0ccb60c8ee5ba337b9a)
436440

437441
* SmolVLM2 (`SmolVLMForConditionalGeneration`)
438442
* [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
@@ -461,7 +465,7 @@ Please use `--format completion` for these models.
461465
* [x] OCR2: [3B](https://huggingface.co/nanonets/Nanonets-OCR2-3B/tree/d0368059ad151ce9e38f526890cfd4f27b28be65), [1.5B](https://huggingface.co/nanonets/Nanonets-OCR2-1.5B-exp/tree/306a9b2a65672a3dbebd9bce9a9373a9a18674a2)
462466

463467
* GLM-OCR (`GlmOcrForConditionalGeneration`)
464-
* [x] [0.7B](https://huggingface.co/zai-org/GLM-OCR/tree/677c6baa60442a451f8a8c7eabdfab32d9801a0b)
468+
* [x] [0.9B](https://huggingface.co/zai-org/GLM-OCR/tree/677c6baa60442a451f8a8c7eabdfab32d9801a0b)
465469

466470
## ASR Models
467471

models/qwen3_5.cpp

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,10 @@ namespace chatllm::qwen::v3_5
8989
bool vit_loaded = false;
9090
};
9191

92-
class QwenGatedAttention : public QKNormedRoPEAttention<RMSNorm, BaseAttention>
92+
class QwenGatedAttention : public QKNormedRoPEAttention<RMSNormWeightPlus1, BaseAttention>
9393
{
9494
public:
95-
typedef QKNormedRoPEAttention<RMSNorm, BaseAttention> Base;
95+
typedef QKNormedRoPEAttention<RMSNormWeightPlus1, BaseAttention> Base;
9696
QwenGatedAttention(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int head_dim, int max_length);
9797
int64_t get_param_num(bool effective_only) const override;
9898
ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input, int n_past) override;
@@ -485,7 +485,7 @@ namespace chatllm::qwen::v3_5
485485

486486
transformer = new ModelClass(&w_ctx_, config.num_hidden_layers, config.hidden_size,
487487
vocab_size <= 0 ? create_embedding<Embedding>(&w_ctx_, config) : create_embedding<Embedding>(&w_ctx_, vocab_size, config.hidden_size),
488-
create_final_norm<RMSNorm>(&w_ctx_, config),
488+
create_final_norm<RMSNormWeightPlus1>(&w_ctx_, config),
489489
config.tie_word_embeddings ? (Block *)nullptr : create_lm_head(&w_ctx_, config, false),
490490
[&](InitContext *ctx, int layer_index) {
491491
return create_layer(ctx, layer_index);
@@ -509,26 +509,79 @@ namespace chatllm::qwen::v3_5
509509
return r;
510510
}
511511

512-
Block *ConditionalGeneration::create_layer(InitContext *ctx, int layer_index)
512+
template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class QWenSparseMoE : public BaseSparseMLP
513513
{
514-
CHATLLM_CHECK(config.num_experts < 0) << "TODO: MoE";
514+
public:
515+
QWenSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
516+
: BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false)
517+
{
518+
}
519+
};
515520

516-
if (config.layer_is_la[layer_index])
521+
Block *ConditionalGeneration::create_layer(InitContext *ctx, int layer_index)
522+
{
523+
if (config.num_experts <= 0)
517524
{
518-
typedef LMBlock1<RMSNorm, QwenGatedDeltaNet, RMSNorm, SiLUMLP> Layer;
519-
auto layer = new Layer(ctx, TypeLinearAttention(), config.hidden_size, config.intermediate_size,
520-
config.linear_conv_kernel_dim, config.linear_num_key_heads, config.linear_num_value_heads, config.linear_key_head_dim, config.linear_value_head_dim);
521-
return layer;
525+
if (config.layer_is_la[layer_index])
526+
{
527+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedDeltaNet, RMSNormWeightPlus1, SiLUMLP> Layer;
528+
auto layer = new Layer(ctx, TypeLinearAttention(), config.hidden_size, config.intermediate_size,
529+
config.linear_conv_kernel_dim, config.linear_num_key_heads, config.linear_num_value_heads, config.linear_key_head_dim, config.linear_value_head_dim);
530+
return layer;
531+
}
532+
else
533+
{
534+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedAttention, RMSNormWeightPlus1, SiLUMLP> Layer;
535+
auto layer = new Layer(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size, config.num_key_value_heads, config.head_dim, config.max_length);
536+
layer->attention.mrope_sections = config.mrope_sections;
537+
layer->attention.rope_mode = RoPEMode::IMROPE;
538+
layer->attention.rope_dim = config.rope_dim;
539+
layer->attention.freq_base = config.rope_theta;
540+
return layer;
541+
}
522542
}
523543
else
524544
{
525-
typedef LMBlock1<RMSNorm, QwenGatedAttention, RMSNorm, SiLUMLP> Layer;
526-
auto layer = new Layer(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size, config.num_key_value_heads, config.head_dim, config.max_length);
527-
layer->attention.mrope_sections = config.mrope_sections;
528-
layer->attention.rope_mode = RoPEMode::IMROPE;
529-
layer->attention.rope_dim = config.rope_dim;
530-
layer->attention.freq_base = config.rope_theta;
531-
return layer;
545+
typedef GatedMLP<SiLUMLP> QWenGatedMLP;
546+
547+
if (config.layer_is_la[layer_index])
548+
{
549+
if ((config.num_experts == 256) && (config.num_experts_per_tok == 8))
550+
{
551+
typedef CombinedMLP<QWenSparseMoE<256, 8>, QWenGatedMLP> QWenMoEMLP;
552+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedDeltaNet, RMSNormWeightPlus1, QWenMoEMLP> Layer;
553+
auto layer = new Layer(ctx, TypeLinearAttention(), config.hidden_size, config.intermediate_size,
554+
config.moe_intermediate_size, config.shared_expert_intermediate_size,
555+
config.linear_conv_kernel_dim, config.linear_num_key_heads, config.linear_num_value_heads, config.linear_key_head_dim, config.linear_value_head_dim);
556+
return layer;
557+
}
558+
else
559+
{
560+
CHATLLM_CHECK(false) << "unsupported MoE param: " << config.num_experts << ", " << config.num_experts_per_tok;
561+
return nullptr;
562+
}
563+
}
564+
else
565+
{
566+
if ((config.num_experts == 256) && (config.num_experts_per_tok == 8))
567+
{
568+
typedef CombinedMLP<QWenSparseMoE<256, 8>, QWenGatedMLP> QWenMoEMLP;
569+
typedef LMBlock1<RMSNormWeightPlus1, QwenGatedAttention, RMSNormWeightPlus1, QWenMoEMLP> Layer;
570+
auto layer = new Layer(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
571+
config.moe_intermediate_size, config.shared_expert_intermediate_size,
572+
config.num_key_value_heads, config.head_dim, config.max_length);
573+
layer->attention.mrope_sections = config.mrope_sections;
574+
layer->attention.rope_mode = RoPEMode::IMROPE;
575+
layer->attention.rope_dim = config.rope_dim;
576+
layer->attention.freq_base = config.rope_theta;
577+
return layer;
578+
}
579+
else
580+
{
581+
CHATLLM_CHECK(false) << "unsupported MoE param: " << config.num_experts << ", " << config.num_experts_per_tok;
582+
return nullptr;
583+
}
584+
}
532585
}
533586
}
534587

@@ -544,7 +597,11 @@ namespace chatllm::qwen::v3_5
544597
{".self_attn.in_proj_qkv.", ".linear_attn.in_proj_qkv."},
545598
{".self_attn.norm.", ".linear_attn.norm."},
546599
{".self_attn.out_proj.", ".linear_attn.out_proj."},
600+
{".mlp.mlp1.", ".mlp."},
601+
{".mlp.mlp2.gate.", ".mlp.shared_expert_gate."},
602+
{".mlp.mlp2.", ".mlp.shared_expert."},
547603
});
604+
548605
BaseModelForConditionalGeneration::load(loader);
549606

550607
loader.clear_tensor_name_translations();

scripts/models.json

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4152,5 +4152,65 @@
41524152
}
41534153
}
41544154
}
4155+
},
4156+
"glm-ocr": {
4157+
"brief": "GLM-OCR is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture.",
4158+
"default": "0.9b",
4159+
"license": "MIT",
4160+
"variants": {
4161+
"0.9b": {
4162+
"default": "q8",
4163+
"quantized": {
4164+
"q8": {
4165+
"size": 1187511024,
4166+
"url": "chatllm_quantized_glm/glm-ocr.bin"
4167+
}
4168+
}
4169+
}
4170+
}
4171+
},
4172+
"qwen3.5": {
4173+
"brief": "Qwen 3.5 is a family of open-source multimodal models that deliver exceptional utility and performance.",
4174+
"default": "0.8b",
4175+
"license": "Apache License 2.0",
4176+
"variants": {
4177+
"0.8b": {
4178+
"default": "q8",
4179+
"quantized": {
4180+
"q8": {
4181+
"size": 938351648,
4182+
"url": "chatllm_quantized_qwen3.5/qwen3.5-0.8b.bin"
4183+
}
4184+
}
4185+
},
4186+
"2b": {
4187+
"default": "q8",
4188+
"quantized": {
4189+
"q8": {
4190+
"size": 2427658976,
4191+
"url": "chatllm_quantized_qwen3.5/qwen3.5-2b.bin"
4192+
}
4193+
}
4194+
},
4195+
"4b": {
4196+
"default": "q8",
4197+
"quantized": {
4198+
"q8": {
4199+
"size": 4963107408,
4200+
"url": "chatllm_quantized_qwen3.5/qwen3.5-4b.bin"
4201+
}
4202+
}
4203+
},
4204+
"9b": {
4205+
"default": "q8",
4206+
"quantized": {
4207+
"q8": {
4208+
"size": 10394664000,
4209+
"url": "chatllm_quantized_qwen3.5/qwen3.5-9b.bin"
4210+
}
4211+
}
4212+
}
4213+
}
41554214
}
4215+
41564216
}

src/backend.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <memory>
55
#include <map>
66
#include <string>
7+
#include <functional>
78

89
#include "ggml-backend.h"
910

@@ -80,6 +81,7 @@ namespace chatllm
8081
ggml::tensor *tensor) = 0;
8182
virtual void read_scaler(const std::string &name, float *value) = 0;
8283
virtual bool has_tensor(const std::string &name) const = 0;
84+
static void map_tensor_element(ggml::tensor *tensor, std::function<float (float)> f);
8385
};
8486

8587
// Is `ggml_backend_buffer_type_t` a good name?

0 commit comments

Comments (0)