Skip to content

Commit 4e9bc2c

Browse files
committed
fix CI and polish support of qwen2-vl
1 parent b3580e3 commit 4e9bc2c

6 files changed

Lines changed: 42 additions & 6 deletions

File tree

convert.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4762,7 +4762,8 @@ def state_dict_pp(cls, config, state_dict):
47624762

47634763
@staticmethod
47644764
def dump_config(f, config, ggml_type):
4765-
assert config.vision_config['hidden_act'] == 'quick_gelu'
4765+
if 'hidden_act' in config.vision_config:
4766+
assert config.vision_config['hidden_act'] == 'quick_gelu'
47664767
config.vision_config['hidden_act'] = 'silu'
47674768
config.vision_config['hidden_size'] = config.vision_config['embed_dim']
47684769
QWen2_5VLConverter.dump_config(f, config, ggml_type)

docs/models.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,8 +424,9 @@ Please use `--format completion` for these models.
424424
[8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512/tree/f511871f6402ba68dadfb42a94a7a7e13499fd65)
425425
* [x] Devstral-Small-2: [24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512/tree/8d27a0d2120f1563c11dc91d494e99f9678ecf79)
426426

427-
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`)
427+
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`)
428428
* [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
429+
* [x] Qwen2-VL: [2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/tree/895c3a49bc3fa70a340399125c650a463535e71c), [7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/tree/eed13092ef92e448dd6875b2a00151bd3f7db0ac)
429430
* [x] Qwen2.5-VL: [3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/tree/66285546d2b821cf421d4f5eb2576359d3770cd3), [7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/tree/cc594898137f460bfe9f0759e9844b3ce807cfb5)
430431
* [x] MiMo-VL: [7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL/tree/460c34be0c6cfe79b6b311647ae9112784f80b73), [7B-RL-2508](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL-2508/tree/4bfb270765825d2fa059011deb4c96fdd579be6f)
431432
* [x] Dolphin: [v2](https://huggingface.co/ByteDance/Dolphin-v2/tree/c37c62768c644bb594da4283149c627765aa80f3)

models/hunyuan.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "hunyuan.h"
22
#include <numeric>
3+
#include <string.h>
34
#include "qwen.h"
45
#include "deepseek.h"
56
#include "../src/vision_process.h"
@@ -1084,6 +1085,11 @@ namespace chatllm::hunyuan::youtu::vit
10841085
float image_std[3];
10851086

10861087
bool fullatt_block_indices[VIT_MAX_LAYERS];
1088+
1089+
Config()
1090+
{
1091+
memset(this, 0, sizeof(Config));
1092+
}
10871093
};
10881094

10891095
class PatchEmbedding : public Linear
@@ -1454,7 +1460,6 @@ namespace chatllm::hunyuan::youtu::vl
14541460
if (!vis_cfg.IsObject()) return false;
14551461

14561462
vit::Config vis_config;
1457-
memset(&vis_config, 0, sizeof(vis_config));
14581463

14591464
vis_config.dtype = this->config.dtype;
14601465

models/qwen.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "qwen.h"
22
#include <optional>
3+
#include <string.h>
34
#include "../src/audio_process.h"
45
#include "../src/vision_process.h"
56
#include "qwen_asr.h"
@@ -900,6 +901,11 @@ namespace chatllm::qwen::ds_r1_distill
900901

901902
namespace chatllm::qwen::vit
902903
{
904+
Config::Config()
905+
{
906+
memset(this, 0, sizeof(Config));
907+
}
908+
903909
PatchEmbedding::PatchEmbedding(InitContext *ctx, const Config &config)
904910
: proj0(ctx, 3, config.hidden_size, config.patch_size, config.patch_size, 0, 1, 1, false),
905911
proj1(ctx, 3, config.hidden_size, config.patch_size, config.patch_size, 0, 1, 1, false)
@@ -1366,9 +1372,9 @@ namespace chatllm::qwen::vit
13661372
:
13671373
GRAPH_SIZE(GRAPH_SIZE), _ctx(&backend_context),
13681374
n_threads(runtime_config.n_threads),
1375+
vis_config(),
13691376
max_patches(max_patches)
13701377
{
1371-
memset(&vis_config, 0, sizeof(vis_config));
13721378
_ctx.cache_dtype = runtime_config.cache_type;
13731379
model_gpu_layers = BackendContext::get_ngl_of_model(runtime_config.model_gpu_layers, "vis");
13741380
}
@@ -1391,8 +1397,10 @@ namespace chatllm::qwen::vit
13911397
const auto vis_cfg = config["config.json"]["vision_config"];
13921398
if (!vis_cfg.IsObject()) return false;
13931399

1400+
int full_cnt = 0;
1401+
13941402
vis_config.dtype = dtype;
1395-
vis_config.is_ver_2_0 = vis_cfg["model_type"].ToString() == "qwen2_vl";
1403+
vis_config.is_ver_2_0 = config["config.json"]["model_type"].ToString() == "qwen2_vl";
13961404

13971405
vis_config.patch_size = (int)vis_cfg["patch_size"].ToInt();
13981406
vis_config.num_attention_heads = (int)vis_cfg["num_heads"].ToInt();
@@ -1420,7 +1428,10 @@ namespace chatllm::qwen::vit
14201428
auto indices = vis_cfg["fullatt_block_indexes"];
14211429
CHATLLM_CHECK((int)indices.length() <= VIT_MAX_LAYERS);
14221430
for (int i = 0; i < (int)indices.length(); i++)
1431+
{
1432+
full_cnt += 1;
14231433
vis_config.fullatt_block_indices[indices[i].ToInt()] = true;
1434+
}
14241435
}
14251436

14261437
auto pp_cfg = config["preprocessor_config.json"];
@@ -1446,7 +1457,7 @@ namespace chatllm::qwen::vit
14461457
}
14471458

14481459
const size_t tensor_ovhd = ggml_tensor_overhead();
1449-
const size_t num_tensors = vis_config.is_ver_2_0 ? 10 + vis_config.num_hidden_layers * 18 : 5 + vis_config.num_hidden_layers * 18;
1460+
const size_t num_tensors = vis_config.is_ver_2_0 ? 10 + vis_config.num_hidden_layers * 18 : (9 + vis_config.num_hidden_layers * 18 - full_cnt);
14501461
const size_t ctx_size = num_tensors * tensor_ovhd;
14511462
_ctx.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
14521463
_ctx.dtype = dtype;

models/qwen.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,8 @@ namespace chatllm::qwen
395395
int min_pixels;
396396
int max_pixels;
397397
int merge_size;
398+
399+
Config();
398400
};
399401

400402
class PatchEmbedding : public Block

scripts/models.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,22 @@
15251525
}
15261526
}
15271527
},
1528+
"qwen2-vl" : {
1529+
"brief": "Vision-language model of Qwen, the predecessor of Qwen2.5-VL.",
1530+
"default": "2b",
1531+
"license": "Apache License 2.0",
1532+
"variants": {
1533+
"2b": {
1534+
"default": "q8",
1535+
"quantized": {
1536+
"q8": {
1537+
"size": 2354785088,
1538+
"url": "chatllm_quantized_qwen2/qwen2-vl-2b-it.bin"
1539+
}
1540+
}
1541+
}
1542+
}
1543+
},
15281544
"qwen2.5-vl" : {
15291545
"brief": "Flagship vision-language model of Qwen and also a significant leap from the previous Qwen2-VL.",
15301546
"default": "3b",

0 commit comments

Comments
 (0)