Skip to content

Commit 4e9bc2c

Browse files
committed
fix CI and polish support of qwen2-vl
1 parent b3580e3 commit 4e9bc2c

6 files changed

Lines changed: 42 additions & 6 deletions

File tree

convert.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4762,7 +4762,8 @@ def state_dict_pp(cls, config, state_dict):
47624762

47634763
@staticmethod
47644764
def dump_config(f, config, ggml_type):
4765-
assert config.vision_config['hidden_act'] == 'quick_gelu'
4765+
if 'hidden_act' in config.vision_config:
4766+
assert config.vision_config['hidden_act'] == 'quick_gelu'
47664767
config.vision_config['hidden_act'] = 'silu'
47674768
config.vision_config['hidden_size'] = config.vision_config['embed_dim']
47684769
QWen2_5VLConverter.dump_config(f, config, ggml_type)

docs/models.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,8 +424,9 @@ Please use `--format completion` for these models.
424424
[8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512/tree/f511871f6402ba68dadfb42a94a7a7e13499fd65)
425425
* [x] Devstral-Small-2: [24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512/tree/8d27a0d2120f1563c11dc91d494e99f9678ecf79)
426426

427-
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`)
427+
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen3VLForConditionalGeneration`, `Qwen3VLMoeForConditionalGeneration`)
428428
* [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
429+
* [x] Qwen2-VL: [2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/tree/895c3a49bc3fa70a340399125c650a463535e71c), [7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/tree/eed13092ef92e448dd6875b2a00151bd3f7db0ac)
429430
* [x] Qwen2.5-VL: [3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/tree/66285546d2b821cf421d4f5eb2576359d3770cd3), [7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/tree/cc594898137f460bfe9f0759e9844b3ce807cfb5)
430431
* [x] MiMo-VL: [7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL/tree/460c34be0c6cfe79b6b311647ae9112784f80b73), [7B-RL-2508](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL-2508/tree/4bfb270765825d2fa059011deb4c96fdd579be6f)
431432
* [x] Dolphin: [v2](https://huggingface.co/ByteDance/Dolphin-v2/tree/c37c62768c644bb594da4283149c627765aa80f3)

models/hunyuan.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "hunyuan.h"
22
#include <numeric>
3+
#include <string.h>
34
#include "qwen.h"
45
#include "deepseek.h"
56
#include "../src/vision_process.h"
@@ -1084,6 +1085,11 @@ namespace chatllm::hunyuan::youtu::vit
10841085
float image_std[3];
10851086

10861087
bool fullatt_block_indices[VIT_MAX_LAYERS];
1088+
1089+
Config()
1090+
{
1091+
memset(this, 0, sizeof(Config));
1092+
}
10871093
};
10881094

10891095
class PatchEmbedding : public Linear
@@ -1454,7 +1460,6 @@ namespace chatllm::hunyuan::youtu::vl
14541460
if (!vis_cfg.IsObject()) return false;
14551461

14561462
vit::Config vis_config;
1457-
memset(&vis_config, 0, sizeof(vis_config));
14581463

14591464
vis_config.dtype = this->config.dtype;
14601465

models/qwen.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "qwen.h"
22
#include <optional>
3+
#include <string.h>
34
#include "../src/audio_process.h"
45
#include "../src/vision_process.h"
56
#include "qwen_asr.h"
@@ -900,6 +901,11 @@ namespace chatllm::qwen::ds_r1_distill
900901

901902
namespace chatllm::qwen::vit
902903
{
904+
Config::Config()
905+
{
906+
memset(this, 0, sizeof(Config));
907+
}
908+
903909
PatchEmbedding::PatchEmbedding(InitContext *ctx, const Config &config)
904910
: proj0(ctx, 3, config.hidden_size, config.patch_size, config.patch_size, 0, 1, 1, false),
905911
proj1(ctx, 3, config.hidden_size, config.patch_size, config.patch_size, 0, 1, 1, false)
@@ -1366,9 +1372,9 @@ namespace chatllm::qwen::vit
13661372
:
13671373
GRAPH_SIZE(GRAPH_SIZE), _ctx(&backend_context),
13681374
n_threads(runtime_config.n_threads),
1375+
vis_config(),
13691376
max_patches(max_patches)
13701377
{
1371-
memset(&vis_config, 0, sizeof(vis_config));
13721378
_ctx.cache_dtype = runtime_config.cache_type;
13731379
model_gpu_layers = BackendContext::get_ngl_of_model(runtime_config.model_gpu_layers, "vis");
13741380
}
@@ -1391,8 +1397,10 @@ namespace chatllm::qwen::vit
13911397
const auto vis_cfg = config["config.json"]["vision_config"];
13921398
if (!vis_cfg.IsObject()) return false;
13931399

1400+
int full_cnt = 0;
1401+
13941402
vis_config.dtype = dtype;
1395-
vis_config.is_ver_2_0 = vis_cfg["model_type"].ToString() == "qwen2_vl";
1403+
vis_config.is_ver_2_0 = config["config.json"]["model_type"].ToString() == "qwen2_vl";
13961404

13971405
vis_config.patch_size = (int)vis_cfg["patch_size"].ToInt();
13981406
vis_config.num_attention_heads = (int)vis_cfg["num_heads"].ToInt();
@@ -1420,7 +1428,10 @@ namespace chatllm::qwen::vit
14201428
auto indices = vis_cfg["fullatt_block_indexes"];
14211429
CHATLLM_CHECK((int)indices.length() <= VIT_MAX_LAYERS);
14221430
for (int i = 0; i < (int)indices.length(); i++)
1431+
{
1432+
full_cnt += 1;
14231433
vis_config.fullatt_block_indices[indices[i].ToInt()] = true;
1434+
}
14241435
}
14251436

14261437
auto pp_cfg = config["preprocessor_config.json"];
@@ -1446,7 +1457,7 @@ namespace chatllm::qwen::vit
14461457
}
14471458

14481459
const size_t tensor_ovhd = ggml_tensor_overhead();
1449-
const size_t num_tensors = vis_config.is_ver_2_0 ? 10 + vis_config.num_hidden_layers * 18 : 5 + vis_config.num_hidden_layers * 18;
1460+
const size_t num_tensors = vis_config.is_ver_2_0 ? 10 + vis_config.num_hidden_layers * 18 : (9 + vis_config.num_hidden_layers * 18 - full_cnt);
14501461
const size_t ctx_size = num_tensors * tensor_ovhd;
14511462
_ctx.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
14521463
_ctx.dtype = dtype;

models/qwen.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,8 @@ namespace chatllm::qwen
395395
int min_pixels;
396396
int max_pixels;
397397
int merge_size;
398+
399+
Config();
398400
};
399401

400402
class PatchEmbedding : public Block

scripts/models.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,22 @@
15251525
}
15261526
}
15271527
},
1528+
"qwen2-vl" : {
1529+
"brief": "Vision-language model of Qwen, the predecessor of Qwen2.5-VL.",
1530+
"default": "2b",
1531+
"license": "Apache License 2.0",
1532+
"variants": {
1533+
"2b": {
1534+
"default": "q8",
1535+
"quantized": {
1536+
"q8": {
1537+
"size": 2354785088,
1538+
"url": "chatllm_quantized_qwen2/qwen2-vl-2b-it.bin"
1539+
}
1540+
}
1541+
}
1542+
}
1543+
},
15281544
"qwen2.5-vl" : {
15291545
"brief": "Flagship vision-language model of Qwen and also a significant leap from the previous Qwen2-VL.",
15301546
"default": "3b",

0 commit comments

Comments
 (0)