#!/bin/bash
# Launch EAGLE3 online training of a draft model for Qwen3-VL-2B (VLM speculative decoding).
# Usage: bash scripts/speculative/qwen3_vl/2B_driving/train_eagle3_vlm_online.sh
set -euo pipefail

export CONFIG_DIR=angelslim/compressor/speculative/train/configs
export TARGET_MODEL_NAME_OR_PATH=models/Qwen3-VL-2B-Instruct
export DRAFT_MODEL_CONFIG_PATH="$CONFIG_DIR/qwen3-vl-2b-eagle3-mrope.json"
# NOTE(review): a single sample trains fine; multiple samples trigger the
# rotary-embedding size-mismatch error shown in the traceback below.
export TRAIN_DATA_PATH=dataset/driver_dataset/twb-0319/camera_wide/10/10.jsonl
# Temporary validation data — needs to be replaced.
export EVAL_DATA_PATH=dataset/driver_dataset/twb-0319/camera_wide/10/10_format.jsonl

NUM_SPEC_TOKENS=3   # speculative draft length exercised at training time
GRADIENT=4          # gradient accumulation steps
export OUTPUT_DIR="outputs/VL-2B/train_driving_2_epochs10_spec${NUM_SPEC_TOKENS}_gradient${GRADIENT}"
export EMBED_WEIGHT_KEY="model.language_model.embed_tokens.weight"
export MODEL_MAX_LENGTH=32768
# export MODEL_MAX_LENGTH=16384
export CHAT_TEMPLATE_TYPE=qwen3_vl

python tools/train_eagle3_online.py \
  --modal_type VLM \
  --target_model_name_or_path "$TARGET_MODEL_NAME_OR_PATH" \
  --draft_model_config_path "$DRAFT_MODEL_CONFIG_PATH" \
  --train_data_path "$TRAIN_DATA_PATH" \
  --eval_data_path "$EVAL_DATA_PATH" \
  --output_dir "$OUTPUT_DIR" \
  --num_train_epochs 100 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps "$GRADIENT" \
  --training_time_test_length "$NUM_SPEC_TOKENS" \
  --num_proc 1 \
  --save_strategy "steps" \
  --save_steps 50 \
  --eval_steps 10 \
  --learning_rate 1e-4 \
  --weight_decay 0.0 \
  --warmup_ratio 0.1 \
  --lr_scheduler_type "cosine" \
  --logging_steps 20 \
  --model_max_length "$MODEL_MAX_LENGTH" \
  --embed_weight_key "$EMBED_WEIGHT_KEY" \
  --chat_template_type "$CHAT_TEMPLATE_TYPE" \
  --report_to none \
  --run_name qwen3-2b-eagle3-angelslim \
  --bf16
Loading draft model config...
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'mrope_interleaved'}
Loading target model with hf backend...
[Single Process] Loading model to device: cuda:0
Target model loaded successfully
Loading draft model...
draft_model_config: LlamaConfig {
"architectures": [
"Eagle3LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"draft_vocab_size": 32000,
"dtype": "bfloat16",
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 2048,
"image_token_id": 151655,
"initializer_range": 0.02,
"intermediate_size": 6144,
"max_position_embeddings": 262144,
"mlp_bias": false,
"modal_type": "VLM",
"model_type": "llama",
"num_attention_heads": 16,
"num_hidden_layers": 1,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_interleaved": true,
"mrope_section": [
24,
20,
20
],
"rope_type": "default",
"type": "default"
},
"rope_theta": 5000000,
"target_model_type": "qwen3_vl",
"tie_word_embeddings": true,
"transformers_version": "4.57.6",
"use_cache": true,
"video_token_id": 151656,
"vision_end_token_id": 151653,
"vision_start_token_id": 151652,
"vocab_size": 151936
}
`torch_dtype` is deprecated! Use `dtype` instead!
Draft model loaded successfully
Creating training and evaluation datasets with chat template type: qwen3_vl...
Generating train split: 2 examples [00:00, 311.59 examples/s]
Processing conversations: 100%|███████████████████████████████████████| 2/2 [00:01<00:00, 1.17 examples/s]
Filtering empty input_ids: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 196.81 examples/s]
Processing conversations: 2 examples [00:01, 1.07s/ examples]
Train dataset size: 2, Eval dataset size: 1
Building vocabulary mapping for draft model...
vocab len(dataset)=2 type(dataset)=<class 'datasets.arrow_dataset.Dataset'>
Added missing tokens to reach draft vocab size: 32000
Total tokens after addition: 32000
top 32000 token frequency ratio: 100.00%
Saved vocab mapping to: outputs/VL-2B/train_driving_2_epochs10_spec3_gradient4/vocab_mapping_cache.pt
Vocabulary mapping built successfully
Initializing trainer...
Starting training...
0%| | 0/100 [00:00<?, ?it/s]Traceback (most recent call last):
File "/data/users/swm/AngelSlim-driving/tools/train_eagle3_online.py", line 404, in <module>
train()
File "/data/users/swm/AngelSlim-driving/tools/train_eagle3_online.py", line 399, in train
trainer.train()
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2325, in train
return inner_training_loop(
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2674, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 4020, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
File "/usr/local/lib/python3.10/dist-packages/angelslim/compressor/speculative/train/trainer/eagle3_trainer.py", line 86, in compute_loss
loss = self.draft_model_training_time_test(
File "/usr/local/lib/python3.10/dist-packages/angelslim/compressor/speculative/train/trainer/eagle3_trainer.py", line 171, in draft_model_training_time_test
hidden_states, cache_hidden = self.draft_model.encode_layers(
File "/usr/local/lib/python3.10/dist-packages/angelslim/compressor/speculative/train/models/draft/llama_eagle3.py", line 585, in encode_layers
layer_outputs, cache_hidden = self.midlayer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/angelslim/compressor/speculative/train/models/draft/llama_eagle3.py", line 523, in forward
hidden_states, latest_hidden_cache = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/angelslim/compressor/speculative/train/models/draft/llama_eagle3.py", line 325, in forward
cos, sin = self.rotary_emb(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/angelslim/compressor/speculative/train/models/draft/llama_eagle3.py", line 222, in forward
inv_freq_expanded.float() @ position_ids_expanded.float()
RuntimeError: The size of tensor a (3) must match the size of tensor b (6) at non-singleton dimension 0
0%| | 0/100 [00:03<?, ?it/s]
{"id": "ea83027f-4dde-4dcd-b7c8-5eb4f87f", "conversations": [{"role": "user", "content": [{"type": "image", "image": "/data/users/swm/AngelSlim-driving/dataset/driver_dataset/twb-0319/localim/CJSB217016CS--717305796300800--1752864697599--camera_front_wide.jpg"}, {"type": "text", "text": "图1图2为行车POV视角相同场景,简单描述当前环境和驾驶相关重点元素"}]}, {"role": "assistant", "content": [{"type": "text", "text": "当前车辆位于道路左侧车道,前方无车辆,左侧车道亦无前车,右侧车道有1辆前车。道路两侧停满车辆,左侧有白色护栏,右侧为树木与建筑。夜间路灯照明,路面标线清晰,限速30km/h,前方绿灯通行,整体环境较安全但需注意右侧车辆及道路狭窄。"}]}]}
{"id": "c9a054c1-f84d-4b37-a5ed-16ced2fe", "conversations": [{"role": "user", "content": [{"type": "image", "image": "/data/users/swm/AngelSlim-driving/dataset/driver_dataset/twb-0319/localim/CJSB217016CS--717305796300800--1752864697599--camera_front_wide.jpg"}, {"type": "text", "text": "图1图2为行车POV视角相同场景,已知导航车信如下:当前道路车道数为4,当前道路限速60,本车当前车速28,前方50米内有路口。用户期望对目标进行超车,给出meta_action和理由。"}]}, {"role": "assistant", "content": [{"type": "text", "text": "<thinking>前方50米内有路口,且当前道路限速60,本车车速28,处于安全可控范围。右侧车道虽有前车,但路口临近,超车易引发变道风险。保持当前车道行驶,可确保安全通过路口,避免紧急操作,提升驾驶舒适性与安全性。<thinking><keep_current_lane>"}]}]}
执行在线训练脚本train_eagle3_vlm_online.sh:
报错日志如下:
训练数据dataset/driver_dataset/twb-0319/camera_wide/10/10.jsonl: