This error occurs when running inference with Qwen/Qwen2.5-VL-32B-Instruct-AWQ on Windows 11 (64-bit); the same code works properly on WSL Ubuntu-22.04.
Win11 Python environment (pip list):
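For reference, a minimal sketch of the failing inference path, reconstructed from the traceback below. The model-loading arguments, prompts, and image path are my assumptions, not the exact notebook code (the notebook wraps this in a helper called inference_32B_4bit_resize):

# Minimal repro sketch (assumptions: loading args, prompts, image path).
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

model_id = "Qwen/Qwen2.5-VL-32B-Instruct-AWQ"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_id)

image = Image.open("test_image.png")  # placeholder path
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# These lines match the traceback (Cell In[1], lines 172-177):
inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
inputs = inputs.to("cuda")

output_ids = model.generate(**inputs, max_new_tokens=512)  # fails here on Win11
generated_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
print(processor.batch_decode(generated_ids, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)[0])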
accelerate 1.6.0
aiohappyeyeballs 2.6.1
aiohttp 3.11.16
aiosignal 1.3.2
annotated-types 0.7.0
anyio 4.9.0
asttokens 3.0.0
async-timeout 5.0.1
attrs 25.3.0
autoawq 0.2.8
autoawq_kernels 0.0.7
autopep8 2.3.2
av 14.3.0
bitsandbytes 0.45.3
certifi 2025.1.31
charset-normalizer 3.4.1
colorama 0.4.6
comm 0.2.2
compressed-tensors 0.9.3
datasets 3.5.0
debugpy 1.8.13
decorator 5.2.1
decord 0.6.0
device-smi 0.4.1
dill 0.3.8
distro 1.9.0
exceptiongroup 1.2.2
executing 2.2.0
filelock 3.18.0
frozenlist 1.5.0
fsspec 2024.12.0
gekko 1.3.0
gptqmodel 2.2.0
h11 0.14.0
hf_transfer 0.1.9
httpcore 1.0.7
httpx 0.28.1
huggingface-hub 0.30.1
idna 3.10
iniconfig 2.1.0
intel-openmp 2021.4.0
ipykernel 6.29.5
ipython 8.34.0
jedi 0.19.2
Jinja2 3.1.6
jiter 0.9.0
jupyter_client 8.6.3
jupyter_core 5.7.2
logbar 0.0.4
MarkupSafe 3.0.2
matplotlib-inline 0.1.7
mkl 2021.4.0
mpmath 1.3.0
multidict 6.3.2
multiprocess 0.70.16
nest-asyncio 1.6.0
networkx 3.4.2
numpy 2.2.4
openai 1.70.0
optimum 1.24.0
packaging 24.2
pandas 2.2.3
parso 0.8.4
peft 0.15.1
pillow 11.1.0
pip 25.0
platformdirs 4.3.7
pluggy 1.5.0
prompt_toolkit 3.0.50
propcache 0.3.1
protobuf 6.30.2
psutil 7.0.0
pure_eval 0.2.3
pyarrow 19.0.1
pycodestyle 2.13.0
pydantic 2.11.2
pydantic_core 2.33.1
Pygments 2.19.1
pytest 8.3.5
python-dateutil 2.9.0.post0
pytz 2025.2
pywin32 310
PyYAML 6.0.2
pyzmq 26.4.0
qwen-vl-utils 0.0.8
random_word 1.0.13
regex 2024.11.6
requests 2.32.3
safetensors 0.5.3
setuptools 75.8.0
six 1.17.0
sniffio 1.3.1
stack-data 0.6.3
sympy 1.13.1
tbb 2021.13.1
threadpoolctl 3.6.0
tokenicer 0.0.4
tokenizers 0.21.1
tomli 2.2.1
torch 2.6.0+cu124
torchaudio 2.6.0+cu124
torchvision 0.21.0+cu124
tornado 6.4.2
tqdm 4.67.1
traitlets 5.14.3
transformers 4.52.0.dev0
typing_extensions 4.13.1
typing-inspection 0.4.0
tzdata 2025.2
urllib3 2.3.0
wcwidth 0.2.13
wheel 0.45.1
xxhash 3.5.0
yarl 1.19.0
zstandard 0.23.0
Error log from VS Code:
RuntimeError Traceback (most recent call last)
Cell In[3], line 17
12 image = Image.open(image_path)
13 # image = image.resize( (int(image.size[0]/2), int(image.size[1]/2)) )
14 # display(image)
15
16 ## Use a local HuggingFace model to inference.
---> 17 response = inference_32B_4bit_resize(image_path, sys_prompt=system_prompt,prompt=user_prompt)
18 print(response)
20 torch.cuda.empty_cache()
Cell In[1], line 175
172 inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
173 inputs = inputs.to('cuda')
--> 175 output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
176 generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
177 output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\utils\_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\transformers\generation\utils.py:2460, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, **kwargs)
2452 input_ids, model_kwargs = self._expand_inputs_for_generation(
2453 input_ids=input_ids,
2454 expand_size=generation_config.num_return_sequences,
2455 is_encoder_decoder=self.config.is_encoder_decoder,
2456 **model_kwargs,
2457 )
2459 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2460 result = self._sample(
2461 input_ids,
2462 logits_processor=prepared_logits_processor,
2463 stopping_criteria=prepared_stopping_criteria,
2464 generation_config=generation_config,
2465 synced_gpus=synced_gpus,
2466 streamer=streamer,
2467 **model_kwargs,
2468 )
2470 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2471 # 11. interleave input_ids with num_beams additional sequences per batch
2472 input_ids, model_kwargs = self._expand_inputs_for_generation(
2473 input_ids=input_ids,
2474 expand_size=generation_config.num_beams,
2475 is_encoder_decoder=self.config.is_encoder_decoder,
2476 **model_kwargs,
2477 )
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\transformers\generation\utils.py:3426, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3423 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3425 if is_prefill:
-> 3426 outputs = self(**model_inputs, return_dict=True)
3427 is_prefill = False
3428 else:
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\transformers\models\qwen2_5_vl\modeling_qwen2_5_vl.py:1839, in Qwen2_5_VLForConditionalGeneration.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts)
1825 outputs = self.model(
1826 input_ids=None,
1827 position_ids=position_ids,
(...)
1835 cache_position=cache_position,
1836 )
1838 hidden_states = outputs[0]
-> 1839 logits = self.lm_head(hidden_states)
1841 loss = None
1842 if labels is not None:
1843 # Upcast to float if we need to compute the loss to avoid potential precision issues
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\modules\linear\gemm.py:271, in WQLinear_GEMM.forward(self, x)
269 else:
270 with torch.no_grad():
--> 271 out = WQLinearMMFunction.apply(
272 x,
273 self.qweight,
274 self.qzeros,
275 self.scales,
276 self.w_bit,
277 self.group_size,
278 self.bias,
279 self.out_features,
280 )
282 if input_dtype != torch.float16:
283 out = out.to(dtype=input_dtype)
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\autograd\function.py:575, in Function.apply(cls, *args, **kwargs)
572 if not torch._C._are_functorch_transforms_active():
573 # See NOTE: [functorch vjp and autograd interaction]
574 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 575 return super().apply(*args, **kwargs) # type: ignore[misc]
577 if not is_setup_ctx_defined:
578 raise RuntimeError(
579 "In order to use an autograd.Function with functorch transforms "
580 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
581 "staticmethod. For more details, please see "
582 "https://pytorch.org/docs/main/notes/extending.func.html"
583 )
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\modules\linear\gemm.py:74, in WQLinearMMFunction.forward(ctx, x, qweight, qzeros, scales, w_bit, group_size, bias, out_features)
72 warnings.warn("Using naive (slow) implementation." + msg)
73 user_has_been_warned = True
---> 74 out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size)
75 out = torch.matmul(x, out)
77 out = out + bias if bias is not None else out
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\utils\packing_utils.py:89, in dequantize_gemm(qweight, qzeros, scales, bits, group_size)
87 def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
88 # Unpack the qweight and qzeros tensors
---> 89 iweight, izeros = unpack_awq(qweight, qzeros, bits)
90 # Reverse the order of the iweight and izeros tensors
91 iweight, izeros = reverse_awq_order(iweight, izeros, bits)
File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\utils\packing_utils.py:12, in unpack_awq(qweight, qzeros, bits)
9 shifts = torch.arange(0, 32, bits, device=qzeros.device)
11 # unpacking columnwise
---> 12 iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
13 torch.int8 # smallest dtype available
14 )
15 iweights = iweights.view(iweights.shape[0], -1)
17 # unpacking columnwise
RuntimeError: "rshift_cuda" not implemented for 'Half'