Skip to content
This repository was archived by the owner on May 11, 2025. It is now read-only.
This repository was archived by the owner on May 11, 2025. It is now read-only.

RuntimeError: "rshift_cuda" not implemented for 'Half' #745

@RoadToNowhereX

Description

@RoadToNowhereX

This error occurs when running inference with Qwen/Qwen2.5-VL-32B-Instruct-AWQ on Windows 11 (64-bit); the same model works properly on WSL Ubuntu 22.04.

Windows 11 Python environment:
accelerate 1.6.0
aiohappyeyeballs 2.6.1
aiohttp 3.11.16
aiosignal 1.3.2
annotated-types 0.7.0
anyio 4.9.0
asttokens 3.0.0
async-timeout 5.0.1
attrs 25.3.0
autoawq 0.2.8
autoawq_kernels 0.0.7
autopep8 2.3.2
av 14.3.0
bitsandbytes 0.45.3
certifi 2025.1.31
charset-normalizer 3.4.1
colorama 0.4.6
comm 0.2.2
compressed-tensors 0.9.3
datasets 3.5.0
debugpy 1.8.13
decorator 5.2.1
decord 0.6.0
device-smi 0.4.1
dill 0.3.8
distro 1.9.0
exceptiongroup 1.2.2
executing 2.2.0
filelock 3.18.0
frozenlist 1.5.0
fsspec 2024.12.0
gekko 1.3.0
gptqmodel 2.2.0
h11 0.14.0
hf_transfer 0.1.9
httpcore 1.0.7
httpx 0.28.1
huggingface-hub 0.30.1
idna 3.10
iniconfig 2.1.0
intel-openmp 2021.4.0
ipykernel 6.29.5
ipython 8.34.0
jedi 0.19.2
Jinja2 3.1.6
jiter 0.9.0
jupyter_client 8.6.3
jupyter_core 5.7.2
logbar 0.0.4
MarkupSafe 3.0.2
matplotlib-inline 0.1.7
mkl 2021.4.0
mpmath 1.3.0
multidict 6.3.2
multiprocess 0.70.16
nest-asyncio 1.6.0
networkx 3.4.2
numpy 2.2.4
openai 1.70.0
optimum 1.24.0
packaging 24.2
pandas 2.2.3
parso 0.8.4
peft 0.15.1
pillow 11.1.0
pip 25.0
platformdirs 4.3.7
pluggy 1.5.0
prompt_toolkit 3.0.50
propcache 0.3.1
protobuf 6.30.2
psutil 7.0.0
pure_eval 0.2.3
pyarrow 19.0.1
pycodestyle 2.13.0
pydantic 2.11.2
pydantic_core 2.33.1
Pygments 2.19.1
pytest 8.3.5
python-dateutil 2.9.0.post0
pytz 2025.2
pywin32 310
PyYAML 6.0.2
pyzmq 26.4.0
qwen-vl-utils 0.0.8
random_word 1.0.13
regex 2024.11.6
requests 2.32.3
safetensors 0.5.3
setuptools 75.8.0
six 1.17.0
sniffio 1.3.1
stack-data 0.6.3
sympy 1.13.1
tbb 2021.13.1
threadpoolctl 3.6.0
tokenicer 0.0.4
tokenizers 0.21.1
tomli 2.2.1
torch 2.6.0+cu124
torchaudio 2.6.0+cu124
torchvision 0.21.0+cu124
tornado 6.4.2
tqdm 4.67.1
traitlets 5.14.3
transformers 4.52.0.dev0
typing_extensions 4.13.1
typing-inspection 0.4.0
tzdata 2025.2
urllib3 2.3.0
wcwidth 0.2.13
wheel 0.45.1
xxhash 3.5.0
yarl 1.19.0
zstandard 0.23.0

Error log from VS Code:

RuntimeError Traceback (most recent call last)
Cell In[3], line 17
12 image = Image.open(image_path)
13 # image = image.resize( (int(image.size[0]/2), int(image.size[1]/2)) )
14 # display(image)
15
16 ## Use a local HuggingFace model to inference.
---> 17 response = inference_32B_4bit_resize(image_path, sys_prompt=system_prompt,prompt=user_prompt)
18 print(response)
20 torch.cuda.empty_cache()

Cell In[1], line 175
172 inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
173 inputs = inputs.to('cuda')
--> 175 output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
176 generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
177 output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\utils\_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\transformers\generation\utils.py:2460, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, **kwargs)
2452 input_ids, model_kwargs = self._expand_inputs_for_generation(
2453 input_ids=input_ids,
2454 expand_size=generation_config.num_return_sequences,
2455 is_encoder_decoder=self.config.is_encoder_decoder,
2456 **model_kwargs,
2457 )
2459 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2460 result = self._sample(
2461 input_ids,
2462 logits_processor=prepared_logits_processor,
2463 stopping_criteria=prepared_stopping_criteria,
2464 generation_config=generation_config,
2465 synced_gpus=synced_gpus,
2466 streamer=streamer,
2467 **model_kwargs,
2468 )
2470 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2471 # 11. interleave input_ids with num_beams additional sequences per batch
2472 input_ids, model_kwargs = self._expand_inputs_for_generation(
2473 input_ids=input_ids,
2474 expand_size=generation_config.num_beams,
2475 is_encoder_decoder=self.config.is_encoder_decoder,
2476 **model_kwargs,
2477 )

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\transformers\generation\utils.py:3426, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3423 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3425 if is_prefill:
-> 3426 outputs = self(**model_inputs, return_dict=True)
3427 is_prefill = False
3428 else:

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\transformers\models\qwen2_5_vl\modeling_qwen2_5_vl.py:1839, in Qwen2_5_VLForConditionalGeneration.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts)
1825 outputs = self.model(
1826 input_ids=None,
1827 position_ids=position_ids,
(...)
1835 cache_position=cache_position,
1836 )
1838 hidden_states = outputs[0]
-> 1839 logits = self.lm_head(hidden_states)
1841 loss = None
1842 if labels is not None:
1843 # Upcast to float if we need to compute the loss to avoid potential precision issues

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\modules\linear\gemm.py:271, in WQLinear_GEMM.forward(self, x)
269 else:
270 with torch.no_grad():
--> 271 out = WQLinearMMFunction.apply(
272 x,
273 self.qweight,
274 self.qzeros,
275 self.scales,
276 self.w_bit,
277 self.group_size,
278 self.bias,
279 self.out_features,
280 )
282 if input_dtype != torch.float16:
283 out = out.to(dtype=input_dtype)

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\torch\autograd\function.py:575, in Function.apply(cls, *args, **kwargs)
572 if not torch._C._are_functorch_transforms_active():
573 # See NOTE: [functorch vjp and autograd interaction]
574 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 575 return super().apply(*args, **kwargs) # type: ignore[misc]
577 if not is_setup_ctx_defined:
578 raise RuntimeError(
579 "In order to use an autograd.Function with functorch transforms "
580 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
581 "staticmethod. For more details, please see "
582 "https://pytorch.org/docs/main/notes/extending.func.html"
583 )

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\modules\linear\gemm.py:74, in WQLinearMMFunction.forward(ctx, x, qweight, qzeros, scales, w_bit, group_size, bias, out_features)
72 warnings.warn("Using naive (slow) implementation." + msg)
73 user_has_been_warned = True
---> 74 out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size)
75 out = torch.matmul(x, out)
77 out = out + bias if bias is not None else out

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\utils\packing_utils.py:89, in dequantize_gemm(qweight, qzeros, scales, bits, group_size)
87 def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
88 # Unpack the qweight and qzeros tensors
---> 89 iweight, izeros = unpack_awq(qweight, qzeros, bits)
90 # Reverse the order of the iweight and izeros tensors
91 iweight, izeros = reverse_awq_order(iweight, izeros, bits)

File e:\CondaEnvironments\qwen-vl_env\lib\site-packages\awq\utils\packing_utils.py:12, in unpack_awq(qweight, qzeros, bits)
9 shifts = torch.arange(0, 32, bits, device=qzeros.device)
11 # unpacking columnwise
---> 12 iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
13 torch.int8 # smallest dtype available
14 )
15 iweights = iweights.view(iweights.shape[0], -1)
17 # unpacking columnwise

RuntimeError: "rshift_cuda" not implemented for 'Half'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions