6 changes: 4 additions & 2 deletions auto_round/compressors/base.py
@@ -1746,8 +1746,10 @@ def _adjust_immediate_packing_and_saving(self):
self.low_cpu_mem_usage = False
self.is_immediate_saving = False

if self.is_immediate_saving and "int" not in self.data_type:
logger.warning("immediate_saving is only supported for int quantization, set to False")
if self.is_immediate_saving and not (
"int" in self.data_type or is_nv_fp(self.data_type) or is_mx_fp(self.data_type)
):
logger.warning("immediate_saving is only supported for int/nv_fp/mx_fp quantization, set to False")
self.is_immediate_saving = False

if self.orig_output_dir is None:
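The relaxed guard above keeps immediate saving enabled for int, nv_fp, and mx_fp data types and switches it off otherwise. A minimal, self-contained sketch of that control flow is below; the `is_nv_fp`/`is_mx_fp` stand-ins are simplified assumptions, not AutoRound's actual helpers.

```python
# Hedged sketch of the guard in _adjust_immediate_packing_and_saving.
# The two predicates are simplified stand-ins (assumed prefix checks such as
# "nv_fp4" or "mx_fp8"); AutoRound's real helpers may differ.
import logging

logger = logging.getLogger(__name__)


def is_nv_fp(data_type: str) -> bool:  # assumption: prefix-style check
    return data_type.startswith("nv_fp")


def is_mx_fp(data_type: str) -> bool:  # assumption: prefix-style check
    return data_type.startswith("mx_fp")


def adjust_immediate_saving(is_immediate_saving: bool, data_type: str) -> bool:
    """Return the adjusted immediate_saving flag for a given data type."""
    if is_immediate_saving and not ("int" in data_type or is_nv_fp(data_type) or is_mx_fp(data_type)):
        logger.warning("immediate_saving is only supported for int/nv_fp/mx_fp quantization, set to False")
        return False
    return is_immediate_saving


assert adjust_immediate_saving(True, "mx_fp4") is True   # stays enabled
assert adjust_immediate_saving(True, "fp8") is False     # disabled with a warning
```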
3 changes: 2 additions & 1 deletion auto_round/compressors/shard_writer.py
@@ -146,10 +146,11 @@ def _handle_tied_weights(self):
filtered_tensors[name] = tensor
continue

ptr = tensor.untyped_storage().data_ptr()
ptr = tensor.untyped_storage().data_ptr() + tensor.storage_offset() * tensor.element_size()
if ptr not in storage_map:
storage_map.add(ptr)
filtered_tensors[name] = tensor

self.current_shard_tensors = filtered_tensors

def _flush_shard(self):
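The new dedup key is the storage base pointer plus the view's byte offset, which is equivalent to `tensor.data_ptr()`. A minimal plain-PyTorch sketch (not AutoRound code) of why the offset matters: two views into the same storage share `untyped_storage().data_ptr()`, so keying on the storage pointer alone would treat them as tied weights and drop one from the shard.

```python
import torch

buffer = torch.arange(8, dtype=torch.float32)
a = buffer[:4]   # view starting at element 0
b = buffer[4:]   # view starting at element 4, same underlying storage

storage_ptr_a = a.untyped_storage().data_ptr()
storage_ptr_b = b.untyped_storage().data_ptr()
assert storage_ptr_a == storage_ptr_b  # same storage -> same base pointer

# Adding storage_offset() * element_size() distinguishes the two views...
key_a = storage_ptr_a + a.storage_offset() * a.element_size()
key_b = storage_ptr_b + b.storage_offset() * b.element_size()
assert key_a != key_b

# ...while genuine aliases of the same view still collide on the key.
tied = a
assert key_a == tied.untyped_storage().data_ptr() + tied.storage_offset() * tied.element_size()

# Equivalently, the key is just tensor.data_ptr():
assert key_a == a.data_ptr() and key_b == b.data_ptr()
```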
17 changes: 2 additions & 15 deletions test/test_cpu/models/test_moe_model.py
@@ -64,13 +64,7 @@ def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path):

# verify the quantized model can be loaded and run inference
loaded_model = GptOssForCausalLM.from_pretrained(output_dir)
for n, m in quantized_model.named_modules():
if m.__class__.__name__ == "QuantLinear":
loaded_m = loaded_model.get_submodule(n)
if scheme == "MXFP4":
assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all()
if scheme == "MXFP8":
assert (loaded_m.weight.to("cpu") == m.weight.to("cpu")).all()

inp = torch.randint(0, 100, (1, 32))
with torch.inference_mode():
loaded_out = loaded_model(inp)
@@ -84,10 +78,7 @@ def test_llama4(tiny_llama4_model_path):
assert quantized_model is not None, "Quantized model should not be None."

loaded_model = Llama4ForConditionalGeneration.from_pretrained(output_dir)
for n, m in quantized_model.named_modules():
if m.__class__.__name__ == "QuantLinear":
loaded_m = loaded_model.get_submodule(n)
assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all()

inp = torch.randint(0, 100, (1, 32))
with torch.inference_mode():
loaded_out = loaded_model(inp)
@@ -110,10 +101,6 @@ def test_qwen3_vl_moe_mxfp(tiny_qwen3_vl_moe_model_path):
assert quantized_model is not None, "Quantized model should not be None."
loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu")

for n, m in quantized_model.named_modules():
if m.__class__.__name__ == "QuantLinear":
loaded_m = loaded_model.get_submodule(n)
assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all()
inp = torch.randint(0, 100, (1, 32))
with torch.inference_mode():
loaded_out = loaded_model(inp)
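The remaining checks in these tests load the saved model back and run it on a dummy input under `torch.inference_mode()`. A hedged sketch of that output-level roundtrip pattern, using a tiny stand-in module instead of the transformers models the tests actually load:

```python
import torch
import torch.nn as nn

reference = nn.Embedding(100, 16)      # stand-in for the in-memory quantized model
reloaded = nn.Embedding(100, 16)       # stand-in for the model reloaded from disk
reloaded.load_state_dict(reference.state_dict())

inp = torch.randint(0, 100, (1, 32))   # same dummy input shape the tests use
with torch.inference_mode():
    ref_out = reference(inp)
    loaded_out = reloaded(inp)

assert torch.allclose(ref_out, loaded_out), "reloaded model should reproduce the original outputs"
```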
1 change: 1 addition & 0 deletions test/test_cuda/models/test_fp8_model.py
@@ -126,6 +126,7 @@ def test_qwen3_fp8_moe_mxfp(tiny_fp8_qwen_moe_model_path, mock_fp8_capable_devic
nsamples=2,
seqlen=32,
iters=0,
low_cpu_mem_usage=False,
)
quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
assert quantized_model is not None, "Quantized model should not be None."