Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
| **auto_round** | W4A16(Recommended), W2A16, W3A16, W8A16, W2A16G64, W2A16G32, `MXFP4`, `MXFP8`, `MXFP4_RCEIL`, `MXFP8_RCEIL`, `NVFP4`, `FPW8A16`, `FP8_STATIC`, `BF16` |
| **auto_awq** | W4A16(Recommended), BF16 |
| **auto_gptq** | W4A16(Recommended), W2A16, W3A16, W8A16, W2A16G64, W2A16G32, BF16 |
| **llm_compressor** | NVFP4(Recommended), `MXFP4`, `MXFP8`, `FPW8A16`, `FP8_STATIC`, `FP8_BLOCK`, `INT8_W8A8`, `W4A16`, `W8A16` |
| **llm_compressor** | NVFP4(Recommended), `MXFP4`, `MXFP8`, `FPW8A16`, `FP8_STATIC`, `FP8_BLOCK`, `INT8`, `W4A16`, `W8A16` |
| **gguf** | GGUF:Q4_K_M(Recommended), GGUF:Q2_K_S, GGUF:Q3_K_S, GGUF:Q3_K_M, GGUF:Q3_K_L, GGUF:Q4_K_S, GGUF:Q5_K_S, GGUF:Q5_K_M, GGUF:Q6_K, GGUF:Q4_0, GGUF:Q4_1, GGUF:Q5_0, GGUF:Q5_1, GGUF:Q8_0 |
| **fake** | `all schemes (only for research)` |
</details>
Expand Down
2 changes: 1 addition & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
|**auto_round**| W4A16(推荐)、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、`MXFP4`​、`MXFP8`​、`MXFP4_RCEIL`​、`MXFP8_RCEIL`​、`NVFP4`​、`FPW8A16`​、`FP8_STATIC`​、`BF16` |
|**auto_awq**| W4A16(推荐)、BF16 |
|**auto_gptq**| W4A16(推荐)、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、BF16 |
|**llm_compressor**| NVFP4(推荐)、`MXFP4`​、`MXFP8`​、`FPW8A16`​、`FP8_STATIC`、`FP8_BLOCK`、`INT8_W8A8`、`W4A16`、`W8A16` |
|**llm_compressor**| NVFP4(推荐)、`MXFP4`​、`MXFP8`​、`FPW8A16`​、`FP8_STATIC`、`FP8_BLOCK`、`INT8`、`W4A16`、`W8A16` |
|**gguf**| GGUF:Q4\_K\_M(推荐)、GGUF:Q2\_K\_S、GGUF:Q3\_K\_S、GGUF:Q3\_K\_M、GGUF:Q3\_K\_L、GGUF:Q4\_K\_S、GGUF:Q5\_K\_S、GGUF:Q5\_K\_M、GGUF:Q6\_K、GGUF:Q4\_0、GGUF:Q4\_1、GGUF:Q5\_0、GGUF:Q5\_1、GGUF:Q8\_0 |
|**fake**| ​`所有方案(仅用于研究)` |
</details>
Expand Down
20 changes: 16 additions & 4 deletions auto_round/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class AutoRoundExportFormat(str, Enum):
NV_FP = "nv_fp"
MX_FP_RCEIL = "mx_fp_rceil"
NV_FP4_WITH_STATIC_GS = "nv_fp4_with_static_gs"
INT8_W8A8 = "int8_w8a8"
INT8 = "int8_w8a8"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the value be changed to INT8 as well?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably better to deprecate it. If a user provides int8_w8a8, we can emit a warning and automatically map it to INT8 instead.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense; updated to add a deprecation warning. INT8_W8A8 is now mapped to INT8 in PRESET_SCHEMES. The enum value here stays `int8_w8a8` because it is used to map to compressed_tensors, whose format uses this name.

FP8_BLOCK = "fp8_block"
MXINT4 = "mxint4"
MX_INT = "mx_int"
Expand Down Expand Up @@ -350,7 +350,18 @@ def save_quantized(

@OutputFormat.register("llm_compressor")
class LLMCompressorFormat(OutputFormat):
support_schemes = ["MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC", "INT8_W8A8", "FP8_BLOCK", "W4A16", "W8A16"]
support_schemes = [
"MXFP4",
"MXFP8",
"NVFP4",
"FPW8A16",
"FP8_STATIC",
"INT8",
"INT8_W8A8",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kept for backward compatibility, since it was already supported in previous releases.

"FP8_BLOCK",
"W4A16",
"W8A16",
]
format_name = "llm_compressor"

def __init__(self, format, ar):
Expand Down Expand Up @@ -388,7 +399,8 @@ def __init__(self, format, ar):
from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported

check_compressed_tensors_supported()
self.backend = LLMCompressorFormat(AutoRoundExportFormat.INT8_W8A8.value, ar)
self.backend = LLMCompressorFormat(AutoRoundExportFormat.INT8.name, ar)
self.backend.output_format = f"llm_compressor:{AutoRoundExportFormat.INT8.value}"
elif is_wint_woq(ar):
from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported

Expand Down Expand Up @@ -475,7 +487,7 @@ def pack_layer(self, layer_name, model, device=None, **kwargs):
from auto_round.export.export_to_llmcompressor.export_to_static_fp import pack_layer

return pack_layer(layer_name, model, self.get_backend_name(), device=device)
elif re.search(f"{AutoRoundExportFormat.INT8_W8A8.value}", self.output_format):
elif re.search(f"{AutoRoundExportFormat.INT8.value}", self.output_format):
from auto_round.export.export_to_llmcompressor.export import pack_layer

return pack_layer(layer_name, model, device=device)
Expand Down
11 changes: 9 additions & 2 deletions auto_round/schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ def preset_name_to_scheme(name: str) -> QuantizationScheme:
if name not in PRESET_SCHEMES:
raise KeyError(f"Unknown preset scheme name {name}, " f"available names: {list(PRESET_SCHEMES.keys())}")

if name == "INT8_W8A8":
logger.warning_once(
"The 'INT8_W8A8' scheme name is deprecated and will be removed in a future release. "
"Please use 'INT8' instead."
)

scheme_args = deepcopy(PRESET_SCHEMES[name])
return scheme_args

Expand Down Expand Up @@ -298,7 +304,7 @@ def is_preset_scheme(name: str) -> bool:
}
)

INT8_W8A8 = QuantizationScheme.from_dict(
INT8 = QuantizationScheme.from_dict(
{
"bits": 8,
"group_size": -1,
Expand Down Expand Up @@ -340,7 +346,8 @@ def is_preset_scheme(name: str) -> bool:
"FP8_STATIC": FP8_STATIC,
"BF16": BF16,
"W4A16_MIXED": W4A16,
"INT8_W8A8": INT8_W8A8,
"INT8": INT8,
"INT8_W8A8": INT8,
"FP8_BLOCK": FP8_BLOCK,
"MXINT4": MXINT4,
}
Expand Down
26 changes: 19 additions & 7 deletions test/test_cpu/export/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,14 +400,25 @@ def test_export_format(self):

autoround = AutoRound(
model=self.model_name,
scheme="INT8_W8A8",
scheme="INT8",
)
format_list = get_formats("llm_compressor, auto_round:llm_compressor", autoround)
assert format_list[0].output_format == "llm_compressor"
assert format_list[0].get_backend_name() == "llm_compressor:int8_w8a8"
assert format_list[1].output_format == "auto_round"
assert format_list[1].get_backend_name() == "auto_round:llm_compressor:int8_w8a8"

# Verify backward compatibility: INT8_W8A8 (old name) produces identical formats to INT8
autoround_old = AutoRound(
model=self.model_name,
scheme="INT8_W8A8",
)
format_list_old = get_formats("llm_compressor, auto_round:llm_compressor", autoround_old)
assert format_list_old[0].output_format == "llm_compressor"
assert format_list_old[0].get_backend_name() == "llm_compressor:int8_w8a8"
assert format_list_old[1].output_format == "auto_round"
assert format_list_old[1].get_backend_name() == "auto_round:llm_compressor:int8_w8a8"

def test_export_format_with_scheme(self, tiny_qwen_model_path):
from auto_round.formats import get_formats

Expand Down Expand Up @@ -472,14 +483,15 @@ def test_autoawq_qwen3_vl_infer(self, dataloader):
), f"'model.visual.blocks' should be in modules_to_not_convert. Got: {modules_to_not_convert}"

@pytest.mark.parametrize(
"iters,use_dataloader",
"iters,use_dataloader,scheme",
[
(0, False), # RTN (no tuning)
(1, True), # with tuning
(0, False, "INT8"), # RTN with new scheme name
(1, True, "INT8"), # tuning with new scheme name
(0, False, "INT8_W8A8"), # RTN with old scheme name (backward compat)
],
ids=["rtn", "tuning"],
ids=["rtn", "tuning", "rtn-old-scheme"],
)
def test_llmc_dynamic_wint8aint8_export(self, iters, use_dataloader, dataloader):
def test_llmc_dynamic_wint8aint8_export(self, iters, use_dataloader, scheme, dataloader):
from safetensors import safe_open

dataset = dataloader if use_dataloader else None
Expand All @@ -489,7 +501,7 @@ def test_llmc_dynamic_wint8aint8_export(self, iters, use_dataloader, dataloader)
nsamples=2,
seqlen=2,
dataset=dataset,
scheme="INT8_W8A8",
scheme=scheme,
)
quantized_model_path = self.save_dir
autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
Expand Down
Loading