diff --git a/examples/quantization/brevitas/quantize_llm.py b/examples/quantization/brevitas/quantize_llm.py index e61fe8a5..32e85a52 100644 --- a/examples/quantization/brevitas/quantize_llm.py +++ b/examples/quantization/brevitas/quantize_llm.py @@ -1,6 +1,6 @@ from argparse import ArgumentParser -from optimum.amd import BrevitasQuantizationConfig, BrevitasQuantizer +from optimum.amd import AutoQuantizationConfig, BrevitasQuantizer from optimum.amd.brevitas.accelerate_utils import calc_cpu_device_map, calc_gpu_device_map, offload_model, remove_hooks from optimum.amd.brevitas.data_utils import compute_perplexity, get_dataset_for_model from optimum.amd.brevitas.export import onnx_export_from_quantized_model @@ -14,13 +14,10 @@ def main(args): use_accelerate = args.device == "auto" # Prepare the quantizer, specifying its configuration and loading the model. - qconfig = BrevitasQuantizationConfig( + qconfig = AutoQuantizationConfig.ipu_transformers_config( apply_gptq=args.apply_gptq, apply_weight_equalization=args.apply_weight_equalization, activations_equalization=args.activations_equalization, - is_static=args.is_static, - weights_symmetric=True, - activations_symmetric=args.is_static, # ONNX export only supports unsigned for dynamic quantization gpu_device_map=args.gpu_device_map, cpu_device_map=args.cpu_device_map, ) diff --git a/optimum/amd/__init__.py b/optimum/amd/__init__.py index 204ad858..754a5d28 100644 --- a/optimum/amd/__init__.py +++ b/optimum/amd/__init__.py @@ -8,6 +8,7 @@ _import_structure = { "brevitas.configuration": [ + "AutoQuantizationConfig", "BrevitasQuantizationConfig", ], "brevitas.quantizer": [ @@ -16,7 +17,7 @@ } if TYPE_CHECKING: - from .brevitas.configuration import BrevitasQuantizationConfig + from .brevitas.configuration import AutoQuantizationConfig, BrevitasQuantizationConfig from .brevitas.quantizer import BrevitasQuantizer else: import sys diff --git a/optimum/amd/brevitas/__init__.py b/optimum/amd/brevitas/__init__.py index 
a4227479..edd95d12 100644 --- a/optimum/amd/brevitas/__init__.py +++ b/optimum/amd/brevitas/__init__.py @@ -1,6 +1,6 @@ # Copyright 2023 The HuggingFace Team. All rights reserved. # Licensed under the MIT License. -from .configuration import BrevitasQuantizationConfig +from .configuration import AutoQuantizationConfig, BrevitasQuantizationConfig from .data_utils import get_dataset_for_model from .quantizer import BrevitasQuantizer diff --git a/optimum/amd/brevitas/configuration.py b/optimum/amd/brevitas/configuration.py index fb81e5be..49e2f736 100644 --- a/optimum/amd/brevitas/configuration.py +++ b/optimum/amd/brevitas/configuration.py @@ -17,7 +17,7 @@ class BrevitasQuantizationConfig: Bitwidth of the activations quantization. weights_only (`bool`, defaults to `False`): If set to `True`, only weights are to be quantized, otherwise activations are quantized as well. - weights_param_method (`str`, defaults to `stats`): + weights_calibration_method (`str`, defaults to `stats`): Strategy to use to estimate the quantization parameters (scale, zero-point) for the weights. Two strategies are available: - `"stats"`: Use min-max to estimate the range to quantize on. - `"mse"`: Use mean-square error between the unquantized weights and quantized weights to estimate the range to quantize on. @@ -34,7 +34,7 @@ class BrevitasQuantizationConfig: Group size to use for the weights in case `weights_quant_granularity="per_group"`. Defaults to `128` in this case, to `None` otherwise. quantize_zero_point (`bool`, defaults to `True`): When set to True, the unquantized value 0.0 is exactly representable as a quantized value: the zero point. When set to False, a quantization range [a, b] is exactly reprensentable (no rounding on a and b), but the unquantized value zero is not exactly representable. - activations_param_method (`List[str]`): + activations_calibration_method (`str`, defaults to `stats`): Strategy to use to estimate the quantization parameters (scale, zero-point) for the activations. 
Two strategies are available: - `"stats"`: Use min-max to estimate the range to quantize on. - `"mse"`: Use mean-square error between the unquantized activations and quantized activations to estimate the range to quantize on. @@ -64,18 +64,18 @@ class BrevitasQuantizationConfig: weights_bitwidth: int = 8 activations_bitwidth: Optional[int] = 8 weights_only: bool = False - weights_param_method: Literal["stats", "mse"] = "stats" + weights_calibration_method: Literal["stats", "mse"] = "stats" weights_symmetric: bool = True scale_precision: Literal["float_scale", "power_of_two_scale"] = "float_scale" - weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_tensor" + weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_channel" weights_group_size: Optional[int] = None quantize_zero_point: bool = True - activations_param_method: Optional[Literal["stats", "mse"]] = "stats" + activations_calibration_method: Optional[Literal["stats", "mse"]] = "stats" is_static: bool = False activations_symmetric: Optional[bool] = False activations_quant_granularity: Optional[Literal["per_tensor", "per_row", "per_group"]] = "per_tensor" activations_group_size: Optional[int] = None - activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = "cross_layer" + activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = None apply_weight_equalization: bool = False apply_bias_correction: bool = False apply_gptq: bool = False @@ -99,9 +99,9 @@ def __post_init__(self): f'Static quantization with activations_quant_granularity="{self.activations_quant_granularity}" is not supported. The quantization granularity must be activations_quant_granularity="per_tensor" when using static quantization.' 
) - if self.weights_quant_granularity == "per_group" and self.weights_param_method == "mse": + if self.weights_quant_granularity == "per_group" and self.weights_calibration_method == "mse": raise ValueError( - 'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_param_method="mse"`. Per group MSE weight quantization is not supported.' + 'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_calibration_method="mse"`. Per group MSE weight quantization is not supported.' ) if self.scale_precision == "power_of_two_scale" and ( @@ -126,12 +126,40 @@ def __post_init__(self): 'The quantization configuration `scale_precision="power_of_two_scale"` is not supported along `is_static=False`. Dynamic activation quantization with power-of-two scale factor is not supported.' ) + if self.activations_calibration_method == "mse" and self.is_static: + raise ValueError( + 'The quantization configuration `activations_calibration_method="mse"` is not supported along `is_static=True`. Static activation quantization with mse calibration is not supported.' 
+ ) + if self.weights_only: self.activations_bitwidth = None self.activations_symmetric = None self.activations_equalization = None self.activations_group_size = None - self.activations_param_method = None + self.activations_calibration_method = None def requires_fx_graph(self): return self.activations_equalization == "cross_layer" or self.apply_weight_equalization + + +class AutoQuantizationConfig: + @staticmethod + def ipu_transformers_config( + activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = None, + apply_weight_equalization: bool = False, + apply_bias_correction: bool = False, + apply_gptq: bool = False, + gptq_act_order: bool = False, + gpu_device_map: Optional[Dict[int, float]] = None, + cpu_device_map: Optional[Dict[int, float]] = None, + ): + return BrevitasQuantizationConfig( + weights_quant_granularity="per_tensor", + activations_equalization=activations_equalization, + apply_weight_equalization=apply_weight_equalization, + apply_bias_correction=apply_bias_correction, + apply_gptq=apply_gptq, + gptq_act_order=gptq_act_order, + gpu_device_map=gpu_device_map, + cpu_device_map=cpu_device_map, + ) diff --git a/optimum/amd/brevitas/quantizer.py b/optimum/amd/brevitas/quantizer.py index fcc85e8c..5b7d7393 100644 --- a/optimum/amd/brevitas/quantizer.py +++ b/optimum/amd/brevitas/quantizer.py @@ -184,7 +184,7 @@ def quantize( weight_quant_format="int", weight_quant_type="sym" if quantization_config.weights_symmetric else "asym", weight_bit_width=quantization_config.weights_bitwidth, - weight_param_method=quantization_config.weights_param_method, + weight_param_method=quantization_config.weights_calibration_method, weight_scale_precision=quantization_config.scale_precision, weight_quant_granularity=quantization_config.weights_quant_granularity, weight_group_size=quantization_config.weights_group_size, @@ -192,7 +192,7 @@ def quantize( input_bit_width=None if quantization_config.weights_only else 
quantization_config.activations_bitwidth, input_quant_type="sym" if quantization_config.activations_symmetric else "asym", input_quant_format="int", - input_param_method=quantization_config.activations_param_method, + input_param_method=quantization_config.activations_calibration_method, input_scale_precision=quantization_config.scale_precision, input_scale_type="static" if quantization_config.is_static else "dynamic", input_quant_granularity=quantization_config.activations_quant_granularity,