huggingface · Giuseppe5 · Mar 17, 2024 · Apr 8, 2024
diff --git a/examples/quantization/brevitas/quantize_llm.py b/examples/quantization/brevitas/quantize_llm.py
@@ -1,6 +1,6 @@
 from argparse import ArgumentParser
 
-from optimum.amd import BrevitasQuantizationConfig, BrevitasQuantizer
+from optimum.amd import AutoQuantizationConfig, BrevitasQuantizer
 from optimum.amd.brevitas.accelerate_utils import calc_cpu_device_map, calc_gpu_device_map, offload_model, remove_hooks
 from optimum.amd.brevitas.data_utils import compute_perplexity, get_dataset_for_model
 from optimum.amd.brevitas.export import onnx_export_from_quantized_model
@@ -14,13 +14,10 @@ def main(args):
     use_accelerate = args.device == "auto"
 
     # Prepare the quantizer, specifying its configuration and loading the model.
-    qconfig = BrevitasQuantizationConfig(
+    qconfig = AutoQuantizationConfig.ipu_transformers_config(
         apply_gptq=args.apply_gptq,
         apply_weight_equalization=args.apply_weight_equalization,
         activations_equalization=args.activations_equalization,
-        is_static=args.is_static,
-        weights_symmetric=True,
-        activations_symmetric=args.is_static,  # ONNX export only supports unsigned for dynamic quantization
         gpu_device_map=args.gpu_device_map,
         cpu_device_map=args.cpu_device_map,
     )

diff --git a/optimum/amd/__init__.py b/optimum/amd/__init__.py
@@ -8,6 +8,7 @@
 
 _import_structure = {
     "brevitas.configuration": [
+        "AutoQuantizationConfig",
         "BrevitasQuantizationConfig",
     ],
     "brevitas.quantizer": [
@@ -16,7 +17,7 @@
 }
 
 if TYPE_CHECKING:
-    from .brevitas.configuration import BrevitasQuantizationConfig
+    from .brevitas.configuration import AutoQuantizationConfig, BrevitasQuantizationConfig
     from .brevitas.quantizer import BrevitasQuantizer
 else:
     import sys

diff --git a/optimum/amd/brevitas/__init__.py b/optimum/amd/brevitas/__init__.py
@@ -1,6 +1,6 @@
 # Copyright 2023 The HuggingFace Team. All rights reserved.
 # Licensed under the MIT License.
 
-from .configuration import BrevitasQuantizationConfig
+from .configuration import AutoQuantizationConfig, BrevitasQuantizationConfig
 from .data_utils import get_dataset_for_model
 from .quantizer import BrevitasQuantizer
diff --git a/optimum/amd/brevitas/configuration.py b/optimum/amd/brevitas/configuration.py
@@ -17,7 +17,7 @@ class BrevitasQuantizationConfig:
             Bitwidth of the activations quantization.
         weights_only (`bool`, defaults to `False`):
             If set to `True`, only weights are to be quantized, otherwise activations are quantized as well.
-        weights_param_method (`str`, defaults to `stats`):
+        weights_calibration_method (`str`, defaults to `stats`):
             Strategy to use to estimate the quantization parameters (scale, zero-point) for the weights. Two strategies are available:
             - `"stats"`: Use min-max to estimate the range to quantize on.
             - `"mse"`: Use mean-square error between the unquantized weights and quantized weights to estimate the range to quantize on.
@@ -34,7 +34,7 @@ class BrevitasQuantizationConfig:
             Group size to use for the weights in case `weights_quant_granularity="per_group"`. Defaults to `128` in this case, to `None` otherwise.
         quantize_zero_point (`bool`, defaults to `True`):
             When set to True, the unquantized value 0.0 is exactly representable as a quantized value: the zero point. When set to False, a quantization range [a, b] is exactly reprensentable (no rounding on a and b), but the unquantized value zero is not exactly representable.
-        activations_param_method (`List[str]`):
+        activations_calibration_method (`List[str]`):
             Strategy to use to estimate the quantization parameters (scale, zero-point) for the activations. Two strategies are available:
             - `"stats"`: Use min-max to estimate the range to quantize on.
             - `"mse"`: Use mean-square error between the unquantized activations and quantized activations to estimate the range to quantize on.
@@ -64,18 +64,18 @@ class BrevitasQuantizationConfig:
     weights_bitwidth: int = 8
     activations_bitwidth: Optional[int] = 8
     weights_only: bool = False
-    weights_param_method: Literal["stats", "mse"] = "stats"
+    weights_calibration_method: Literal["stats", "mse"] = "stats"
     weights_symmetric: bool = True
     scale_precision: Literal["float_scale", "power_of_two_scale"] = "float_scale"
-    weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_tensor"
+    weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_channel"
     weights_group_size: Optional[int] = None
     quantize_zero_point: bool = True
-    activations_param_method: Optional[Literal["stats", "mse"]] = "stats"
+    activations_calibration_method: Optional[Literal["stats", "mse"]] = "stats"
     is_static: bool = False
     activations_symmetric: Optional[bool] = False
     activations_quant_granularity: Optional[Literal["per_tensor", "per_row", "per_group"]] = "per_tensor"
     activations_group_size: Optional[int] = None
-    activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = "cross_layer"
+    activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = None
     apply_weight_equalization: bool = False
     apply_bias_correction: bool = False
     apply_gptq: bool = False
@@ -99,9 +99,9 @@ def __post_init__(self):
                 f'Static quantization with activations_quant_granularity="{self.activations_quant_granularity}" is not supported. The quantization granularity must be activations_quant_granularity="per_tensor" when using static quantization.'
             )
 
-        if self.weights_quant_granularity == "per_group" and self.weights_param_method == "mse":
+        if self.weights_quant_granularity == "per_group" and self.weights_calibration_method == "mse":
             raise ValueError(
-                'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_param_method="mse"`. Per group MSE weight quantization is not supported.'
+                'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_calibration_method="mse"`. Per group MSE weight quantization is not supported.'
             )
 
         if self.scale_precision == "power_of_two_scale" and (
@@ -126,12 +126,47 @@ def __post_init__(self):
                 'The quantization configuration `scale_precision="power_of_two_scale"` is not supported along `is_static=False`. Dynamic activation quantization with power-of-two scale factor is not supported.'
             )
 
+        if self.activations_calibration_method == "mse" and not self.is_static:
+            raise ValueError(
+                'The quantization configuration `activations_calibration_method="mse"` is not supported along `is_static=False`. Dynamic activation quantization with mse calibration is not supported.'
+            )
+
         if self.weights_only:
             self.activations_bitwidth = None
             self.activations_symmetric = None
             self.activations_equalization = None
             self.activations_group_size = None
-            self.activations_param_method = None
+            self.activations_calibration_method = None
 
     def requires_fx_graph(self):
         return self.activations_equalization == "cross_layer" or self.apply_weight_equalization
+
+
+class AutoQuantizationConfig:
+    """Factory for preset `BrevitasQuantizationConfig` instances."""
+
+    @staticmethod
+    def ipu_transformers_config(
+        activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = None,
+        apply_weight_equalization: bool = False,
+        apply_bias_correction: bool = False,
+        apply_gptq: bool = False,
+        gptq_act_order: bool = False,
+        gpu_device_map: Optional[Dict[int, float]] = None,
+        cpu_device_map: Optional[Dict[int, float]] = None,
+    ):
+        """Build the preset configuration with per-tensor weight quantization.
+
+        The arguments are forwarded unchanged to `BrevitasQuantizationConfig`; every
+        other field keeps that class's default value.
+        """
+        return BrevitasQuantizationConfig(
+            weights_quant_granularity="per_tensor",
+            activations_equalization=activations_equalization,
+            apply_weight_equalization=apply_weight_equalization,
+            apply_bias_correction=apply_bias_correction,
+            apply_gptq=apply_gptq,
+            gptq_act_order=gptq_act_order,
+            gpu_device_map=gpu_device_map,
+            cpu_device_map=cpu_device_map,
+        )
diff --git a/optimum/amd/brevitas/quantizer.py b/optimum/amd/brevitas/quantizer.py
@@ -184,15 +184,15 @@ def quantize(
             weight_quant_format="int",
             weight_quant_type="sym" if quantization_config.weights_symmetric else "asym",
             weight_bit_width=quantization_config.weights_bitwidth,
-            weight_param_method=quantization_config.weights_param_method,
+            weight_param_method=quantization_config.weights_calibration_method,
             weight_scale_precision=quantization_config.scale_precision,
             weight_quant_granularity=quantization_config.weights_quant_granularity,
             weight_group_size=quantization_config.weights_group_size,
             quantize_weight_zero_point=quantization_config.quantize_zero_point,
             input_bit_width=None if quantization_config.weights_only else quantization_config.activations_bitwidth,
             input_quant_type="sym" if quantization_config.activations_symmetric else "asym",
             input_quant_format="int",
-            input_param_method=quantization_config.activations_param_method,
+            input_param_method=quantization_config.activations_calibration_method,
             input_scale_precision=quantization_config.scale_precision,
             input_scale_type="static" if quantization_config.is_static else "dynamic",
             input_quant_granularity=quantization_config.activations_quant_granularity,