Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions examples/quantization/brevitas/quantize_llm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from argparse import ArgumentParser

from optimum.amd import BrevitasQuantizationConfig, BrevitasQuantizer
from optimum.amd import AutoQuantizationConfig, BrevitasQuantizer
from optimum.amd.brevitas.accelerate_utils import calc_cpu_device_map, calc_gpu_device_map, offload_model, remove_hooks
from optimum.amd.brevitas.data_utils import compute_perplexity, get_dataset_for_model
from optimum.amd.brevitas.export import onnx_export_from_quantized_model
Expand All @@ -14,13 +14,10 @@ def main(args):
use_accelerate = args.device == "auto"

# Prepare the quantizer, specifying its configuration and loading the model.
qconfig = BrevitasQuantizationConfig(
qconfig = AutoQuantizationConfig.ipu_transformers_config(
apply_gptq=args.apply_gptq,
apply_weight_equalization=args.apply_weight_equalization,
activations_equalization=args.activations_equalization,
is_static=args.is_static,
weights_symmetric=True,
activations_symmetric=args.is_static, # ONNX export only supports unsigned for dynamic quantization
gpu_device_map=args.gpu_device_map,
cpu_device_map=args.cpu_device_map,
)
Expand Down
3 changes: 2 additions & 1 deletion optimum/amd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

_import_structure = {
"brevitas.configuration": [
"AutoQuantizationConfig",
"BrevitasQuantizationConfig",
],
"brevitas.quantizer": [
Expand All @@ -16,7 +17,7 @@
}

if TYPE_CHECKING:
from .brevitas.configuration import BrevitasQuantizationConfig
from .brevitas.configuration import AutoQuantizationConfig, BrevitasQuantizationConfig
from .brevitas.quantizer import BrevitasQuantizer
else:
import sys
Expand Down
2 changes: 1 addition & 1 deletion optimum/amd/brevitas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
# Licensed under the MIT License.

from .configuration import BrevitasQuantizationConfig
from .configuration import AutoQuantizationConfig, BrevitasQuantizationConfig
from .data_utils import get_dataset_for_model
from .quantizer import BrevitasQuantizer
46 changes: 37 additions & 9 deletions optimum/amd/brevitas/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class BrevitasQuantizationConfig:
Bitwidth of the activations quantization.
weights_only (`bool`, defaults to `False`):
If set to `True`, only weights are to be quantized, otherwise activations are quantized as well.
weights_param_method (`str`, defaults to `stats`):
weights_calibration_method (`str`, defaults to `stats`):
Strategy to use to estimate the quantization parameters (scale, zero-point) for the weights. Two strategies are available:
- `"stats"`: Use min-max to estimate the range to quantize on.
- `"mse"`: Use mean-square error between the unquantized weights and quantized weights to estimate the range to quantize on.
Expand All @@ -34,7 +34,7 @@ class BrevitasQuantizationConfig:
Group size to use for the weights in case `weights_quant_granularity="per_group"`. Defaults to `128` in this case, to `None` otherwise.
quantize_zero_point (`bool`, defaults to `True`):
When set to True, the unquantized value 0.0 is exactly representable as a quantized value: the zero point. When set to False, a quantization range [a, b] is exactly representable (no rounding on a and b), but the unquantized value zero is not exactly representable.
activations_param_method (`List[str]`):
activations_calibration_method (`Optional[str]`, defaults to `stats`):
Strategy to use to estimate the quantization parameters (scale, zero-point) for the activations. Two strategies are available:
- `"stats"`: Use min-max to estimate the range to quantize on.
- `"mse"`: Use mean-square error between the unquantized activations and quantized activations to estimate the range to quantize on.
Expand Down Expand Up @@ -64,18 +64,18 @@ class BrevitasQuantizationConfig:
weights_bitwidth: int = 8
activations_bitwidth: Optional[int] = 8
weights_only: bool = False
weights_param_method: Literal["stats", "mse"] = "stats"
weights_calibration_method: Literal["stats", "mse"] = "stats"
weights_symmetric: bool = True
scale_precision: Literal["float_scale", "power_of_two_scale"] = "float_scale"
weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_tensor"
weights_quant_granularity: Literal["per_tensor", "per_channel", "per_group"] = "per_channel"
weights_group_size: Optional[int] = None
quantize_zero_point: bool = True
activations_param_method: Optional[Literal["stats", "mse"]] = "stats"
activations_calibration_method: Optional[Literal["stats", "mse"]] = "stats"
is_static: bool = False
activations_symmetric: Optional[bool] = False
activations_quant_granularity: Optional[Literal["per_tensor", "per_row", "per_group"]] = "per_tensor"
activations_group_size: Optional[int] = None
activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = "cross_layer"
Comment thread
Giuseppe5 marked this conversation as resolved.
activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = None
apply_weight_equalization: bool = False
apply_bias_correction: bool = False
apply_gptq: bool = False
Expand All @@ -99,9 +99,9 @@ def __post_init__(self):
f'Static quantization with activations_quant_granularity="{self.activations_quant_granularity}" is not supported. The quantization granularity must be activations_quant_granularity="per_tensor" when using static quantization.'
)

if self.weights_quant_granularity == "per_group" and self.weights_param_method == "mse":
if self.weights_quant_granularity == "per_group" and self.weights_calibration_method == "mse":
raise ValueError(
'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_param_method="mse"`. Per group MSE weight quantization is not supported.'
'The quantization configuration `weights_quant_granularity="per_group"` is not supported along `weights_calibration_method="mse"`. Per group MSE weight quantization is not supported.'
)

if self.scale_precision == "power_of_two_scale" and (
Expand All @@ -126,12 +126,40 @@ def __post_init__(self):
'The quantization configuration `scale_precision="power_of_two_scale"` is not supported along `is_static=False`. Dynamic activation quantization with power-of-two scale factor is not supported.'
)

if self.activations_calibration_method == "mse" and self.is_static:
raise ValueError(
'The quantization configuration `activations_calibration_method="mse"` is not supported along `is_static=True`. Dynamic activation quantization with mse calibration is not supported.'
)

if self.weights_only:
self.activations_bitwidth = None
self.activations_symmetric = None
self.activations_equalization = None
self.activations_group_size = None
self.activations_param_method = None
self.activations_calibration_method = None

def requires_fx_graph(self):
return self.activations_equalization == "cross_layer" or self.apply_weight_equalization


class AutoQuantizationConfig:
    """Namespace of factory helpers that return preset `BrevitasQuantizationConfig` objects."""

    @staticmethod
    def ipu_transformers_config(
        activations_equalization: Optional[Literal[None, "layerwise", "cross_layer"]] = None,
        apply_weight_equalization: bool = False,
        apply_bias_correction: bool = False,
        apply_gptq: bool = False,
        gptq_act_order: bool = False,
        gpu_device_map: Optional[Dict[int, float]] = None,
        cpu_device_map: Optional[Dict[int, float]] = None,
    ):
        """
        Build a `BrevitasQuantizationConfig` that pins `weights_quant_granularity="per_tensor"`.

        Every argument is forwarded unchanged to `BrevitasQuantizationConfig` under the same name.
        Any configuration field that is not a parameter of this helper is left at whatever
        default `BrevitasQuantizationConfig` itself defines.

        Args:
            activations_equalization (`Optional[str]`, defaults to `None`):
                One of `None`, `"layerwise"` or `"cross_layer"`; forwarded as-is.
            apply_weight_equalization (`bool`, defaults to `False`):
                Forwarded as-is.
            apply_bias_correction (`bool`, defaults to `False`):
                Forwarded as-is.
            apply_gptq (`bool`, defaults to `False`):
                Forwarded as-is.
            gptq_act_order (`bool`, defaults to `False`):
                Forwarded as-is.
            gpu_device_map (`Optional[Dict[int, float]]`, defaults to `None`):
                Forwarded as-is.
            cpu_device_map (`Optional[Dict[int, float]]`, defaults to `None`):
                Forwarded as-is.

        Returns:
            The `BrevitasQuantizationConfig` instance built from the options above.
        """
        # Options the caller may tune; they map one-to-one onto config fields.
        forwarded_options = {
            "activations_equalization": activations_equalization,
            "apply_weight_equalization": apply_weight_equalization,
            "apply_bias_correction": apply_bias_correction,
            "apply_gptq": apply_gptq,
            "gptq_act_order": gptq_act_order,
            "gpu_device_map": gpu_device_map,
            "cpu_device_map": cpu_device_map,
        }
        # The weight granularity is the one setting this preset fixes itself.
        return BrevitasQuantizationConfig(weights_quant_granularity="per_tensor", **forwarded_options)
4 changes: 2 additions & 2 deletions optimum/amd/brevitas/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,15 @@ def quantize(
weight_quant_format="int",
weight_quant_type="sym" if quantization_config.weights_symmetric else "asym",
weight_bit_width=quantization_config.weights_bitwidth,
weight_param_method=quantization_config.weights_param_method,
weight_param_method=quantization_config.weights_calibration_method,
weight_scale_precision=quantization_config.scale_precision,
weight_quant_granularity=quantization_config.weights_quant_granularity,
weight_group_size=quantization_config.weights_group_size,
quantize_weight_zero_point=quantization_config.quantize_zero_point,
input_bit_width=None if quantization_config.weights_only else quantization_config.activations_bitwidth,
input_quant_type="sym" if quantization_config.activations_symmetric else "asym",
input_quant_format="int",
input_param_method=quantization_config.activations_param_method,
input_param_method=quantization_config.activations_calibration_method,
input_scale_precision=quantization_config.scale_precision,
input_scale_type="static" if quantization_config.is_static else "dynamic",
input_quant_granularity=quantization_config.activations_quant_granularity,
Expand Down