diff --git a/.github/workflows/test_ryzenai_nightly.yaml b/.github/workflows/test_ryzenai_nightly.yaml index f573d1a1..cf1eb81d 100644 --- a/.github/workflows/test_ryzenai_nightly.yaml +++ b/.github/workflows/test_ryzenai_nightly.yaml @@ -26,7 +26,7 @@ jobs: slow_test: true timeout_minutes: 1200 secrets: - hf_hub_read_token: ${{ secrets.HF_READ_TOKEN }} + hf_hub_read_token: ${{ secrets.HF_HUB_READ_TOKEN }} send_results: name: Send results to webhook runs-on: ubuntu-22.04 diff --git a/.github/workflows/test_ryzenai_quantization_timm.yaml b/.github/workflows/test_ryzenai_quantization_timm.yaml index bc313660..d8900331 100644 --- a/.github/workflows/test_ryzenai_quantization_timm.yaml +++ b/.github/workflows/test_ryzenai_quantization_timm.yaml @@ -27,4 +27,4 @@ jobs: with: test_file: "tests/ryzenai/test_quantization.py" secrets: - hf_hub_read_token: ${{ secrets.HF_READ_TOKEN }} + hf_hub_read_token: ${{ secrets.HF_HUB_READ_TOKEN }} diff --git a/docs/source/ryzenai/package_reference/quantization.mdx b/docs/source/ryzenai/package_reference/quantization.mdx index 8dcd8b45..fcda9e88 100644 --- a/docs/source/ryzenai/package_reference/quantization.mdx +++ b/docs/source/ryzenai/package_reference/quantization.mdx @@ -15,3 +15,21 @@ Please refer to the guide [How to apply quantization](https://huggingface.co/doc ### QuantizationConfig [[autodoc]] ryzenai.QuantizationConfig + +### CalibrationMethod + +[[autodoc]] ryzenai.CalibrationMethod + +### QuantFormat + +[[autodoc]] ryzenai.QuantFormat + +### QuantType + +[[autodoc]] ryzenai.QuantType + +### ExtraOptions + +[[autodoc]] ryzenai.ExtraOptions + + diff --git a/optimum/amd/ryzenai/__init__.py b/optimum/amd/ryzenai/__init__.py index 59569806..9e9f6966 100644 --- a/optimum/amd/ryzenai/__init__.py +++ b/optimum/amd/ryzenai/__init__.py @@ -7,7 +7,16 @@ _import_structure = { - "configuration": ["RyzenAIConfig", "QuantizationConfig", "AutoQuantizationConfig"], + "configuration": [ + "AutoQuantizationConfig", + "CalibrationMethod", + 
"ExtraOptions", + "QuantFormat", + "QuantizationConfig", + "QuantType", + "AutoQuantizationConfig", + "RyzenAIConfig", + ], "modeling": [ "RyzenAIModel", "RyzenAIModelForCustomTasks", @@ -24,7 +33,15 @@ # Direct imports for type-checking if TYPE_CHECKING: - from .configuration import AutoQuantizationConfig, QuantizationConfig, RyzenAIConfig + from .configuration import ( + AutoQuantizationConfig, + CalibrationMethod, + ExtraOptions, + QuantFormat, + QuantizationConfig, + QuantType, + RyzenAIConfig, + ) from .modeling import ( RyzenAIModel, RyzenAIModelForCustomTasks, diff --git a/optimum/amd/ryzenai/configuration.py b/optimum/amd/ryzenai/configuration.py index c043d473..51efca39 100644 --- a/optimum/amd/ryzenai/configuration.py +++ b/optimum/amd/ryzenai/configuration.py @@ -2,16 +2,266 @@ # Licensed under the MIT License. """Configuration classes for quantization with RyzenAI.""" -from dataclasses import asdict, dataclass +import re +from dataclasses import asdict, dataclass, field, fields from enum import Enum -from typing import Optional +from typing import Dict, List, Literal, Optional, Tuple, Union import vai_q_onnx -from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType from optimum.configuration_utils import BaseConfig +QUANT_TYPE_MAPPING = { + "uint8": vai_q_onnx.QuantType.QUInt8, + "int8": vai_q_onnx.QuantType.QInt8, + "uint16": vai_q_onnx.VitisQuantType.QUInt16, + "int16": vai_q_onnx.VitisQuantType.QInt16, + "uint32": vai_q_onnx.VitisQuantType.QUInt32, + "int32": vai_q_onnx.VitisQuantType.QInt32, + "float16": vai_q_onnx.VitisQuantType.QFloat16, + "bfloat16": vai_q_onnx.VitisQuantType.QBFloat16, +} + +QUANT_FORMAT_MAPPING = { + "qop": vai_q_onnx.QuantFormat.QOperator, + "qdq": vai_q_onnx.QuantFormat.QDQ, + "vitisqdq": vai_q_onnx.VitisQuantFormat.QDQ, + "vitisfixneuron": vai_q_onnx.VitisQuantFormat.FixNeuron, +} + +CALIBRATION_METHOD_MAPPING = { + "minmax": vai_q_onnx.CalibrationMethod.MinMax, + "entropy": 
vai_q_onnx.CalibrationMethod.Entropy, + "percentile": vai_q_onnx.CalibrationMethod.Percentile, + "nonoverflow": vai_q_onnx.PowerOfTwoMethod.NonOverflow, + "mse": vai_q_onnx.PowerOfTwoMethod.MinMSE, +} + + +class CalibrationMethod(Enum): + """CalibrationMethod is an enumeration of the calibration methods supported by RyzenAI quantization.""" + + MinMax = vai_q_onnx.CalibrationMethod.MinMax + Entropy = vai_q_onnx.CalibrationMethod.Entropy + Percentile = vai_q_onnx.CalibrationMethod.Percentile + NonOverflow = vai_q_onnx.PowerOfTwoMethod.NonOverflow + MinMSE = vai_q_onnx.PowerOfTwoMethod.MinMSE + + +@dataclass +class ExtraOptions: + """ + ExtraOptions is a dataclass handling additional options for quantization. + + Args: + activation_symmetric (`bool`, defaults to `False`): + If True, symmetrize calibration data for activations. + weight_symmetric (`bool`, defaults to `True`): + If True, symmetrize calibration data for weights. + use_unsigned_relu (`bool`, defaults to `False`): + If True, the output tensor of ReLU and Clip, whose min is 0, will be forced to be asymmetric. + quantize_bias (`bool`, defaults to `True`): + If True, quantize the Bias as normal weights. + remove_input_init (`bool`, defaults to `True`): + If True, initializer in graph inputs will be removed because it will not be treated as a constant value/weight. + This may prevent some of the graph optimizations, like const folding. + enable_subgraph (`bool`, defaults to `False`): + If True, the subgraph will be quantized. More support for this feature is planned in the future. + force_quantize_no_input_check (`bool`, defaults to `False`): + If True, latent operators such as maxpool and transpose will always quantize their inputs, generating quantized + outputs even if their inputs have not been quantized. + matmul_const_b_only (`bool`, defaults to `False`): + If True, only MatMul operations with a constant 'B' will be quantized. 
+ add_qdq_pair_to_weight (`bool`, defaults to `False`): + If True, both QuantizeLinear and DeQuantizeLinear nodes are inserted for weight, maintaining its floating-point format. + In the PowerOfTwoMethod calibration method, this setting will also be effective for the bias. + op_types_to_exclude_output_quantization (`List[str] or None`, defaults to `[]`): + If specified, the output of operators with these types will not be quantized. + dedicated_qdq_pair (`bool`, defaults to `False`): + If True, an identical and dedicated QDQ pair is created for each node. If False, multiple nodes may share a single QDQ pair + as their inputs. + qdq_op_type_per_channel_support_to_axis (`Dict`, defaults to `{}`): + Sets the channel axis for specific operator types (e.g., {'MatMul': 1}). + use_qdq_vitis_custom_ops (`bool`, defaults to `True`): + If True, the UInt8 and Int8 quantization will be executed by the custom operations library, otherwise by the library + of onnxruntime extensions. Only valid in vai_q_onnx.VitisQuantFormat.QDQ. + calib_tensor_range_symmetric (`bool`, defaults to `False`): + If True, the final range of the tensor during calibration will be symmetrically set around the central point "0". + In PowerOfTwoMethod calibration method, the default is True. + calib_moving_average (`bool`, defaults to `False`): + If True, the moving average of the minimum and maximum values will be computed when the calibration method selected is + MinMax. In PowerOfTwoMethod calibration method, this should be set to False. + calib_moving_average_constant (`float`, defaults to `0.01`): + Specifies the constant smoothing factor to use when computing the moving average of the minimum and maximum values. + Only effective when the calibration method selected is MinMax and CalibMovingAverage is set to True. + In PowerOfTwoMethod calibration method, this option is unsupported. 
+ random_data_reader_input_data_range (`Dict or None`, defaults to `None`): + Specifies the data range for each input if used random data reader (calibration_data_reader is None). + int16_scale (`bool`, defaults to `False`): + If True, the float scale will be replaced by the closest value corresponding to M and 2**N, where the range of M and 2**N + is within the representation range of int16 and uint16. + min_mse_mode (`str`, defaults to `'All'`): + When using vai_q_onnx.PowerOfTwoMethod.MinMSE, you can specify the method for calculating minmse. + By default, minmse is calculated using all calibration data. Alternatively, you can set the mode to "MostCommon", + where minmse is calculated for each batch separately and take the most common value. + convert_bn_to_conv (`bool`, defaults to `True`): + If True, the BatchNormalization operation will be converted to Conv operation when enable_ipu_cnn is True. + convert_reduce_mean_to_global_avg_pool (`bool`, defaults to `True`): + If True, the Reduce Mean operation will be converted to Global Average Pooling operation when enable_ipu_cnn is True. + split_large_kernel_pool (`bool`, defaults to `True`): + If True, the large kernel Global Average Pooling operation will be split into multiple Average Pooling operation when + enable_ipu_cnn is True. + convert_split_to_slice (`bool`, defaults to `True`): + If True, the Split operation will be converted to Slice operation when enable_ipu_cnn is True. + fuse_instance_norm (`bool`, defaults to `False`): + If True, the split instance norm operation will be fused to InstanceNorm operation when enable_ipu_cnn is True. + fuse_l2_norm (`bool`, defaults to `False`): + If True, a set of L2norm operations will be fused to L2Norm operation when enable_ipu_cnn is True. + convert_clip_to_relu (`bool`, defaults to `False`): + If True, the Clip operations that have a min value of 0 will be converted to ReLU operations. 
+ simulate_dpu (`bool`, defaults to `True`): + If True, a simulation transformation that replaces some operations with an approximate implementation will be applied + for DPU when enable_ipu_cnn is True. + convert_leaky_relu_to_dpu_version (`bool`, defaults to `True`): + If True, the Leaky Relu operation will be converted to DPU version when SimulateDPU is True. + convert_sigmoid_to_hard_sigmoid (`bool`, defaults to `True`): + If True, the Sigmoid operation will be converted to Hard Sigmoid operation when SimulateDPU is True. + convert_hard_sigmoid_to_dpu_version (`bool`, defaults to `True`): + If True, the Hard Sigmoid operation will be converted to DPU version when SimulateDPU is True. + convert_avg_pool_to_dpu_version (`bool`, defaults to `True`): + If True, the global or kernel-based Average Pooling operation will be converted to DPU version when SimulateDPU is True. + convert_reduce_mean_to_dpu_version (`bool`, defaults to `True`): + If True, the ReduceMean operation will be converted to DPU version when SimulateDPU is True. + convert_softmax_to_dpu_version (`bool`, defaults to `False`): + If True, the Softmax operation will be converted to DPU version when SimulateDPU is True. + ipu_limitation_check (`bool`, defaults to `True`): + If True, the quantization scale will be adjusted due to the limitation of DPU/NPU. + adjust_shift_cut (`bool`, defaults to `True`): + If True, adjust the shift cut of nodes when ipu_limitation_check is True. + adjust_shift_bias (`bool`, defaults to `True`): + If True, adjust the shift bias of nodes when ipu_limitation_check is True. + adjust_shift_read (`bool`, defaults to `True`): + If True, adjust the shift read of nodes when ipu_limitation_check is True. + adjust_shift_write (`bool`, defaults to `True`): + If True, adjust the shift write of nodes when ipu_limitation_check is True. + adjust_hard_sigmoid (`bool`, defaults to `True`): + If True, adjust the pos of hard sigmoid nodes when ipu_limitation_check is True. 
+ adjust_shift_swish (`bool`, defaults to `True`): + If True, adjust the shift swish when ipu_limitation_check is True. + align_concat (`bool`, defaults to `True`): + If True, adjust the quantization pos of concat when ipu_limitation_check is True. + align_pool (`bool`, defaults to `True`): + If True, adjust the quantization pos of pooling when ipu_limitation_check is True. + replace_clip6_relu (`bool`, defaults to `False`): + If True, replace Clip(0,6) with Relu in the model. + cle_steps (`int`, defaults to `1`): + Specifies the steps for CrossLayerEqualization execution when include_cle is set to true. When set to -1, + an adaptive CrossLayerEqualization will be conducted. + cle_total_layer_diff_threshold (`float`, defaults to `2e-7`): + Specifies the threshold on the sum of the mean CrossLayerEqualization transformations across + all layers when utilizing CrossLayerEqualization. + cle_scale_append_bias (`bool`, defaults to `True`): + Whether the bias is included when calculating the scale of the weights. + remove_qdq_conv_leaky_relu (`bool`, defaults to `False`): + If True, the QDQ between Conv and LeakyRelu will be removed for DPU when enable_ipu_cnn is True. + remove_qdq_conv_prelu (`bool`, defaults to `False`): + If True, the QDQ between Conv and PRelu will be removed for DPU when enable_ipu_cnn is True. 
+ """ + + activation_symmetric: bool = False + weight_symmetric: bool = True + use_unsigned_relu: bool = False + quantize_bias: bool = True + remove_input_init: bool = True + enable_subgraph: bool = False + force_quantize_no_input_check: bool = False + matmul_const_b_only: bool = False + add_qdq_pair_to_weight: bool = False + op_types_to_exclude_output_quantization: Union[List[str], None] = field(default_factory=list) + dedicated_qdq_pair: bool = False + qdq_op_type_per_channel_support_to_axis: Dict = field(default_factory=dict) + use_qdq_vitis_custom_ops: bool = True + calib_tensor_range_symmetric: bool = False + calib_moving_average: bool = False + calib_moving_average_constant: float = 0.01 + random_data_reader_input_data_range: Union[Dict, None] = None + int16_scale: bool = False + min_mse_mode: str = "All" + convert_bn_to_conv: bool = True + convert_reduce_mean_to_global_avg_pool: bool = True + split_large_kernel_pool: bool = True + convert_split_to_slice: bool = True + fuse_instance_norm: bool = False + fuse_l2_norm: bool = False + convert_clip_to_relu: bool = False + simulate_dpu: bool = True + convert_leaky_relu_to_dpu_version: bool = True + convert_sigmoid_to_hard_sigmoid: bool = True + convert_hard_sigmoid_to_dpu_version: bool = True + convert_avg_pool_to_dpu_version: bool = True + convert_reduce_mean_to_dpu_version: bool = True + convert_softmax_to_dpu_version: bool = False + ipu_limitation_check: bool = True + adjust_shift_cut: bool = True + adjust_shift_bias: bool = True + adjust_shift_read: bool = True + adjust_shift_write: bool = True + adjust_hard_sigmoid: bool = True + adjust_shift_swish: bool = True + align_concat: bool = True + align_pool: bool = True + replace_clip6_relu: bool = False + cle_steps: int = 1 + cle_total_layer_diff_threshold: float = 2e-7 + cle_scale_append_bias: bool = True + remove_qdq_conv_leaky_relu: bool = False + remove_qdq_conv_prelu: bool = False + + @property + def snake_to_camel(self): + return { + 
"qdq_op_type_per_channel_support_to_axis": "QDQOpTypePerChannelSupportToAxis", + "ipu_limitation_check": "IPULimitationCheck", + "cle_steps": "CLESteps", + "cle_total_layer_diff_threshold": "CLETotalLayerDiffThreshold", + "cle_scale_append_bias": "CLEScaleAppendBias", + } + + @property + def camel_to_snake(self): + return {value: key for key, value in self.snake_to_camel.items()} + + def __setattr__(self, name, value): + snake_case_name = self.camel_to_snake.get(name, re.sub(r"([A-Z])", r"_\1", name).lower().lstrip("_")) + + super().__setattr__(snake_case_name, value) + + def __getattr__(self, name): + snake_case_name = self.camel_to_snake.get(name, re.sub(r"([A-Z])", r"_\1", name).lower().lstrip("_")) + return super().__getattribute__(snake_case_name)  # bypass __getattr__: a missing name raises AttributeError instead of recursing + + def to_diff_dict(self, camel_case=False) -> dict: + """ + Returns a dictionary of non-default values in the configuration. + """ + non_default_values = {} + for option in fields(self): + if camel_case: + name = self.snake_to_camel.get( + option.name, "".join(word.capitalize() for word in option.name.split("_")) + ) + else: + name = option.name + if ( + getattr(self, option.name) != option.default + and getattr(self, option.name) != {} + and getattr(self, option.name) != [] + ): + non_default_values[name] = getattr(self, option.name) + return non_default_values + + + @dataclass class QuantizationConfig: """ @@ -20,82 +270,271 @@ class QuantizationConfig: Args: is_static (`bool`): Whether to apply static quantization or dynamic quantization. - format (`QuantFormat`): - Targeted RyzenAI quantization representation format. - For the Operator Oriented (QOperator) format, all the quantized operators have their own ONNX definitions. - For the Tensor Oriented (QDQ) format, the model is quantized by inserting QuantizeLinear / DeQuantizeLinear - operators. - calibration_method (`CalibrationMethod`): - The method chosen to calculate the activations quantization parameters using the calibration dataset. 
- activations_dtype (`QuantType`, defaults to `QuantType.QUInt8`): - The quantization data types to use for the activations. - activations_symmetric (`bool`, defaults to `False`): - Whether to apply symmetric quantization on the activations. - weights_dtype (`QuantType`, defaults to `QuantType.QInt8`): - The quantization data types to use for the weights. - weights_symmetric (`bool`, defaults to `True`): - Whether to apply symmetric quantization on the weights. - enable_dpu (`bool`, defaults to `True`): - Determines whether to generate a quantized model that is suitable for the DPU. If set to True, the quantization - process will create a model that is optimized for DPU computations. - + format (`Union[QuantFormat, str]`, defaults to `QuantFormat.QDQ`): + This parameter is used to specify the quantization format of the model. + Options: + - `QuantFormat.QOperator`: Quantizes the model directly using quantized operators. + - `QuantFormat.QDQ`: Quantizes the model by inserting QuantizeLinear/DeQuantizeLinear into the tensor. + Supports 8-bit quantization only. + - `QuantFormat.VitisQuantFormat`: Quantizes the model by inserting VitisQuantizeLinear/VitisDequantizeLinear + into the tensor. Supports a wider range of bit-widths and precisions. + - `QuantFormat.FixNeuron` (Experimental): Quantizes the model by inserting FixNeuron (a combination of + QuantizeLinear and DeQuantizeLinear) into the tensor. Experimental and not recommended for deployment. + calibration_method (`Union[CalibrationMethod, str]`, defaults to `CalibrationMethod.MinMSE`): + The method used in calibration. + Options (for CNNs running on NPU, power-of-two methods; for Transformers on NPU or CNNs on CPU, float scale methods): + - `CalibrationMethod.NonOverflow`: Power-of-two method to prevent min/max values from overflowing. + - `CalibrationMethod.MinMSE`: Power-of-two method to minimize mean-square-loss of quantized values and float values. + Longer calibration time but usually better accuracy. 
+ - `CalibrationMethod.MinMax`: Obtain quantization parameters based on minimum and maximum values of each tensor. + - `CalibrationMethod.Entropy`: Determine quantization parameters based on the entropy algorithm of each tensor's distribution. + - `CalibrationMethod.Percentile`: Calculate quantization parameters using percentiles of tensor values. + enable_ipu_cnn (`bool`, defaults to `False`): + Flag to generate a quantized model suitable for DPU/NPU computations. If True, the quantization process will + consider specific limitations and requirements of DPU/NPU, optimizing the model accordingly. + input_nodes (`List[str]`, defaults to an empty list `[]`): + List of names of starting nodes to be quantized. Nodes before these nodes will not be quantized. + output_nodes (`List[str]`, defaults to an empty list `[]`): + List of names of end nodes to be quantized. Nodes after these nodes will not be quantized. + op_types_to_quantize (`List[str]`, defaults to an empty list `[]`): + If specified, only operators of the given types will be quantized (e.g., ['Conv'] to quantize Convolutional layers). + random_data_reader_input_shape (`Union[List[int], Tuple[int], Dict[str, List[int]]]`, defaults to an empty list `[]`): + Shapes of input nodes for internal random data reader. If dynamic axes require specific values, provide shapes. + Format: list/tuple for single input, list of lists for multiple inputs, or dict {name: shape} for named inputs. + per_channel (`bool`, defaults to `False`): + Determines whether weights should be quantized per channel. Must be False for DPU/NPU devices. + reduce_range (`bool`, defaults to `False`): + If True, quantizes weights with 7-bits. Must be False for DPU/NPU devices. + activations_dtype (`QuantType`, defaults to `"uint8"`): + Specifies the quantization data type for activations. + weights_dtype (`QuantType`, defaults to `"int8"`): + Specifies the quantization data type for weights. Must be `QuantType.QInt8` for NPU devices. 
+ nodes_to_quantize (`List[str]`, defaults to an empty list `[]`): + If specified, only the nodes in this list are quantized. + nodes_to_exclude (`List[str]`, defaults to an empty list `[]`): + If specified, nodes in this list will be excluded from quantization. + optimize_model (`bool`, defaults to `True`): + If True, optimizes the model before quantization. + use_external_data_format (`bool`, defaults to `False`): + Flag for large size (>2GB) models. If True, model proto and data will be stored in separate files. + execution_providers (`List[str]`, defaults to `['CPUExecutionProvider']`): + Defines the execution providers used by ONNX Runtime for model calibration. + convert_fp16_to_fp32 (`bool`, defaults to `False`): + Controls whether to convert the input model from float16 to float32 before quantization. + convert_nchw_to_nhwc (`bool`, defaults to `False`): + Controls whether to convert the input NCHW model to NHWC model before quantization. + include_cle (`bool`, defaults to `False`): + Flag to optimize models using CrossLayerEqualization; can improve accuracy for some models. + extra_options (`Union[Dict, None, ExtraOptions]`, defaults to an instance of `ExtraOptions` with default values): + Contains key-value pairs for various options in different cases. 
""" - format: QuantFormat = QuantFormat.QDQ - calibration_method: CalibrationMethod = vai_q_onnx.PowerOfTwoMethod.MinMSE - activations_dtype: QuantType = QuantType.QUInt8 - activations_symmetric: bool = True - weights_dtype: QuantType = QuantType.QInt8 - weights_symmetric: bool = True - enable_dpu: bool = True + format: Literal["qdq", "qop", "vitisqdq"] = "qdq" + calibration_method: Literal["nonoverflow", "mse", "minmax", "entropy", "percentile"] = "mse" + input_nodes: List[str] = field(default_factory=list) + output_nodes: List[str] = field(default_factory=list) + op_types_to_quantize: List[str] = field(default_factory=list) + random_data_reader_input_shape: Union[List[int], Tuple[int], Dict[str, List[int]]] = field(default_factory=list) + per_channel: bool = False + reduce_range: bool = False + activations_dtype: Literal["uint8", "int8", "uint16", "int16", "uint32", "int32", "bfloat16", "float16"] = "uint8" + weights_dtype: Literal["uint8", "int8", "uint16", "int16", "uint32", "int32", "bfloat16", "float16"] = "int8" + nodes_to_quantize: List[str] = field(default_factory=list) + nodes_to_exclude: List[str] = field(default_factory=list) + optimize_model: bool = True + use_external_data_format: bool = False + execution_providers: List[str] = field(default_factory=lambda: ["CPUExecutionProvider"]) + enable_ipu_cnn: bool = False + convert_fp16_to_fp32: bool = False + convert_nchw_to_nhwc: bool = False + include_cle: bool = False + extra_options: ExtraOptions = field(default_factory=ExtraOptions) - @staticmethod - def quantization_type_str(activations_dtype: QuantType, weights_dtype: QuantType) -> str: - return ( - f"{'s8' if activations_dtype == QuantType.QInt8 else 'u8'}" - f"/" - f"{'s8' if weights_dtype == QuantType.QInt8 else 'u8'}" - ) + def __post_init__(self): + if isinstance(self.extra_options, dict): + self.extra_options = ExtraOptions(**self.extra_options) + + if self.calibration_method in {vai_q_onnx.PowerOfTwoMethod.NonOverflow, 
vai_q_onnx.PowerOfTwoMethod.MinMSE}: + self.extra_options.calib_tensor_range_symmetric = True + + if ( + self.activations_dtype not in {vai_q_onnx.QuantType.QUInt8, vai_q_onnx.QuantType.QInt8} + and self.format != vai_q_onnx.VitisQuantFormat.QDQ + ): + raise ValueError( + f'activations_dtype is: "{self.activations_dtype.name.lower()}", format must be "vitisqdq".' + ) + if ( + self.weights_dtype not in {vai_q_onnx.QuantType.QUInt8, vai_q_onnx.QuantType.QInt8} + and self.format != vai_q_onnx.VitisQuantFormat.QDQ + ): + raise ValueError(f'weights_dtype is: "{self.weights_dtype.name.lower()}", format must be "vitisqdq".') + + if self.enable_ipu_cnn: + if self.format != vai_q_onnx.QuantFormat.QDQ: + raise ValueError(f'ipu cnn configuration only support format "qdq". Got {self.format}.') + + if self.calibration_method not in { + vai_q_onnx.PowerOfTwoMethod.NonOverflow, + vai_q_onnx.PowerOfTwoMethod.MinMSE, + }: + raise ValueError( + f'ipu cnn configuration only support calibration_method "nonoverflow" and "mse". Got {self.calibration_method.name.lower()}.' + ) + + if not (self.extra_options.activation_symmetric and self.extra_options.weight_symmetric): + raise ValueError( + "ipu cnn configuration requires setting activation_symmetric and weight_symmetric to true." + ) + + if self.weights_dtype != vai_q_onnx.QuantType.QInt8: + raise ValueError( + f'ipu cnn configuration only support weights_dtype "int8". Got {self.weights_dtype.name.lower()}.' + ) + + if self.per_channel: + raise ValueError("ipu cnn configuration only supports per tensor. 
Got per_channel=True.") + + def __setattr__(self, name, value): + if name == "extra_options" and isinstance(value, dict): + setattr(self, "extra_options", ExtraOptions(**value)) + else: + super().__setattr__(name, value) + + def __getattribute__(self, name: str): + value = super().__getattribute__(name) + if isinstance(value, str): + if name == "format": + value = QUANT_FORMAT_MAPPING[value] + elif name == "calibration_method": + value = CALIBRATION_METHOD_MAPPING[value] + elif name == "activations_dtype": + value = QUANT_TYPE_MAPPING[value] + elif name == "weights_dtype": + value = QUANT_TYPE_MAPPING[value] + + return value + + def to_dict(self): + options_dict = self.__dict__.copy() + options_dict["extra_options"] = options_dict["extra_options"].to_diff_dict() + return options_dict + + def to_diff_dict(self) -> dict: + """ + Returns a dictionary of non-default values in the configuration. + """ + non_default_values = {} + for option in fields(self): + if option.name == "extra_options": + extra_options_dict = getattr(self, option.name).to_diff_dict() + if extra_options_dict: + non_default_values[option.name] = extra_options_dict + else: + value = getattr(self, option.name) + + if value != option.default and value not in ({}, []): + if option.name == "execution_providers" and value == ["CPUExecutionProvider"]: + continue + + if isinstance(value, Enum): + value = value.name + elif isinstance(value, list): + value = [elem.name if isinstance(elem, Enum) else elem for elem in value] + + non_default_values[option.name] = value + return non_default_values @property def use_symmetric_calibration(self) -> bool: - return self.activations_symmetric and self.weights_symmetric + if self.extra_options: + return self.extra_options.activation_symmetric and self.extra_options.weight_symmetric + + return ExtraOptions().activation_symmetric and ExtraOptions().weight_symmetric def __str__(self): return ( f"{self.format} (" - f"schema: 
{QuantizationConfig.quantization_type_str(self.activations_dtype, self.weights_dtype)}, " - f"enable_dpu: {self.enable_dpu})" + f"schema: {self.activations_dtype.name}/{self.weights_dtype.name}, " + f"enable_ipu_cnn: {self.enable_ipu_cnn})" ) class AutoQuantizationConfig: @staticmethod - def ipu_cnn_config(): + def ipu_cnn_config( + calibrate_method: Literal["nonoverflow", "mse", "minmax", "entropy", "percentile"] = "mse", + nodes_to_quantize: Optional[List[str]] = None,  # None, not [], so calls never share one default list + nodes_to_exclude: Optional[List[str]] = None, + op_types_to_quantize: Optional[List[str]] = None, + extra_options: Optional[Union[Dict[str, bool], ExtraOptions]] = None, + ): + extra_options = extra_options or {} + if isinstance(extra_options, dict): + extra_options = ExtraOptions(**extra_options) + + extra_options_dict = extra_options.to_diff_dict() + extra_options_dict["activation_symmetric"] = extra_options_dict.get("activation_symmetric", True) + return QuantizationConfig( - format=QuantFormat.QDQ, - calibration_method=vai_q_onnx.PowerOfTwoMethod.MinMSE, - activations_dtype=QuantType.QUInt8, - activations_symmetric=True, - weights_dtype=QuantType.QInt8, - weights_symmetric=True, - enable_dpu=True, + format="qdq", + calibration_method=calibrate_method, + activations_dtype="uint8", + weights_dtype="int8", + enable_ipu_cnn=True, + op_types_to_quantize=op_types_to_quantize or [], + nodes_to_quantize=nodes_to_quantize or [], + nodes_to_exclude=nodes_to_exclude or [], + extra_options=ExtraOptions(**extra_options_dict), + ) + + @staticmethod + def ipu_transformer_config( + calibrate_method: Literal["nonoverflow", "mse", "minmax", "entropy", "percentile"] = "minmax", + nodes_to_quantize: Optional[List[str]] = None,  # None, not [], so calls never share one default list + nodes_to_exclude: Optional[List[str]] = None, + op_types_to_quantize: Optional[List[str]] = None, + extra_options: Optional[Union[Dict[str, bool], ExtraOptions]] = None, + ): + extra_options = extra_options or {} + if isinstance(extra_options, dict): + extra_options = ExtraOptions(**extra_options) + + extra_options_dict = extra_options.to_diff_dict() + 
extra_options_dict["activation_symmetric"] = extra_options_dict.get("activation_symmetric", True) + + return QuantizationConfig( + format="qdq", + calibration_method=calibrate_method, + activations_dtype="int8", + weights_dtype="int8", + op_types_to_quantize=op_types_to_quantize or [], + nodes_to_quantize=nodes_to_quantize or [], + nodes_to_exclude=nodes_to_exclude or [], + extra_options=ExtraOptions(**extra_options_dict), ) @staticmethod def cpu_cnn_config( - use_symmetric_activations: bool = False, - use_symmetric_weights: bool = True, - enable_dpu: bool = False, + calibrate_method: Literal["nonoverflow", "mse", "minmax", "entropy", "percentile"] = "minmax", + nodes_to_quantize: Optional[List[str]] = None,  # None, not [], so calls never share one default list + nodes_to_exclude: Optional[List[str]] = None, + op_types_to_quantize: Optional[List[str]] = None, + extra_options: Optional[Union[Dict[str, bool], ExtraOptions]] = None, ): + extra_options = extra_options or {} + if isinstance(extra_options, dict): + extra_options = ExtraOptions(**extra_options) + return QuantizationConfig( - format=QuantFormat.QDQ, - calibration_method=vai_q_onnx.CalibrationMethod.MinMax, - activations_dtype=QuantType.QUInt8, - activations_symmetric=use_symmetric_activations, - weights_dtype=QuantType.QInt8, - weights_symmetric=use_symmetric_weights, - enable_dpu=enable_dpu, + format="qdq", + calibration_method=calibrate_method, + activations_dtype="uint8", + weights_dtype="int8", + op_types_to_quantize=op_types_to_quantize or [], + nodes_to_quantize=nodes_to_quantize or [], + nodes_to_exclude=nodes_to_exclude or [], + extra_options=extra_options, ) @@ -122,7 +561,7 @@ def __init__( ): super().__init__() self.opset = opset - self.quantization = self.dataclass_to_dict(quantization) + self.quantization = quantization.to_dict() if quantization is not None else None self.optimum_version = kwargs.pop("optimum_version", None) @staticmethod diff --git a/optimum/amd/ryzenai/quantization.py b/optimum/amd/ryzenai/quantization.py index 28fc5621..92aadaf5 100644 --- a/optimum/amd/ryzenai/quantization.py +++ 
b/optimum/amd/ryzenai/quantization.py @@ -163,19 +163,35 @@ def quantize( quantized_model_path = save_dir.joinpath(f"{self.onnx_model_path.stem}{suffix}").with_suffix(".onnx") LOGGER.info("Quantizing model...") + quantize_static( model_input=Path(self.onnx_model_path).as_posix(), model_output=quantized_model_path.as_posix(), calibration_data_reader=reader, quant_format=quantization_config.format, calibrate_method=quantization_config.calibration_method, - weight_type=quantization_config.weights_dtype, + input_nodes=quantization_config.input_nodes, + output_nodes=quantization_config.output_nodes, + op_types_to_quantize=quantization_config.op_types_to_quantize, + random_data_reader_input_shape=quantization_config.random_data_reader_input_shape, + per_channel=quantization_config.per_channel, + reduce_range=quantization_config.reduce_range, activation_type=quantization_config.activations_dtype, - enable_dpu=quantization_config.enable_dpu, - extra_options={ - "WeightSymmetric": quantization_config.weights_symmetric, - "ActivationSymmetric": quantization_config.activations_symmetric, - }, + weight_type=quantization_config.weights_dtype, + nodes_to_quantize=quantization_config.nodes_to_quantize, + nodes_to_exclude=quantization_config.nodes_to_exclude, + optimize_model=quantization_config.optimize_model, + use_external_data_format=quantization_config.use_external_data_format, + execution_providers=quantization_config.execution_providers, + enable_ipu_cnn=quantization_config.enable_ipu_cnn, + convert_fp16_to_fp32=quantization_config.convert_fp16_to_fp32, + convert_nchw_to_nhwc=quantization_config.convert_nchw_to_nhwc, + include_cle=quantization_config.include_cle, + extra_options=( + quantization_config.extra_options.to_diff_dict(camel_case=True) + if quantization_config.extra_options + else {} + ), ) LOGGER.info(f"Saved quantized model at: {save_dir}") diff --git a/tests/ryzenai/test_configuration.py b/tests/ryzenai/test_configuration.py new file mode 100644 index 
00000000..29382ceb --- /dev/null +++ b/tests/ryzenai/test_configuration.py @@ -0,0 +1,169 @@ +import unittest + +import vai_q_onnx +from parameterized import parameterized + +from optimum.amd.ryzenai import AutoQuantizationConfig, ExtraOptions, QuantizationConfig + + +class TestExtraOptions(unittest.TestCase): + def test_default_values(self): + options = ExtraOptions() + self.assertEqual(options.activation_symmetric, False) + self.assertEqual(options.weight_symmetric, True) + self.assertEqual(options.use_unsigned_relu, False) + self.assertEqual(options.quantize_bias, True) + self.assertEqual(options.remove_input_init, True) + self.assertEqual(options.enable_subgraph, False) + self.assertEqual(options.force_quantize_no_input_check, False) + self.assertEqual(options.matmul_const_b_only, False) + self.assertEqual(options.add_qdq_pair_to_weight, False) + self.assertEqual(options.op_types_to_exclude_output_quantization, []) + self.assertEqual(options.dedicated_qdq_pair, False) + self.assertEqual(options.qdq_op_type_per_channel_support_to_axis, {}) + self.assertEqual(options.use_qdq_vitis_custom_ops, True) + self.assertEqual(options.calib_tensor_range_symmetric, False) + self.assertEqual(options.calib_moving_average, False) + self.assertEqual(options.calib_moving_average_constant, 0.01) + self.assertEqual(options.random_data_reader_input_data_range, None) + self.assertEqual(options.int16_scale, False) + self.assertEqual(options.min_mse_mode, "All") + self.assertEqual(options.convert_bn_to_conv, True) + self.assertEqual(options.convert_reduce_mean_to_global_avg_pool, True) + self.assertEqual(options.split_large_kernel_pool, True) + self.assertEqual(options.convert_split_to_slice, True) + self.assertEqual(options.fuse_instance_norm, False) + self.assertEqual(options.fuse_l2_norm, False) + self.assertEqual(options.convert_clip_to_relu, False) + self.assertEqual(options.simulate_dpu, True) + self.assertEqual(options.convert_leaky_relu_to_dpu_version, True) + 
self.assertEqual(options.convert_sigmoid_to_hard_sigmoid, True) + self.assertEqual(options.convert_hard_sigmoid_to_dpu_version, True) + self.assertEqual(options.convert_avg_pool_to_dpu_version, True) + self.assertEqual(options.convert_reduce_mean_to_dpu_version, True) + self.assertEqual(options.convert_softmax_to_dpu_version, False) + self.assertEqual(options.ipu_limitation_check, True) + self.assertEqual(options.adjust_shift_cut, True) + self.assertEqual(options.adjust_shift_bias, True) + self.assertEqual(options.adjust_shift_read, True) + self.assertEqual(options.adjust_shift_write, True) + self.assertEqual(options.adjust_hard_sigmoid, True) + self.assertEqual(options.adjust_shift_swish, True) + self.assertEqual(options.align_concat, True) + self.assertEqual(options.align_pool, True) + self.assertEqual(options.replace_clip6_relu, False) + self.assertEqual(options.cle_steps, 1) + self.assertEqual(options.cle_total_layer_diff_threshold, 2e-7) + self.assertEqual(options.cle_scale_append_bias, True) + self.assertEqual(options.remove_qdq_conv_leaky_relu, False) + self.assertEqual(options.remove_qdq_conv_prelu, False) + + def test_snake_to_camel(self): + options = ExtraOptions() + camel_case_dict = options.snake_to_camel + self.assertEqual( + camel_case_dict["qdq_op_type_per_channel_support_to_axis"], "QDQOpTypePerChannelSupportToAxis" + ) + self.assertEqual(camel_case_dict["ipu_limitation_check"], "IPULimitationCheck") + self.assertEqual(camel_case_dict["cle_steps"], "CLESteps") + self.assertEqual(camel_case_dict["cle_total_layer_diff_threshold"], "CLETotalLayerDiffThreshold") + self.assertEqual(camel_case_dict["cle_scale_append_bias"], "CLEScaleAppendBias") + + def test_camel_to_snake_setattr(self): + options = ExtraOptions() + options.QDQOpTypePerChannelSupportToAxis = "some_value" + options.IPULimitationCheck = False + options.CLESteps = 5 + options.CLETotalLayerDiffThreshold = 1e-7 + options.CLEScaleAppendBias = False + 
self.assertEqual(options.qdq_op_type_per_channel_support_to_axis, "some_value") + self.assertFalse(options.ipu_limitation_check) + self.assertEqual(options.cle_steps, 5) + self.assertEqual(options.cle_total_layer_diff_threshold, 1e-7) + self.assertFalse(options.cle_scale_append_bias) + + def test_to_diff_dict(self): + options = ExtraOptions(activation_symmetric=True) + diff_dict = options.to_diff_dict() + self.assertEqual(diff_dict, {"activation_symmetric": True}) + + @parameterized.expand( + [ + ("activation_symmetric", False, True), + ("weight_symmetric", True, False), + ("use_unsigned_relu", False, True), + ] + ) + def test_parametric_setting_attributes(self, attribute, default_value, new_value): + options = ExtraOptions() + self.assertEqual(getattr(options, attribute), default_value) + setattr(options, attribute, new_value) + self.assertEqual(getattr(options, attribute), new_value) + + +class TestQuantizationConfig(unittest.TestCase): + def test_default_values(self): + config = QuantizationConfig() + self.assertEqual(config.format, vai_q_onnx.QuantFormat.QDQ) + self.assertEqual(config.calibration_method, vai_q_onnx.PowerOfTwoMethod.MinMSE) + self.assertEqual(config.input_nodes, []) + self.assertEqual(config.output_nodes, []) + self.assertEqual(config.op_types_to_quantize, []) + self.assertEqual(config.random_data_reader_input_shape, []) + self.assertFalse(config.per_channel) + self.assertFalse(config.reduce_range) + self.assertEqual(config.activations_dtype, vai_q_onnx.QuantType.QUInt8) + self.assertEqual(config.weights_dtype, vai_q_onnx.QuantType.QInt8) + self.assertEqual(config.nodes_to_quantize, []) + self.assertEqual(config.nodes_to_exclude, []) + self.assertTrue(config.optimize_model) + self.assertFalse(config.use_external_data_format) + self.assertEqual(config.execution_providers, ["CPUExecutionProvider"]) + self.assertFalse(config.enable_ipu_cnn) + self.assertFalse(config.convert_fp16_to_fp32) + self.assertFalse(config.convert_nchw_to_nhwc) + 
self.assertFalse(config.include_cle) + self.assertIsInstance(config.extra_options, ExtraOptions) + + def test_extra_options_initialization(self): + extra_options = ExtraOptions(activation_symmetric=True) + config = QuantizationConfig(extra_options=extra_options) + self.assertEqual(config.extra_options.activation_symmetric, True) + + def test_use_symmetric_calibration(self): + config = QuantizationConfig(extra_options=ExtraOptions(activation_symmetric=True, weight_symmetric=True)) + self.assertTrue(config.use_symmetric_calibration) + + @parameterized.expand( + [ + ("format", vai_q_onnx.QuantFormat.QDQ, vai_q_onnx.QuantFormat.QOperator), + ("calibration_method", vai_q_onnx.PowerOfTwoMethod.MinMSE, vai_q_onnx.CalibrationMethod.Entropy), + ("activations_dtype", vai_q_onnx.QuantType.QUInt8, vai_q_onnx.QuantType.QInt8), + ("weights_dtype", vai_q_onnx.QuantType.QInt8, vai_q_onnx.QuantType.QUInt8), + ] + ) + def test_parametric_setting_attributes(self, attribute, default_value, new_value): + config = QuantizationConfig() + self.assertEqual(getattr(config, attribute), default_value) + setattr(config, attribute, new_value) + self.assertEqual(getattr(config, attribute), new_value) + + +class TestAutoQuantizationConfig(unittest.TestCase): + def test_ipu_cnn_config(self): + config = AutoQuantizationConfig.ipu_cnn_config() + self.assertEqual(config.format, vai_q_onnx.QuantFormat.QDQ) + self.assertEqual(config.calibration_method, vai_q_onnx.PowerOfTwoMethod.MinMSE) + self.assertTrue(config.extra_options.activation_symmetric) + + def test_ipu_transformer_config(self): + config = AutoQuantizationConfig.ipu_transformer_config() + self.assertEqual(config.format, vai_q_onnx.QuantFormat.QDQ) + self.assertEqual(config.calibration_method, vai_q_onnx.CalibrationMethod.MinMax) + self.assertTrue(config.extra_options.activation_symmetric) + + def test_cpu_cnn_config(self): + config = AutoQuantizationConfig.cpu_cnn_config() + + self.assertEqual(config.format, vai_q_onnx.QuantFormat.QDQ) + 
self.assertEqual(config.calibration_method, vai_q_onnx.CalibrationMethod.MinMax)