huggingface · mht-sharma · Sep 10, 2024
diff --git a/Makefile b/Makefile
@@ -36,3 +36,35 @@ clean:
 	rm -rf build/
 	rm -rf dist/
 	rm -rf optimum_amd.egg-info/
+
+# Docker helper targets are commands, not files, so always run them.
+.PHONY: build-quark interact interact-quark
+
+# Host Hugging Face hub cache mounted at /data inside the containers.
+# Override with: make interact HF_HUB_CACHE=/path/to/hub
+HF_HUB_CACHE ?= $(HOME)/.cache/huggingface/hub
+
+# Build the Quark quantization image from docker/quantization-quark/.
+build-quark:
+	docker build -t quark-mht docker/quantization-quark/
+
+# Start an interactive bash shell in the tgi-mht:2.5 image with /dev/kfd and
+# /dev/dri passed through and this directory mounted at /tgi.
+interact:
+	docker run --rm -it   --entrypoint bash \
+	--cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd \
+	--device=/dev/dri --group-add video --ipc=host --shm-size 64g --net host \
+	-v "$(HF_HUB_CACHE)":/data \
+	-v "$(CURDIR)":/tgi \
+	tgi-mht:2.5
+
+# Start an interactive bash shell in the quark-mht image; this directory is
+# mounted at /quark and a sibling ../transformers checkout at /tr.
+interact-quark:
+	docker run --rm -it   --entrypoint bash \
+	--cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd \
+	--device=/dev/dri --group-add video --ipc=host --shm-size 64g --net host \
+	-v "$(HF_HUB_CACHE)":/data \
+	-v "$(CURDIR)":/quark \
+	-v "$(CURDIR)/../transformers":/tr \
+	quark-mht
diff --git a/docker/quantization-quark/Dockerfile b/docker/quantization-quark/Dockerfile
@@ -0,0 +1,51 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Licensed under the MIT License.
+
+# Image for running Quark quantization, based on the ROCm 6.1 Ubuntu 22.04 dev image.
+FROM rocm/dev-ubuntu-22.04:6.1
+
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install system packages, make `python` resolve to python3.10, then upgrade pip.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    sudo \
+    python3.10 \
+    python3.10-dev \
+    python3-pip \
+    git \
+    wget \
+    unzip \
+    libsndfile1-dev \
+    tesseract-ocr \
+    espeak-ng \
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblaslt-dev \
+    hipsolver-dev \
+    hipblas-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
+    python -m pip install -U pip --no-cache-dir
+
+# PyTorch wheels from the ROCm 6.1 index.
+RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 --no-cache-dir
+
+WORKDIR /quark
+# Download the Quark wheel to a fixed local filename, install it, and delete the
+# wheel so it does not remain in the layer. The URL is quoted because it
+# contains `?`, which an unquoted shell word would treat as a glob character.
+RUN wget -O quark-0.2.0-23-py3-none-any.whl "https://www.xilinx.com/bin/public/openDownload?filename=quark-0.2.0+6af1bac23-py3-none-any.whl" && \
+    pip install --no-cache-dir quark-0.2.0-23-py3-none-any.whl && \
+    rm -f quark-0.2.0-23-py3-none-any.whl
+
+# Fail the build early if the Quark kernel module cannot be imported.
+# NOTE(review): this import may also build/cache the kernels -- confirm.
+RUN python -c "import quark.torch.kernel"
+
+# Transformers fork pinned to a specific commit.
+RUN pip install --no-cache-dir git+https://github.com/mht-sharma/transformers.git@fc62c00e1f2a927acb354e28e43828a47fa776b6
+
+ENTRYPOINT ["bash"]
diff --git a/optimum/amd/quantizers/__init__.py b/optimum/amd/quantizers/__init__.py
@@ -0,0 +1,4 @@
+from .quark import (
+    AutoQuantizationConfig,
+    QuarkPlugin,
+)
diff --git a/optimum/amd/quantizers/quark/__init__.py b/optimum/amd/quantizers/quark/__init__.py
@@ -0,0 +1,2 @@
+from .configuration import AutoQuantizationConfig
+from .quantizer import QuarkPlugin
diff --git a/optimum/amd/quantizers/quark/algo_config_constants.py b/optimum/amd/quantizers/quark/algo_config_constants.py
@@ -0,0 +1,205 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Licensed under the MIT License.
+
+
+# Per-architecture layer maps for the Quark quantization algorithms, keyed by model type.
+# Each entry provides:
+#   - "scaling_layers": groups of layers ("layers") listed together with the op that
+#     precedes them ("prev_op"). An optional "condition" string restricts when a group
+#     applies (presumably evaluated against the decoder `module` -- confirm against
+#     the consumer of this table).
+#   - "inside_layer_modules": names of the modules inside one decoder layer.
+#   - "model_decoder_layers": dotted path to the decoder layers.
+#   - "embedding_layers": dotted paths to the embedding modules.
+ALGO_CONFIG_PARAMS = {
+    "llama": {
+        "scaling_layers": [
+            {
+                "prev_op": "input_layernorm",
+                "layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
+                "inp": "self_attn.q_proj",
+                "module2inspect": "self_attn",
+                "has_kwargs": True,
+                "help": "attention input",
+            },
+            {
+                "prev_op": "self_attn.v_proj",
+                "layers": ["self_attn.o_proj"],
+                "inp": "self_attn.o_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "attention out, Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696, if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
+                "condition": "module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
+            },
+            {
+                "prev_op": "post_attention_layernorm",
+                "layers": ["mlp.gate_proj", "mlp.up_proj"],
+                "inp": "mlp.gate_proj",
+                "module2inspect": "mlp",
+                "has_kwargs": False,
+                "help": "linear 1",
+            },
+            {
+                "prev_op": "mlp.up_proj",
+                "layers": ["mlp.down_proj"],
+                "inp": "mlp.down_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "linear 2",
+            },
+        ],
+        "inside_layer_modules": [
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.q_proj",
+            "self_attn.o_proj",
+            "mlp.up_proj",
+            "mlp.gate_proj",
+            "mlp.down_proj",
+        ],
+        "model_decoder_layers": "model.layers",
+        "embedding_layers": ["model.embed_tokens"],
+    },
+    "mistral": {
+        "scaling_layers": [
+            {
+                "prev_op": "input_layernorm",
+                "layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
+                "inp": "self_attn.q_proj",
+                "module2inspect": "self_attn",
+                "has_kwargs": True,
+                "help": "attention input",
+            },
+            {
+                "prev_op": "self_attn.v_proj",
+                "layers": ["self_attn.o_proj"],
+                "inp": "self_attn.o_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "attention out, Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696, if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
+                "condition": "module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
+            },
+            {
+                "prev_op": "post_attention_layernorm",
+                "layers": ["mlp.gate_proj", "mlp.up_proj"],
+                "inp": "mlp.gate_proj",
+                "module2inspect": "mlp",
+                "has_kwargs": False,
+                "help": "linear 1",
+            },
+            {
+                "prev_op": "mlp.up_proj",
+                "layers": ["mlp.down_proj"],
+                "inp": "mlp.down_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "linear 2",
+            },
+        ],
+        "inside_layer_modules": [
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.q_proj",
+            "self_attn.o_proj",
+            "mlp.up_proj",
+            "mlp.gate_proj",
+            "mlp.down_proj",
+        ],
+        "model_decoder_layers": "model.layers",
+        "embedding_layers": ["model.embed_tokens"],
+    },
+    "opt": {
+        "scaling_layers": [
+            {
+                "prev_op": "self_attn_layer_norm",
+                "layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
+                "inp": "self_attn.q_proj",
+                "module2inspect": "self_attn",
+                "has_kwargs": True,
+                "help": "attention input",
+            },
+            {
+                "prev_op": "self_attn.v_proj",
+                "layers": ["self_attn.out_proj"],
+                "inp": "self_attn.out_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "attention out",
+            },
+            {
+                "prev_op": "final_layer_norm",
+                "layers": ["fc1"],
+                "inp": "fc1",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "linear 1",
+            },
+            {
+                "prev_op": "fc1",
+                "layers": ["fc2"],
+                "inp": "fc2",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "linear 2",
+            },
+        ],
+        "inside_layer_modules": [
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.q_proj",
+            "self_attn.out_proj",
+            "fc1",
+            "fc2",
+        ],
+        "model_decoder_layers": "model.decoder.layers",
+        "embedding_layers": ["model.decoder.embed_tokens", "model.decoder.embed_positions"],
+    },
+    "qwen2": {
+        "scaling_layers": [
+            {
+                "prev_op": "input_layernorm",
+                "layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
+                "inp": "self_attn.q_proj",
+                "module2inspect": "self_attn",
+                "has_kwargs": True,
+                "help": "attention input",
+            },
+            {
+                "prev_op": "self_attn.v_proj",
+                "layers": ["self_attn.o_proj"],
+                "inp": "self_attn.o_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "attention out, Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696, if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
+                "condition": "module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
+            },
+            {
+                "prev_op": "post_attention_layernorm",
+                "layers": ["mlp.gate_proj", "mlp.up_proj"],
+                "inp": "mlp.gate_proj",
+                "module2inspect": "mlp",
+                "has_kwargs": False,
+                "help": "linear 1",
+            },
+            {
+                "prev_op": "mlp.up_proj",
+                "layers": ["mlp.down_proj"],
+                "inp": "mlp.down_proj",
+                "module2inspect": None,
+                "has_kwargs": False,
+                "help": "linear 2",
+            },
+        ],
+        "inside_layer_modules": [
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.q_proj",
+            "self_attn.o_proj",
+            "mlp.up_proj",
+            "mlp.gate_proj",
+            "mlp.down_proj",
+        ],
+        "model_decoder_layers": "model.layers",
+        "embedding_layers": ["model.embed_tokens"],
+    },
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .configuration import AutoQuantizationConfig
		from .quantizer import QuarkPlugin