Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,23 @@ clean:
rm -rf build/
rm -rf dist/
rm -rf optimum_amd.egg-info/

# Docker helper targets. None of them creates a file named after itself, so
# they are declared phony; otherwise a stray file or directory called
# `interact` (for example) would make the target look up to date.
.PHONY: build-quark interact interact-quark

# Host directory holding the Hugging Face hub cache; mounted at /data in the
# containers. Override per machine: `make interact HF_HUB_CACHE=/path/to/hub`.
HF_HUB_CACHE ?= /home/amd/.cache/huggingface/hub

# Image names, overridable from the command line.
QUARK_IMAGE ?= quark-mht
TGI_IMAGE ?= tgi-mht:2.5

# Flags shared by both interactive containers: a throw-away bash session with
# the /dev/kfd and /dev/dri device nodes passed through, host IPC and
# networking, and the hub cache mounted at /data.
DOCKER_RUN_FLAGS := --rm -it --entrypoint bash \
    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd \
    --device=/dev/dri --group-add video --ipc=host --shm-size 64g --net host \
    -v "$(HF_HUB_CACHE)":/data

# Build the Quark quantization image from docker/quantization-quark/.
build-quark:
	docker build -t $(QUARK_IMAGE) docker/quantization-quark/

# Open a shell in the TGI image with this checkout mounted at /tgi.
# $(CURDIR) is used instead of $(PWD) because PWD is inherited from the
# environment and does not follow `make -C <dir>`.
interact:
	docker run $(DOCKER_RUN_FLAGS) \
		-v "$(CURDIR)":/tgi \
		$(TGI_IMAGE)

# Open a shell in the Quark image with this checkout mounted at /quark and the
# sibling ../transformers directory mounted at /tr.
interact-quark:
	docker run $(DOCKER_RUN_FLAGS) \
		-v "$(CURDIR)":/quark \
		-v "$(CURDIR)/../transformers":/tr \
		$(QUARK_IMAGE)
42 changes: 42 additions & 0 deletions docker/quantization-quark/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
# Licensed under the MIT License.

# Image for running AMD Quark quantization on top of the ROCm 6.1 dev image.
FROM rocm/dev-ubuntu-22.04:6.1

LABEL maintainer="Hugging Face"

# Build-time only: keeps apt from prompting during package installation.
ARG DEBIAN_FRONTEND=noninteractive

# System packages plus Python 3.10. The apt lists are removed in the same layer
# so they never reach the image, and `python` is pointed at python3.10 before
# pip is upgraded through it.
RUN apt-get update && apt-get install -y --no-install-recommends \
    sudo \
    python3.10 \
    python3.10-dev \
    python3-pip \
    git \
    wget \
    unzip \
    libsndfile1-dev \
    tesseract-ocr \
    espeak-ng \
    rocthrust-dev \
    hipsparse-dev \
    hipblaslt-dev \
    hipsolver-dev \
    hipblas-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
    python -m pip install --no-cache-dir -U pip

# PyTorch wheels from the ROCm 6.1 index, matching the base image.
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 --no-cache-dir

WORKDIR /quark

# Download, install and remove the Quark wheel in a single layer so the wheel
# file is not kept in the image. The URL is quoted because it contains `?`,
# which an unquoted shell word treats as a glob character.
RUN wget -O quark-0.2.0-23-py3-none-any.whl "https://www.xilinx.com/bin/public/openDownload?filename=quark-0.2.0+6af1bac23-py3-none-any.whl" && \
    pip install --no-cache-dir quark-0.2.0-23-py3-none-any.whl && \
    rm -f quark-0.2.0-23-py3-none-any.whl

# Fail the build here if the Quark torch kernel module cannot be imported.
# NOTE(review): this import may also trigger a one-time kernel build - confirm.
RUN python -c "import quark.torch.kernel"

# transformers is installed from a fork, pinned to an exact commit.
RUN pip install --no-cache-dir git+https://github.com/mht-sharma/transformers.git@fc62c00e1f2a927acb354e28e43828a47fa776b6

ENTRYPOINT ["bash"]
4 changes: 4 additions & 0 deletions optimum/amd/quantizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .quark import (
AutoQuantizationConfig,
QuarkPlugin,
)
2 changes: 2 additions & 0 deletions optimum/amd/quantizers/quark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .configuration import AutoQuantizationConfig
from .quantizer import QuarkPlugin
195 changes: 195 additions & 0 deletions optimum/amd/quantizers/quark/algo_config_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
# Licensed under the MIT License.


def _llama_style_config():
    """Build the algorithm config for decoders that use the LLaMA module layout.

    llama, mistral and qwen2 name their decoder sub-modules identically
    (``self_attn.{q,k,v,o}_proj``, ``mlp.{gate,up,down}_proj``), so all three
    are generated from this single definition to keep them from drifting apart.

    Returns:
        dict: a newly created config; every call returns independent dict and
        list objects, so mutating one model's entry never affects another's.
    """
    return {
        "scaling_layers": [
            {
                "prev_op": "input_layernorm",
                "layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
                "inp": "self_attn.q_proj",
                "module2inspect": "self_attn",
                "has_kwargs": True,
                "help": "attention input",
            },
            {
                "prev_op": "self_attn.v_proj",
                "layers": ["self_attn.o_proj"],
                "inp": "self_attn.o_proj",
                "module2inspect": None,
                "has_kwargs": False,
                "help": "attention out, Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696, if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
                # The v_proj -> o_proj pair only applies when both weights have
                # the same shape (see the PR linked in "help").
                # NOTE(review): presumably evaluated by the consumer with
                # `module` bound to a decoder layer - confirm.
                "condition": "module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape",
            },
            {
                "prev_op": "post_attention_layernorm",
                "layers": ["mlp.gate_proj", "mlp.up_proj"],
                "inp": "mlp.gate_proj",
                "module2inspect": "mlp",
                "has_kwargs": False,
                "help": "linear 1",
            },
            {
                "prev_op": "mlp.up_proj",
                "layers": ["mlp.down_proj"],
                "inp": "mlp.down_proj",
                "module2inspect": None,
                "has_kwargs": False,
                "help": "linear 2",
            },
        ],
        "inside_layer_modules": [
            "self_attn.k_proj",
            "self_attn.v_proj",
            "self_attn.q_proj",
            "self_attn.o_proj",
            "mlp.up_proj",
            "mlp.gate_proj",
            "mlp.down_proj",
        ],
        "model_decoder_layers": "model.layers",
        "embedding_layers": ["model.embed_tokens"],
    }


# Per-architecture algorithm configuration, keyed by model type.
#
# Each entry provides:
#   scaling_layers       - list of dicts pairing a "prev_op" with the "layers"
#                          that follow it, plus "inp", "module2inspect",
#                          "has_kwargs", a free-text "help" and an optional
#                          "condition" expression.
#   inside_layer_modules - dotted names of the linear modules inside one
#                          decoder layer.
#   model_decoder_layers - dotted path to the list of decoder layers.
#   embedding_layers     - dotted paths to the embedding modules.
ALGO_CONFIG_PARAMS = {
    "llama": _llama_style_config(),
    "mistral": _llama_style_config(),
    "opt": {
        "scaling_layers": [
            {
                "prev_op": "self_attn_layer_norm",
                "layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
                "inp": "self_attn.q_proj",
                "module2inspect": "self_attn",
                "has_kwargs": True,
                "help": "attention input",
            },
            {
                "prev_op": "self_attn.v_proj",
                "layers": ["self_attn.out_proj"],
                "inp": "self_attn.out_proj",
                "module2inspect": None,
                "has_kwargs": False,
                "help": "attention out",
            },
            {
                "prev_op": "final_layer_norm",
                "layers": ["fc1"],
                "inp": "fc1",
                "module2inspect": None,
                "has_kwargs": False,
                "help": "linear 1",
            },
            {
                "prev_op": "fc1",
                "layers": ["fc2"],
                "inp": "fc2",
                "module2inspect": None,
                "has_kwargs": False,
                "help": "linear 2",
            },
        ],
        "inside_layer_modules": [
            "self_attn.k_proj",
            "self_attn.v_proj",
            "self_attn.q_proj",
            "self_attn.out_proj",
            "fc1",
            "fc2",
        ],
        "model_decoder_layers": "model.decoder.layers",
        "embedding_layers": ["model.decoder.embed_tokens", "model.decoder.embed_positions"],
    },
    # qwen2 previously omitted the "condition" on its v_proj -> o_proj entry
    # even though its help text states it; it now matches llama and mistral.
    "qwen2": _llama_style_config(),
}
Loading