Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
728 changes: 435 additions & 293 deletions .github/benchmark/sglang_benchmark_models.json

Large diffs are not rendered by default.

92 changes: 70 additions & 22 deletions .github/benchmark/sglang_models_accuracy.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
{
"model_name": "DeepSeek-R1-FP8 TP4",
"model_path": "deepseek-ai/DeepSeek-R1-0528",
"extraArgs": "--tensor-parallel-size 4",
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1",
"extraArgs": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.91,
Expand Down Expand Up @@ -62,8 +62,8 @@
{
"model_name": "DeepSeek-R1-FP8 TP8",
"model_path": "deepseek-ai/DeepSeek-R1-0528",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
Expand All @@ -73,62 +73,110 @@
},
{
"model_name": "DeepSeek-R1-FP4 TP4",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"extraArgs": "--tensor-parallel-size 4",
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-v2",
"extraArgs": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.91,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "DeepSeek-R1-FP4 TP4 DP4 EP4",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"extraArgs": "--trust-remote-code --tensor-parallel-size 4 --data-parallel-size 4 --expert-parallel-size 4 --enable-dp-attention --kv-cache-dtype fp8_e4m3 --attention-backend aiter --mem-fraction-static 0.8 --decode-log-interval 1000 --chunked-prefill-size 65536 --max-running-requests 24 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-v2",
"extraArgs": "--trust-remote-code --tensor-parallel-size 4 --expert-parallel-size 4 --data-parallel-size 4 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.91,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "DeepSeek-R1-FP4 TP4 DP8 EP8",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-v2",
"extraArgs": "--trust-remote-code --tensor-parallel-size 4 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.91,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "DeepSeek-R1-FP4 TP8",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-v2",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "DeepSeek-R1-FP4 TP8 MTP3",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-v2",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"extraArgs": "--tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256",
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
"_baseline_note": "Coverage for the legacy MTP-MoEFP4 artifact using the SGLang benchmark TP8 server configuration."
},
{
"model_name": "DeepSeek-R1-FP4 TP8 MTP1",
"model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 DP8 EP8",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.91,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"_baseline_note": "Coverage for the legacy MTP-MoEFP4 artifact using TP8 with DP-attention and EP8."
},
{
"model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 MTP3",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"extraArgs": "--tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256",
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"_baseline_note": "Coverage for the legacy MTP-MoEFP4 artifact using the SGLang benchmark TP8 MTP3 server configuration."
},
{
"model_name": "DeepSeek-R1-FP4 TP8 MTP1",
"model_path": "amd/DeepSeek-R1-0528-MXFP4-v2",
"extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": null,
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
}
]
32 changes: 30 additions & 2 deletions .github/scripts/atom_sglang_mesh_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,27 @@ if [[ -n "${SERVER_EXTRA_ARGS}" ]]; then
shlex_split_to_array "${SERVER_EXTRA_ARGS}" server_args
fi

#######################################
# Pull --chunked-prefill-size out of server_args.
#
# Both spellings are recognised:
#   --chunked-prefill-size N
#   --chunked-prefill-size=N
# The value is stored in prefill_size and the flag (plus its value) is removed
# from server_args. Every other argument is kept in its original order. When
# the flag appears more than once, the last occurrence wins.
# NOTE(review): presumably prefill_size is forwarded to the server elsewhere in
# this script, which is why the flag must not stay in server_args -- confirm.
#
# Globals:
#   server_args           (read, then rewritten without the flag)
#   filtered_server_args  (written: the surviving arguments)
#   prefill_size          (written only when the flag is present)
# Outputs:
#   An error message on stderr when the flag is the last argument.
# Returns:
#   Does not return on error: exits the script with status 2.
#######################################
extract_chunked_prefill_size() {
  local i
  filtered_server_args=()
  for ((i = 0; i < ${#server_args[@]}; i++)); do
    case "${server_args[i]}" in
      --chunked-prefill-size)
        # Space-separated form: the value is the next array element.
        if (( i + 1 >= ${#server_args[@]} )); then
          # Diagnostics belong on stderr so they are not mixed into stdout.
          echo "ERROR: --chunked-prefill-size requires a value." >&2
          exit 2
        fi
        prefill_size="${server_args[i + 1]}"
        # Skip the value so it is not copied into the filtered list.
        i=$((i + 1))
        ;;
      --chunked-prefill-size=*)
        # Inline form: strip the "flag=" prefix to get the value.
        prefill_size="${server_args[i]#--chunked-prefill-size=}"
        ;;
      *)
        filtered_server_args+=("${server_args[i]}")
        ;;
    esac
  done
  # The ${arr[@]+...} guard keeps an empty list from tripping "unbound
  # variable" when running under `set -u` on bash older than 4.4.
  server_args=(${filtered_server_args[@]+"${filtered_server_args[@]}"})
}

extract_chunked_prefill_size

if [[ "${SPEC_MODE}" == "mtp" ]]; then
export SGLANG_ENABLE_SPEC_V2="${SGLANG_ENABLE_SPEC_V2:-1}"
if [[ "${SERVER_EXTRA_ARGS}" != *"--speculative-algorithm"* ]]; then
Expand Down Expand Up @@ -194,6 +215,13 @@ if [[ -n "${BENCH_EXTRA_ARGS}" ]]; then
shlex_split_to_array "${BENCH_EXTRA_ARGS}" bench_args
fi

# Size the benchmark run relative to the requested concurrency (CONC).
# When both DP_SIZE and EP_SIZE are greater than 1 the run uses 3x CONC
# prompts and CONC warmups; otherwise it uses 10x CONC prompts and 2x CONC
# warmups.
if (( DP_SIZE > 1 && EP_SIZE > 1 )); then
  bench_num_prompts="$(( CONC * 3 ))"
  bench_num_warmups="${CONC}"
else
  bench_num_prompts="$(( CONC * 10 ))"
  bench_num_warmups="$(( 2 * CONC ))"
fi

set -x
PYTHONDONTWRITEBYTECODE=1 python "${BENCH_SERVING_DIR}/benchmark_serving.py" \
--model="${resolved_model_path}" \
Expand All @@ -203,9 +231,9 @@ PYTHONDONTWRITEBYTECODE=1 python "${BENCH_SERVING_DIR}/benchmark_serving.py" \
--random-input-len="${ISL}" \
--random-output-len="${OSL}" \
--random-range-ratio "${RANDOM_RANGE_RATIO}" \
--num-prompts="$(( CONC * 10 ))" \
--num-prompts="${bench_num_prompts}" \
--max-concurrency="${CONC}" \
--num-warmups="$(( 2 * CONC ))" \
--num-warmups="${bench_num_warmups}" \
--request-rate=inf \
--ignore-eos \
--save-result \
Expand Down
Loading
Loading