diff --git a/.github/benchmark/sglang_benchmark_models.json b/.github/benchmark/sglang_benchmark_models.json index af06763d8..fdcb33eff 100644 --- a/.github/benchmark/sglang_benchmark_models.json +++ b/.github/benchmark/sglang_benchmark_models.json @@ -1,302 +1,444 @@ -[ - { - "display": "DeepSeek-R1-0528 FP8 TP8", - "source_path": "deepseek-ai/DeepSeek-R1-0528", - "path": "deepseek-ai/DeepSeek-R1-0528", - "prefix": "deepseek-r1-fp8-tp8", - "extra_args": "--trust-remote-code --tensor-parallel-size 8", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528 FP8 TP4", - "dashboard_model": "DeepSeek-R1-0528-tp4", - "source_path": "deepseek-ai/DeepSeek-R1-0528", - "path": "deepseek-ai/DeepSeek-R1-0528", - "prefix": "deepseek-r1-fp8-tp4", - "extra_args": "--trust-remote-code --tensor-parallel-size 4", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp8", - "extra_args": "--trust-remote-code --tensor-parallel-size 8", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP4", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp4", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp4", - 
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP4 DP4 EP4", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp4-dp4-ep4", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp4-dp4-ep4", - "extra_args": "--trust-remote-code --tensor-parallel-size 4 --data-parallel-size 4 --expert-parallel-size 4 --enable-dp-attention --kv-cache-dtype fp8_e4m3 --attention-backend aiter --mem-fraction-static 0.8 --decode-log-interval 1000 --chunked-prefill-size 65536 --max-running-requests 24 --disable-radix-cache", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-V2", - "path": "amd/DeepSeek-R1-0528-MXFP4-V2", - "prefix": "deepseek-r1-fp4-tp8-ep8", - "extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 MTP3", - 
"dashboard_model": "DeepSeek-R1-0528-MXFP4-mtp3", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp8-mtp3", - "extra_args": "--trust-remote-code --tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "B", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 MTP1", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mtp1", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-tp8-mtp1", - "extra_args": "--trust-remote-code --tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 
256", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "B", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP8", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp8", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-tp8", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "none", - "mesh_presets": ["all", "tp8", "non-mtp"], - "tp_size": 8, - "dp_size": 1, - "ep_size": 1, - "extra_args": "", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1", "1x1024"], - "supported_concurrency_values_by_pair": { - "8192x1": [2, 4, 8, 16, 32, 64, 128, 256], - "1x1024": [2, 4, 8, 16, 32, 64, 128, 256] +{ + "templates": { + "extra_args": { + "trust_remote_code": "--trust-remote-code", + "aiter_runtime": "--attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "qwen_reasoning": "--mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", + "mtp1_common": "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "mtp3_common": 
"--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256" }, - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" + "env_vars": { + "deepseek_common": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "deepseek_dp_common": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "deepseek_mtp_common": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "deepseek_mtp_dp_common": 
"SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "qwen_common": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" + } }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP8 MTP", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp8-mtp", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-tp8-mtp", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "mtp", - "mesh_presets": ["all", "tp8-mtp", "mtp"], - "tp_size": 8, - "dp_size": 1, - "ep_size": 1, - "extra_args": "--speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1", "1x1024"], - "supported_concurrency_values_by_pair": { - "8192x1": [2, 4, 8, 16, 32, 64, 128, 256], - "1x1024": [2, 4, 8, 16, 32, 64, 128, 256] + "models": [ + { + "display": "DeepSeek-R1-0528 FP8 TP4", + "dashboard_model": "DeepSeek-R1-0528-tp4", + "workload_label": "SGLang-OOB", + "source_path": "deepseek-ai/DeepSeek-R1-0528", + "path": 
"deepseek-ai/DeepSeek-R1-0528", + "prefix": "deepseek-r1-fp8-tp4", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 4", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_common" }, - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP4", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp4", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-tp4", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "none", - "mesh_presets": ["all", "tp4", "non-mtp"], - "tp_size": 4, - "dp_size": 1, - "ep_size": 1, - "extra_args": "--attention-backend aiter", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1", "1x1024"], - "supported_concurrency_values_by_pair": { - "8192x1": [64, 128, 256], - "1x1024": [64, 128, 256] + { + "display": "DeepSeek-R1-0528 FP8 TP8", + "workload_label": "SGLang-OOB", + "source_path": "deepseek-ai/DeepSeek-R1-0528", + "path": "deepseek-ai/DeepSeek-R1-0528", + "prefix": "deepseek-r1-fp8-tp8", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 8", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_common" }, - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP4 MTP", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp4-mtp", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-tp4-mtp", - "workload_label": 
"SGLang-Mesh", - "mesh_spec_mode": "mtp", - "mesh_presets": ["all", "tp4-mtp", "mtp"], - "tp_size": 4, - "dp_size": 1, - "ep_size": 1, - "extra_args": "--attention-backend aiter --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1", "1x1024"], - "supported_concurrency_values_by_pair": { - "8192x1": [64, 128, 256], - "1x1024": [64, 128, 256] + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP4", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp4", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-tp4", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 4", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_common" }, - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA4 EP4", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa4-ep4", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-dpa4-ep4", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "none", - "mesh_presets": ["all", 
"dpa4-ep4", "non-mtp"], - "tp_size": 4, - "dp_size": 4, - "ep_size": 4, - "extra_args": "--dp-size 4 --enable-dp-attention --ep-size 4 --max-running-requests 4096", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1"], - "supported_concurrency_values_by_pair": { - "8192x1": [256, 512, 1024] + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-tp8", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 8", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_common" }, - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA4 EP4 MTP", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa4-ep4-mtp", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-dpa4-ep4-mtp", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "mtp", - "mesh_presets": ["all", "dpa4-ep4-mtp", "mtp"], - "tp_size": 4, - "dp_size": 4, - "ep_size": 4, - "extra_args": "--dp-size 4 --enable-dp-attention --ep-size 4 --max-running-requests 4096 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1"], - "supported_concurrency_values_by_pair": { - "8192x1": [256, 512, 1024] + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP4 DP4 EP4", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp4-dp4-ep4", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-tp4-dp4-ep4", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 4 --expert-parallel-size 4 --data-parallel-size 4 --enable-dp-attention", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_dp_common" }, - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA8 EP8", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa8-ep8", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-dpa8-ep8", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "none", - "mesh_presets": ["all", "dpa8-ep8", "non-mtp"], - "tp_size": 8, - "dp_size": 8, - "ep_size": 8, - "extra_args": "--dp-size 8 --enable-dp-attention --ep-size 8 --max-running-requests 4096", - "bench_args": "", - "runner": 
"atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1", "1x1024"], - "supported_concurrency_values_by_pair": { - "8192x1": [512, 1024, 2048], - "1x1024": [1024, 2048, 4096] + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-V2", + "path": "amd/DeepSeek-R1-0528-MXFP4-V2", + "prefix": "deepseek-r1-fp4-tp8-ep8", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 8 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_dp_common" }, - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384" - }, - { - "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA8 EP8 MTP", - "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa8-ep8-mtp", - "source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "prefix": "deepseek-r1-fp4-mesh-dpa8-ep8-mtp", - "workload_label": "SGLang-Mesh", - "mesh_spec_mode": "mtp", - "mesh_presets": ["all", "dpa8-ep8-mtp", "mtp"], - "tp_size": 8, - "dp_size": 8, - "ep_size": 8, - "extra_args": "--dp-size 8 --enable-dp-attention --ep-size 8 --max-running-requests 4096 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "A", - "supported_input_output_pairs": ["8192x1", "1x1024"], - "supported_concurrency_values_by_pair": { - "8192x1": [512, 1024, 2048], - "1x1024": [1024, 2048, 4096] + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP4 DP8 EP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-tp4-dp8-ep8", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-tp4-dp8-ep8", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 4 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "env_vars": "deepseek_dp_common" }, - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384" - }, - { - "display": "Qwen3.5-397B-A17B-FP8 TP4", - "dashboard_model": "Qwen3.5-397B-A17B-FP8-tp4", - "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", - "path": "Qwen/Qwen3.5-397B-A17B-FP8", - "prefix": "qwen3-5-397b-a17b-fp8-tp4", - "extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "B", - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" - }, - { - "display": "Qwen3.5-397B-A17B-FP8 TP8", - 
"dashboard_model": "Qwen3.5-397B-A17B-FP8", - "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", - "path": "Qwen/Qwen3.5-397B-A17B-FP8", - "prefix": "qwen3-5-397b-a17b-fp8-tp8", - "extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache", - "bench_args": "", - "runner": "atom-mi355-8gpu-aac-runner", - "nightly_group": "B", - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0" - } -] + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 MTP1", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mtp1", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-tp8-mtp1", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 8", "aiter_runtime", "mtp1_common"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "deepseek_mtp_common" + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 MTP3", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mtp3", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-tp8-mtp3", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 8", "aiter_runtime", "mtp3_common", "--max-running-requests 256"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "deepseek_mtp_common" + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 MTP3 TP4 DP4 EP4", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mtp3-tp4-dp4-ep4", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mtp3-tp4-dp4-ep4", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 4 --expert-parallel-size 4 --data-parallel-size 4 --enable-dp-attention", "aiter_runtime", "mtp3_common", "--max-running-requests 
4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "deepseek_mtp_dp_common" + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 MTP3 TP8 DP8 EP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mtp3-tp8-dp8-ep8", + "workload_label": "SGLang-OOB", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mtp3-tp8-dp8-ep8", + "extra_args": ["trust_remote_code", "--tensor-parallel-size 8 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention", "aiter_runtime", "mtp3_common", "--max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "deepseek_mtp_dp_common" + }, + { + "display": "Qwen3.5-397B-A17B-FP8 TP4", + "dashboard_model": "Qwen3.5-397B-A17B-FP8-tp4", + "workload_label": "SGLang-OOB", + "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "path": "Qwen/Qwen3.5-397B-A17B-FP8", + "prefix": "qwen3-5-397b-a17b-fp8-tp4", + "extra_args": ["--tensor-parallel-size 4", "qwen_reasoning"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "qwen_common" + }, + { + "display": "Qwen3.5-397B-A17B-FP8 TP8", + "dashboard_model": "Qwen3.5-397B-A17B-FP8", + "workload_label": "SGLang-OOB", + "source_path": "Qwen/Qwen3.5-397B-A17B-FP8", + "path": "Qwen/Qwen3.5-397B-A17B-FP8", + "prefix": "qwen3-5-397b-a17b-fp8-tp8", + "extra_args": ["--tensor-parallel-size 8", "qwen_reasoning"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "B", + "env_vars": "qwen_common" + }, + { + "display": "DeepSeek-R1-0528 FP8 SGLang-Mesh DPA4 EP4", + "dashboard_model": "DeepSeek-R1-0528-mesh-dpa4-ep4", + "workload_label": "SGLang-Mesh", + "source_path": "deepseek-ai/DeepSeek-R1-0528", + "path": "deepseek-ai/DeepSeek-R1-0528", + "prefix": "deepseek-r1-fp8-mesh-dpa4-ep4", + "mesh_spec_mode": "none", + "mesh_presets": ["all", 
"fp8-all", "fp8-dpa4-ep4", "fp8-non-mtp", "dpa4-ep4", "non-mtp"], + "tp_size": 4, + "dp_size": 4, + "ep_size": 4, + "extra_args": ["trust_remote_code", "aiter_runtime", "--dp-size 4 --enable-dp-attention --ep-size 4 --max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1"], + "supported_concurrency_values_by_pair": { + "8192x1": [256, 512, 1024] + }, + "env_vars": "deepseek_dp_common", + "case_extra_args_by_pair": { + "8192x1": "--chunked-prefill-size 65536" + } + }, + { + "display": "DeepSeek-R1-0528 FP8 SGLang-Mesh DPA8 EP8", + "dashboard_model": "DeepSeek-R1-0528-mesh-dpa8-ep8", + "workload_label": "SGLang-Mesh", + "source_path": "deepseek-ai/DeepSeek-R1-0528", + "path": "deepseek-ai/DeepSeek-R1-0528", + "prefix": "deepseek-r1-fp8-mesh-dpa8-ep8", + "mesh_spec_mode": "none", + "mesh_presets": ["all", "fp8-all", "fp8-dpa8-ep8", "fp8-non-mtp", "dpa8-ep8", "non-mtp"], + "tp_size": 8, + "dp_size": 8, + "ep_size": 8, + "extra_args": ["trust_remote_code", "aiter_runtime", "--dp-size 8 --enable-dp-attention --ep-size 8 --max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [512, 1024, 2048], + "1x1024": [1024, 2048, 4096] + }, + "env_vars": "deepseek_dp_common", + "case_extra_args_by_pair": { + "8192x1": "--chunked-prefill-size 65536" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP4", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp4", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-tp4", + "mesh_spec_mode": "none", + "mesh_presets": ["all", "fp4-all", "fp4-tp4", "fp4-non-mtp", "tp4", "non-mtp"], + "tp_size": 4, + "dp_size": 1, + "ep_size": 1, + "extra_args": 
["trust_remote_code", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [64, 128, 256], + "1x1024": [64, 128, 256] + }, + "env_vars": "deepseek_common", + "case_env_vars_by_pair": { + "1x1024": "ATOM_USE_FP4_NON_SHUFFLE_TRITON_GEMM=1" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp8", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-tp8", + "mesh_spec_mode": "none", + "mesh_presets": ["all", "fp4-all", "fp4-tp8", "fp4-non-mtp", "tp8", "non-mtp"], + "tp_size": 8, + "dp_size": 1, + "ep_size": 1, + "extra_args": ["trust_remote_code", "aiter_runtime"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [2, 4, 8, 16, 32, 64, 128, 256], + "1x1024": [2, 4, 8, 16, 32, 64, 128, 256] + }, + "env_vars": "deepseek_common", + "case_env_vars_by_pair": { + "1x1024": "ATOM_USE_FP4_NON_SHUFFLE_TRITON_GEMM=1" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA4 EP4", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa4-ep4", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-dpa4-ep4", + "mesh_spec_mode": "none", + "mesh_presets": ["all", "fp4-all", "fp4-dpa4-ep4", "fp4-non-mtp", "dpa4-ep4", "non-mtp"], + "tp_size": 4, + "dp_size": 4, + "ep_size": 4, + "extra_args": ["trust_remote_code", "aiter_runtime", "--dp-size 4 --enable-dp-attention --ep-size 4 --max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": 
"A", + "supported_input_output_pairs": ["8192x1"], + "supported_concurrency_values_by_pair": { + "8192x1": [256, 512, 1024] + }, + "env_vars": "deepseek_dp_common", + "case_extra_args_by_pair": { + "8192x1": "--chunked-prefill-size 65536" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA8 EP8", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa8-ep8", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-dpa8-ep8", + "mesh_spec_mode": "none", + "mesh_presets": ["all", "fp4-all", "fp4-dpa8-ep8", "fp4-non-mtp", "dpa8-ep8", "non-mtp"], + "tp_size": 8, + "dp_size": 8, + "ep_size": 8, + "extra_args": ["trust_remote_code", "aiter_runtime", "--dp-size 8 --enable-dp-attention --ep-size 8 --max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [512, 1024, 2048], + "1x1024": [1024, 2048, 4096] + }, + "env_vars": "deepseek_dp_common", + "case_extra_args_by_pair": { + "8192x1": "--chunked-prefill-size 65536" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP4 MTP", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp4-mtp", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-tp4-mtp", + "mesh_spec_mode": "mtp", + "mesh_presets": ["all", "fp4-all", "fp4-tp4-mtp", "fp4-mtp", "tp4-mtp", "mtp"], + "tp_size": 4, + "dp_size": 1, + "ep_size": 1, + "extra_args": ["trust_remote_code", "aiter_runtime", "mtp3_common", "--max-running-requests 256"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [64, 128, 256], + 
"1x1024": [64, 128, 256] + }, + "env_vars": "deepseek_mtp_common", + "case_env_vars_by_pair": { + "1x1024": "ATOM_USE_FP4_NON_SHUFFLE_TRITON_GEMM=1" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh TP8 MTP", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-tp8-mtp", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-tp8-mtp", + "mesh_spec_mode": "mtp", + "mesh_presets": ["all", "fp4-all", "fp4-tp8-mtp", "fp4-mtp", "tp8-mtp", "mtp"], + "tp_size": 8, + "dp_size": 1, + "ep_size": 1, + "extra_args": ["trust_remote_code", "aiter_runtime", "mtp3_common", "--max-running-requests 256"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [2, 4, 8, 16, 32, 64, 128, 256], + "1x1024": [2, 4, 8, 16, 32, 64, 128, 256] + }, + "env_vars": "deepseek_mtp_common", + "case_env_vars_by_pair": { + "1x1024": "ATOM_USE_FP4_NON_SHUFFLE_TRITON_GEMM=1" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA4 EP4 MTP", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa4-ep4-mtp", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-dpa4-ep4-mtp", + "mesh_spec_mode": "mtp", + "mesh_presets": ["all", "fp4-all", "fp4-dpa4-ep4-mtp", "fp4-mtp", "dpa4-ep4-mtp", "mtp"], + "tp_size": 4, + "dp_size": 4, + "ep_size": 4, + "extra_args": ["trust_remote_code", "aiter_runtime", "--dp-size 4 --enable-dp-attention --ep-size 4", "mtp3_common", "--max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1"], + "supported_concurrency_values_by_pair": { + "8192x1": [256, 512, 1024] + }, + "env_vars": 
"deepseek_mtp_dp_common", + "case_extra_args_by_pair": { + "8192x1": "--chunked-prefill-size 65536" + } + }, + { + "display": "DeepSeek-R1-0528-MXFP4 FP4 SGLang-Mesh DPA8 EP8 MTP", + "dashboard_model": "DeepSeek-R1-0528-MXFP4-mesh-dpa8-ep8-mtp", + "workload_label": "SGLang-Mesh", + "source_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "prefix": "deepseek-r1-fp4-mesh-dpa8-ep8-mtp", + "mesh_spec_mode": "mtp", + "mesh_presets": ["all", "fp4-all", "fp4-dpa8-ep8-mtp", "fp4-mtp", "dpa8-ep8-mtp", "mtp"], + "tp_size": 8, + "dp_size": 8, + "ep_size": 8, + "extra_args": ["trust_remote_code", "aiter_runtime", "--dp-size 8 --enable-dp-attention --ep-size 8", "mtp3_common", "--max-running-requests 4096"], + "bench_args": "", + "runner": "atom-mi355-8gpu-aac-runner", + "nightly_group": "A", + "supported_input_output_pairs": ["8192x1", "1x1024"], + "supported_concurrency_values_by_pair": { + "8192x1": [512, 1024, 2048], + "1x1024": [1024, 2048, 4096] + }, + "env_vars": "deepseek_mtp_dp_common", + "case_extra_args_by_pair": { + "8192x1": "--chunked-prefill-size 65536" + } + } + ] +} diff --git a/.github/benchmark/sglang_models_accuracy.json b/.github/benchmark/sglang_models_accuracy.json index bc22276e4..65b134894 100644 --- a/.github/benchmark/sglang_models_accuracy.json +++ b/.github/benchmark/sglang_models_accuracy.json @@ -2,8 +2,8 @@ { "model_name": "DeepSeek-R1-FP8 TP4", "model_path": "deepseek-ai/DeepSeek-R1-0528", - "extraArgs": "--tensor-parallel-size 4", - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", + "extraArgs": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": 
"SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.91, @@ -62,8 +62,8 @@ { "model_name": "DeepSeek-R1-FP8 TP8", "model_path": "deepseek-ai/DeepSeek-R1-0528", - "extraArgs": "--tensor-parallel-size 8", - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.93, @@ -73,62 +73,110 @@ }, { "model_name": "DeepSeek-R1-FP4 TP4", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extraArgs": "--tensor-parallel-size 4", - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extraArgs": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-4", "test_level": "nightly", "accuracy_threshold": 0.91, "accuracy_baseline": null, - 
"accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." }, { "model_name": "DeepSeek-R1-FP4 TP4 DP4 EP4", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extraArgs": "--trust-remote-code --tensor-parallel-size 4 --data-parallel-size 4 --expert-parallel-size 4 --enable-dp-attention --kv-cache-dtype fp8_e4m3 --attention-backend aiter --mem-fraction-static 0.8 --decode-log-interval 1000 --chunked-prefill-size 65536 --max-running-requests 24 --disable-radix-cache", - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extraArgs": "--trust-remote-code --tensor-parallel-size 4 --expert-parallel-size 4 --data-parallel-size 4 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.91, "accuracy_baseline": null, - "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." 
+ }, + { + "model_name": "DeepSeek-R1-FP4 TP4 DP8 EP8", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extraArgs": "--trust-remote-code --tensor-parallel-size 4 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + "test_level": "nightly", + "accuracy_threshold": 0.91, + "accuracy_baseline": null, + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." }, { "model_name": "DeepSeek-R1-FP4 TP8", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extraArgs": "--tensor-parallel-size 8", - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.93, "accuracy_baseline": null, - "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for 
gsm8k." }, { "model_name": "DeepSeek-R1-FP4 TP8 MTP3", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + "test_level": "nightly", + "accuracy_threshold": 0.93, + "accuracy_baseline": null, + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." 
+ }, + { + "model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8", "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extraArgs": "--tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.93, "accuracy_baseline": null, "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." + "_baseline_note": "Coverage for the legacy MTP-MoEFP4 artifact using the SGLang benchmark TP8 server configuration." 
}, { - "model_name": "DeepSeek-R1-FP4 TP8 MTP1", + "model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 DP8 EP8", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + "test_level": "nightly", + "accuracy_threshold": 0.91, + "accuracy_baseline": null, + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "_baseline_note": "Coverage for the legacy MTP-MoEFP4 artifact using TP8 with DP-attention and EP8." + }, + { + "model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 MTP3", "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extraArgs": "--tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 
0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", "runner": "linux-atom-mi35x-8", "test_level": "nightly", "accuracy_threshold": 0.93, "accuracy_baseline": null, "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "_baseline_note": "Coverage for the legacy MTP-MoEFP4 artifact using the SGLang benchmark TP8 MTP3 server configuration." 
+ }, + { + "model_name": "DeepSeek-R1-FP4 TP8 MTP1", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + "test_level": "nightly", + "accuracy_threshold": 0.93, + "accuracy_baseline": null, + "accuracy_baseline_model": "amd/DeepSeek-R1-0528-MXFP4-v2", "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k." 
} ] diff --git a/.github/scripts/atom_sglang_mesh_benchmark.sh b/.github/scripts/atom_sglang_mesh_benchmark.sh index cef924c50..617ebc9f0 100644 --- a/.github/scripts/atom_sglang_mesh_benchmark.sh +++ b/.github/scripts/atom_sglang_mesh_benchmark.sh @@ -137,6 +137,27 @@ if [[ -n "${SERVER_EXTRA_ARGS}" ]]; then shlex_split_to_array "${SERVER_EXTRA_ARGS}" server_args fi +declare -a filtered_server_args=() +for ((i = 0; i < ${#server_args[@]}; i++)); do + case "${server_args[$i]}" in + --chunked-prefill-size) + if (( i + 1 >= ${#server_args[@]} )); then + echo "ERROR: --chunked-prefill-size requires a value." + exit 2 + fi + prefill_size="${server_args[$((i + 1))]}" + i=$((i + 1)) + ;; + --chunked-prefill-size=*) + prefill_size="${server_args[$i]#--chunked-prefill-size=}" + ;; + *) + filtered_server_args+=("${server_args[$i]}") + ;; + esac +done +server_args=("${filtered_server_args[@]}") + if [[ "${SPEC_MODE}" == "mtp" ]]; then export SGLANG_ENABLE_SPEC_V2="${SGLANG_ENABLE_SPEC_V2:-1}" if [[ "${SERVER_EXTRA_ARGS}" != *"--speculative-algorithm"* ]]; then @@ -194,6 +215,13 @@ if [[ -n "${BENCH_EXTRA_ARGS}" ]]; then shlex_split_to_array "${BENCH_EXTRA_ARGS}" bench_args fi +bench_num_prompts="$(( CONC * 10 ))" +bench_num_warmups="$(( 2 * CONC ))" +if (( DP_SIZE > 1 && EP_SIZE > 1 )); then + bench_num_prompts="$(( CONC * 3 ))" + bench_num_warmups="${CONC}" +fi + set -x PYTHONDONTWRITEBYTECODE=1 python "${BENCH_SERVING_DIR}/benchmark_serving.py" \ --model="${resolved_model_path}" \ @@ -203,9 +231,9 @@ PYTHONDONTWRITEBYTECODE=1 python "${BENCH_SERVING_DIR}/benchmark_serving.py" \ --random-input-len="${ISL}" \ --random-output-len="${OSL}" \ --random-range-ratio "${RANDOM_RANGE_RATIO}" \ - --num-prompts="$(( CONC * 10 ))" \ + --num-prompts="${bench_num_prompts}" \ --max-concurrency="${CONC}" \ - --num-warmups="$(( 2 * CONC ))" \ + --num-warmups="${bench_num_warmups}" \ --request-rate=inf \ --ignore-eos \ --save-result \ diff --git 
a/.github/workflows/atom-sglang-accuracy-validation.yaml b/.github/workflows/atom-sglang-accuracy-validation.yaml index 5be814e0f..547276af9 100644 --- a/.github/workflows/atom-sglang-accuracy-validation.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation.yaml @@ -49,6 +49,11 @@ on: required: false type: boolean default: false + run_dsr1_fp4_tp4_dp8_ep8: + description: "DeepSeek-R1-FP4 TP4 DP8 EP8" + required: false + type: boolean + default: false run_dsr1_fp4_tp8: description: "DeepSeek-R1-FP4 TP8" required: false @@ -59,6 +64,21 @@ on: required: false type: boolean default: false + run_dsr1_fp4_mtp_moefp4_tp8: + description: "DeepSeek-R1-FP4-MTP-MoEFP4 TP8" + required: false + type: boolean + default: false + run_dsr1_fp4_mtp_moefp4_tp8_dp8_ep8: + description: "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 DP8 EP8" + required: false + type: boolean + default: false + run_dsr1_fp4_mtp_moefp4_tp8_mtp3: + description: "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 MTP3" + required: false + type: boolean + default: false run_dsr1_fp4_tp8_mtp1: description: "DeepSeek-R1-FP4 TP8 MTP1" required: false @@ -121,8 +141,12 @@ jobs: RUN_DSR1_FP8_TP8: ${{ inputs.run_dsr1_fp8_tp8 }} RUN_DSR1_FP4_TP4: ${{ inputs.run_dsr1_fp4_tp4 }} RUN_DSR1_FP4_TP4_DP4_EP4: ${{ inputs.run_dsr1_fp4_tp4_dp4_ep4 }} + RUN_DSR1_FP4_TP4_DP8_EP8: ${{ inputs.run_dsr1_fp4_tp4_dp8_ep8 }} RUN_DSR1_FP4_TP8: ${{ inputs.run_dsr1_fp4_tp8 }} RUN_DSR1_FP4_TP8_MTP3: ${{ inputs.run_dsr1_fp4_tp8_mtp3 }} + RUN_DSR1_FP4_MTP_MOEFP4_TP8: ${{ inputs.run_dsr1_fp4_mtp_moefp4_tp8 }} + RUN_DSR1_FP4_MTP_MOEFP4_TP8_DP8_EP8: ${{ inputs.run_dsr1_fp4_mtp_moefp4_tp8_dp8_ep8 }} + RUN_DSR1_FP4_MTP_MOEFP4_TP8_MTP3: ${{ inputs.run_dsr1_fp4_mtp_moefp4_tp8_mtp3 }} RUN_DSR1_FP4_TP8_MTP1: ${{ inputs.run_dsr1_fp4_tp8_mtp1 }} REBUILD_ATOM_BASE_FROM_DOCKERFILE: ${{ inputs.rebuild_atom_base_from_dockerfile }} run: | @@ -141,10 +165,10 @@ jobs: "toggle_env": "RUN_DSR1_FP8_TP4", "model_name": "DeepSeek-R1-FP8 TP4", "model_path": "deepseek-ai/DeepSeek-R1-0528", - 
"extra_args": "--tensor-parallel-size 4", + "extra_args": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", "accuracy_test_threshold": 0.91, - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-4", }, { "toggle_env": "RUN_QWEN35_35B_A3B_FP8_TP2", @@ -186,55 +210,91 @@ jobs: "toggle_env": "RUN_DSR1_FP8_TP8", "model_name": "DeepSeek-R1-FP8 TP8", "model_path": "deepseek-ai/DeepSeek-R1-0528", - "extra_args": "--tensor-parallel-size 8", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", "accuracy_test_threshold": 0.93, - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", }, { "toggle_env": "RUN_DSR1_FP4_TP4", "model_name": "DeepSeek-R1-FP4 TP4", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extra_args": "--tensor-parallel-size 4", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extra_args": "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 
--disable-radix-cache", "accuracy_test_threshold": 0.91, - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-4", }, { "toggle_env": "RUN_DSR1_FP4_TP4_DP4_EP4", "model_name": "DeepSeek-R1-FP4 TP4 DP4 EP4", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extra_args": "--trust-remote-code --tensor-parallel-size 4 --data-parallel-size 4 --expert-parallel-size 4 --enable-dp-attention --kv-cache-dtype fp8_e4m3 --attention-backend aiter --mem-fraction-static 0.8 --decode-log-interval 1000 --chunked-prefill-size 65536 --max-running-requests 24 --disable-radix-cache", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extra_args": "--trust-remote-code --tensor-parallel-size 4 --expert-parallel-size 4 --data-parallel-size 4 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", "accuracy_test_threshold": 0.91, - "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=128\nSGLANG_MORI_DISPATCH_DTYPE=bf16\nSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=16384", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": 
"SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + }, + { + "toggle_env": "RUN_DSR1_FP4_TP4_DP8_EP8", + "model_name": "DeepSeek-R1-FP4 TP4 DP8 EP8", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extra_args": "--trust-remote-code --tensor-parallel-size 4 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "accuracy_test_threshold": 0.91, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", }, { "toggle_env": "RUN_DSR1_FP4_TP8", "model_name": "DeepSeek-R1-FP4 TP8", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extra_args": "--tensor-parallel-size 8", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", "accuracy_test_threshold": 0.93, - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", }, { "toggle_env": "RUN_DSR1_FP4_TP8_MTP3", 
"model_name": "DeepSeek-R1-FP4 TP8 MTP3", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "accuracy_test_threshold": 0.93, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + }, + { + "toggle_env": "RUN_DSR1_FP4_MTP_MOEFP4_TP8", + "model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8", "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extra_args": "--tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 
123 124 125 126 127 128 160 192 224 256", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", "accuracy_test_threshold": 0.93, - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + }, + { + "toggle_env": "RUN_DSR1_FP4_MTP_MOEFP4_TP8_DP8_EP8", + "model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 DP8 EP8", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8 --data-parallel-size 8 --enable-dp-attention --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache", + "accuracy_test_threshold": 0.91, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nMORI_SHMEM_MODE=ISOLATION\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", + }, + { + "toggle_env": "RUN_DSR1_FP4_MTP_MOEFP4_TP8_MTP3", + "model_name": "DeepSeek-R1-FP4-MTP-MoEFP4 TP8 MTP3", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 3 
--speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "accuracy_test_threshold": 0.93, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", }, { "toggle_env": "RUN_DSR1_FP4_TP8_MTP1", "model_name": "DeepSeek-R1-FP4 TP8 MTP1", - "model_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4", - "extra_args": "--tensor-parallel-size 8 --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", + "model_path": "amd/DeepSeek-R1-0528-MXFP4-v2", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache --speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-num-steps 1 
--speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --max-running-requests 256 --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256", "accuracy_test_threshold": 0.93, - "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_ENABLE_SPEC_V2=1", - "runner": "atom-mi355-8gpu-conductor-sgl-runner", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nSGLANG_ENABLE_SPEC_V2=1\nSGLANG_ENABLE_TORCH_COMPILE=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "linux-atom-mi35x-8", }, ] diff --git a/.github/workflows/atom-sglang-benchmark.yaml b/.github/workflows/atom-sglang-benchmark.yaml index 61ad32cde..922e299c1 100644 --- a/.github/workflows/atom-sglang-benchmark.yaml +++ b/.github/workflows/atom-sglang-benchmark.yaml @@ -6,11 +6,11 @@ concurrency: on: schedule: - # Nightly at 23:00 Beijing time (15:00 UTC). - - cron: '0 15 * * 1,3' # Mon/Wed: SGLang-OOB DeepSeek group A, 6 models x 10 params = 60 cases - - cron: '0 15 * * 2,4' # Tue/Thu: SGLang-OOB DeepSeek MTP group B, 2 models x 10 params = 20 cases - - cron: '0 15 * * 5' # Fri: SGLang-OOB all, 10 models x 10 params = 100 cases - - cron: '0 15 * * 6' # Sat: SGLang-Mesh all, 8 configs = 62 cases + # Nightly at 22:00 Beijing time (14:00 UTC). 
+ - cron: '0 14 * * 1,3' # Mon/Wed: SGLang-OOB non-MTP DeepSeek configs + - cron: '0 14 * * 2,4' # Tue/Thu: SGLang-OOB MTP + Qwen configs + - cron: '0 14 * * 5' # Fri: SGLang-Mesh all configs + - cron: '0 14 * * 6' # Sat: SGLang-OOB all configs workflow_dispatch: inputs: benchmark_suite: @@ -31,30 +31,40 @@ on: - "deepseek-r1-fp4-tp4 (1024x1024/8192x1024: [4,8,16,32,64])" - "deepseek-r1-fp4-tp4-dp4-ep4 (1024x1024/8192x1024: [4,8,16,32,64])" - "deepseek-r1-fp4-tp8-ep8 (1024x1024/8192x1024: [4,8,16,32,64])" + - "deepseek-r1-fp4-tp8-dp8-ep8 (1024x1024/8192x1024: [4,8,16,32,64])" - "deepseek-r1-fp4-tp8-mtp3 (1024x1024/8192x1024: [4,8,16,32,64])" - "deepseek-r1-fp4-tp8-mtp1 (1024x1024/8192x1024: [4,8,16,32,64])" + - "deepseek-r1-fp4-mtp3-tp4-dp4-ep4 (1024x1024/8192x1024: [4,8,16,32,64])" + - "deepseek-r1-fp4-mtp3-tp8-dp8-ep8 (1024x1024/8192x1024: [4,8,16,32,64])" - "qwen3-5-397b-a17b-fp8-tp4 (1024x1024/8192x1024: [4,8,16,32,64])" - "qwen3-5-397b-a17b-fp8-tp8 (1024x1024/8192x1024: [4,8,16,32,64])" - - "all-deepseek (8 DeepSeek configs x 10 default params)" + - "all-deepseek (11 DeepSeek configs x 10 default params)" + - "all-deepseek-non-mtp (7 DeepSeek non-MTP configs x 10 default params)" + - "all-deepseek-mtp (4 DeepSeek MTP configs x 10 default params)" - "all-qwen (2 Qwen configs x 10 default params)" - - "all-oob (all SGLang-OOB configs x 10 default params)" + - "all-oob (13 SGLang-OOB configs x 10 default params)" default: "none (do not run SGLang-OOB models)" mesh_config_preset: description: "SGLang-Mesh config subset (ignored for SGLang-OOB)" type: choice options: - - "ds-fp4-all (all Mesh configs: 62 cases)" - - "ds-fp4-tp8 (8192x1/1x1024: [2,4,8,16,32,64,128,256])" - - "ds-fp4-tp8-mtp (8192x1/1x1024: [2,4,8,16,32,64,128,256])" - - "ds-fp4-tp4 (8192x1/1x1024: [64,128,256])" - - "ds-fp4-tp4-mtp (8192x1/1x1024: [64,128,256])" - - "ds-fp4-dpa4-ep4 (8192x1: [256,512,1024])" - - "ds-fp4-dpa4-ep4-mtp (8192x1: [256,512,1024])" - - "ds-fp4-dpa8-ep8 (8192x1: 
[512,1024,2048]; 1x1024: [1024,2048,4096])" - - "ds-fp4-dpa8-ep8-mtp (8192x1: [512,1024,2048]; 1x1024: [1024,2048,4096])" - - "ds-fp4-non-mtp (tp8 + tp4 + dpa4-ep4 + dpa8-ep8)" - - "ds-fp4-mtp (tp8-mtp + tp4-mtp + dpa4-ep4-mtp + dpa8-ep8-mtp)" - default: "ds-fp4-all (all Mesh configs: 62 cases)" + - "ds-all (all DeepSeek Mesh configs: 71 cases)" + - "ds-fp4-all (all FP4 Mesh configs: 62 cases)" + - "ds-fp4-tp8 (FP4 8192x1/1x1024: [2,4,8,16,32,64,128,256])" + - "ds-fp4-tp8-mtp (FP4 8192x1/1x1024: [2,4,8,16,32,64,128,256])" + - "ds-fp4-tp4 (FP4 8192x1/1x1024: [64,128,256])" + - "ds-fp4-tp4-mtp (FP4 8192x1/1x1024: [64,128,256])" + - "ds-fp4-dpa4-ep4 (FP4 8192x1: [256,512,1024])" + - "ds-fp4-dpa4-ep4-mtp (FP4 8192x1: [256,512,1024])" + - "ds-fp4-dpa8-ep8 (FP4 8192x1: [512,1024,2048]; 1x1024: [1024,2048,4096])" + - "ds-fp4-dpa8-ep8-mtp (FP4 8192x1: [512,1024,2048]; 1x1024: [1024,2048,4096])" + - "ds-fp4-non-mtp (FP4 tp8 + tp4 + dpa4-ep4 + dpa8-ep8)" + - "ds-fp4-mtp (FP4 tp8-mtp + tp4-mtp + dpa4-ep4-mtp + dpa8-ep8-mtp)" + - "ds-fp8-all (all FP8 Mesh configs: 9 cases)" + - "ds-fp8-dpa4-ep4 (FP8 8192x1: [256,512,1024])" + - "ds-fp8-dpa8-ep8 (FP8 8192x1: [512,1024,2048]; 1x1024: [1024,2048,4096])" + - "ds-fp8-non-mtp (FP8 dpa4-ep4 + dpa8-ep8)" + default: "ds-all (all DeepSeek Mesh configs: 71 cases)" mesh_server_mode: description: "SGLang-Mesh server launch mode. Use sglang-mori only with lmsysorg/sglang-rocm images." 
type: choice @@ -149,7 +159,8 @@ jobs: WORKLOAD_LABEL="${INPUT_WORKLOAD_LABEL:-SGLang-OOB}" if [[ "${GITHUB_EVENT_NAME}" == "schedule" ]]; then case "${SCHEDULE_CRON}" in - "0 15 * * 6") WORKLOAD_LABEL="SGLang-Mesh" ;; + "0 14 * * 5") WORKLOAD_LABEL="SGLang-Mesh" ;; + "0 14 * * 1,3"|"0 14 * * 2,4"|"0 14 * * 6") WORKLOAD_LABEL="SGLang-OOB" ;; *) WORKLOAD_LABEL="SGLang-OOB" ;; esac fi @@ -373,11 +384,38 @@ jobs: import sys from pathlib import Path - models = json.loads( - Path(".github/benchmark/sglang_benchmark_models.json").read_text( - encoding="utf-8" - ) - ) + def expand_template_parts(value, templates, separator): + if isinstance(value, str): + return templates.get(value, value) + if isinstance(value, list): + return separator.join( + templates.get(str(part), str(part)) + for part in value + if str(part) + ) + return value + + def load_model_catalog(path): + catalog = json.loads(path.read_text(encoding="utf-8")) + if isinstance(catalog, list): + return catalog + + templates = catalog.get("templates", {}) + extra_arg_templates = templates.get("extra_args", {}) + env_var_templates = templates.get("env_vars", {}) + models = [] + for model in catalog.get("models", []): + expanded = dict(model) + expanded["extra_args"] = expand_template_parts( + expanded.get("extra_args", ""), extra_arg_templates, " " + ) + expanded["env_vars"] = expand_template_parts( + expanded.get("env_vars", ""), env_var_templates, "\n" + ) + models.append(expanded) + return models + + models = load_model_catalog(Path(".github/benchmark/sglang_benchmark_models.json")) event = os.environ["GITHUB_EVENT_NAME"] workload_label = os.environ.get("WORKLOAD_LABEL") or "SGLang-OOB" @@ -395,8 +433,10 @@ jobs: def normalize_mesh_preset(value): preset = str(value or "all").split(" (", 1)[0].split(" / ", 1)[0] - if preset.startswith("ds-fp4-"): - return preset.removeprefix("ds-fp4-") + if preset == "ds-all": + return "all" + if preset.startswith("ds-fp4-") or preset.startswith("ds-fp8-"): + return 
preset.removeprefix("ds-") return preset def oob_selected_by_preset(model, preset): @@ -407,34 +447,42 @@ jobs: return True if preset == "all-deepseek": return prefix.startswith("deepseek-") + if preset == "all-deepseek-non-mtp": + return prefix.startswith("deepseek-") and "mtp" not in prefix + if preset == "all-deepseek-mtp": + return prefix.startswith("deepseek-") and "mtp" in prefix if preset == "all-qwen": return prefix.startswith("qwen") return prefix == preset + def is_oob_mtp(model): + return matches_workload(model) and "mtp" in str(model.get("prefix", "")) + + def is_oob_qwen(model): + return matches_workload(model) and str(model.get("prefix", "")).startswith("qwen") + + def is_oob_non_mtp_deepseek(model): + prefix = str(model.get("prefix", "")) + return matches_workload(model) and prefix.startswith("deepseek-") and "mtp" not in prefix + if event == "schedule": schedule_cron = os.environ.get("SCHEDULE_CRON", "") - if schedule_cron == "0 15 * * 1,3": - selected_group = "A-DEEPSEEK" - selected = [m for m in models if m.get("nightly_group", "A") == "A" and matches_workload(m)] - elif schedule_cron == "0 15 * * 2,4": - selected_group = "B-DEEPSEEK-MTP" - selected = [ - m - for m in models - if m.get("nightly_group") == "B" - and matches_workload(m) - and str(m.get("prefix", "")).startswith("deepseek-") - ] - elif schedule_cron == "0 15 * * 5": - selected_group = "C-ALL" - selected = [m for m in models if matches_workload(m)] - elif schedule_cron == "0 15 * * 6": - selected_group = "WEEKEND-MESH" + if schedule_cron == "0 14 * * 1,3": + selected_group = "OOB-NON-MTP-DEEPSEEK" + selected = [m for m in models if is_oob_non_mtp_deepseek(m)] + elif schedule_cron == "0 14 * * 2,4": + selected_group = "OOB-MTP-QWEN" + selected = [m for m in models if is_oob_mtp(m) or is_oob_qwen(m)] + elif schedule_cron == "0 14 * * 5": + selected_group = "MESH-ALL" selected = [ m for m in models if matches_workload(m) and has_mesh_preset(m, "all") ] + elif schedule_cron == "0 14 * * 
6": + selected_group = "OOB-ALL" + selected = [m for m in models if matches_workload(m)] else: selected_group = "SKIP-UNKNOWN-DAY" selected = [] @@ -695,6 +743,8 @@ jobs: MESH_TP_SIZE: ${{ matrix.model.tp_size || '' }} MESH_DP_SIZE: ${{ matrix.model.dp_size || '' }} MESH_EP_SIZE: ${{ matrix.model.ep_size || '' }} + CASE_EXTRA_ARGS_BY_PAIR: ${{ toJson(matrix.model.case_extra_args_by_pair) }} + CASE_ENV_VARS_BY_PAIR: ${{ toJson(matrix.model.case_env_vars_by_pair) }} RESULT_PREFIX: ${{ matrix.model.prefix }} ISL: ${{ matrix.params.input_length }} OSL: ${{ matrix.params.output_length }} @@ -835,6 +885,26 @@ jobs: echo "Using model cache backend: ${MODEL_CACHE_DESC}" printf '%s\n' "${{ matrix.model.env_vars }}" | sed 's/^[[:space:]]*//' > /tmp/oot_env_file.txt + CASE_KEY="${ISL}x${OSL}" + CASE_ENV_VARS="$( + CASE_ENV_VARS_BY_PAIR="${CASE_ENV_VARS_BY_PAIR:-null}" CASE_KEY="${CASE_KEY}" python3 - <<'PY' + import json + import os + + raw = os.environ.get("CASE_ENV_VARS_BY_PAIR") or "null" + try: + mapping = json.loads(raw) + except json.JSONDecodeError: + mapping = None + if isinstance(mapping, dict): + value = mapping.get(os.environ["CASE_KEY"], "") + if value: + print(value) + PY + )" + if [[ -n "${CASE_ENV_VARS}" ]]; then + printf '%s\n' "${CASE_ENV_VARS}" >> /tmp/oot_env_file.txt + fi $CONTAINER_ENGINE run -dt --device=/dev/kfd $DEVICE_FLAG \ -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \ @@ -921,6 +991,32 @@ jobs: echo "=== Benchmark config: ${MODEL_NAME} ISL=${ISL} OSL=${OSL} CONC=${CONC} RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} ===" EFFECTIVE_BENCHMARK_RUNNER="sglang_atom" EFFECTIVE_SGLANG_EXTRA_ARGS="${SGLANG_EXTRA_ARGS}" + CASE_KEY="${ISL}x${OSL}" + CASE_EXTRA_ARGS="$( + CASE_EXTRA_ARGS_BY_PAIR="${CASE_EXTRA_ARGS_BY_PAIR:-null}" CASE_KEY="${CASE_KEY}" python3 - <<'PY' + import json + import os + + raw = os.environ.get("CASE_EXTRA_ARGS_BY_PAIR") or "null" + try: + mapping = json.loads(raw) + except json.JSONDecodeError: + mapping = None + if isinstance(mapping, dict): 
+ value = mapping.get(os.environ["CASE_KEY"], "") + if value: + print(value) + PY + )" + if [[ -n "${CASE_EXTRA_ARGS}" ]]; then + EFFECTIVE_SGLANG_EXTRA_ARGS="${EFFECTIVE_SGLANG_EXTRA_ARGS} ${CASE_EXTRA_ARGS}" + fi + BENCH_NUM_PROMPTS="$(( CONC * 10 ))" + BENCH_NUM_WARMUPS="$(( 2 * CONC ))" + if [[ "${WORKLOAD_LABEL}" == "SGLang-Mesh" && "${MESH_DP_SIZE:-1}" -gt 1 && "${MESH_EP_SIZE:-1}" -gt 1 ]]; then + BENCH_NUM_PROMPTS="$(( CONC * 3 ))" + BENCH_NUM_WARMUPS="${CONC}" + fi if [[ "${WORKLOAD_LABEL}" == "SGLang-Mesh" && "${MESH_SERVER_MODE}" == "sglang-mori" ]]; then case "${SGLANG_IMAGE_TAG}" in lmsysorg/sglang-rocm*|docker.io/lmsysorg/sglang-rocm*) ;; @@ -944,7 +1040,7 @@ jobs: -e RESULT_FILENAME="${RESULT_FILENAME}" \ -e RESULT_DIR="${CONTAINER_RESULT_DIR}" \ -e BENCH_SERVING_DIR="${CONTAINER_BENCH_SERVING_DIR}" \ - -e SERVER_EXTRA_ARGS="${SGLANG_EXTRA_ARGS}" \ + -e SERVER_EXTRA_ARGS="${EFFECTIVE_SGLANG_EXTRA_ARGS}" \ -e BENCH_EXTRA_ARGS="${BENCH_EXTRA_ARGS}" \ -e SPEC_MODE="${MESH_SPEC_MODE}" \ -e MAX_WAIT_RETRIES="${MAX_WAIT_RETRIES}" \ @@ -955,7 +1051,7 @@ jobs: " else if [[ "${WORKLOAD_LABEL}" == "SGLang-Mesh" && -n "${MESH_TP_SIZE}" ]]; then - EFFECTIVE_SGLANG_EXTRA_ARGS="--tensor-parallel-size ${MESH_TP_SIZE} ${SGLANG_EXTRA_ARGS}" + EFFECTIVE_SGLANG_EXTRA_ARGS="--tensor-parallel-size ${MESH_TP_SIZE} ${EFFECTIVE_SGLANG_EXTRA_ARGS}" fi $CONTAINER_ENGINE exec -d \ -e SGLANG_MODEL_NAME="${MODEL_NAME}" \ @@ -1053,10 +1149,10 @@ jobs: --random-input-len=\"${ISL}\" \ --random-output-len=\"${OSL}\" \ --random-range-ratio \"${RANDOM_RANGE_RATIO}\" \ - --num-prompts=\"$(( CONC * 10 ))\" \ + --num-prompts=\"${BENCH_NUM_PROMPTS}\" \ --max-concurrency=\"${CONC}\" \ ${TRUST_REMOTE_CODE_ARG} \ - --num-warmups=\"$(( 2 * CONC ))\" \ + --num-warmups=\"${BENCH_NUM_WARMUPS}\" \ --request-rate=inf \ --ignore-eos \ --save-result \ @@ -1341,11 +1437,38 @@ jobs: sys.path.insert(0, str(llm_booster_dashboard)) from update_data import index_root_for, rebuild_index # noqa: E402 - 
catalog = json.loads( - (root / ".github" / "benchmark" / "sglang_benchmark_models.json").read_text( - encoding="utf-8" - ) - ) + def expand_template_parts(value, templates, separator): + if isinstance(value, str): + return templates.get(value, value) + if isinstance(value, list): + return separator.join( + templates.get(str(part), str(part)) + for part in value + if str(part) + ) + return value + + def load_model_catalog(path): + catalog = json.loads(path.read_text(encoding="utf-8")) + if isinstance(catalog, list): + return catalog + + templates = catalog.get("templates", {}) + extra_arg_templates = templates.get("extra_args", {}) + env_var_templates = templates.get("env_vars", {}) + models = [] + for model in catalog.get("models", []): + expanded = dict(model) + expanded["extra_args"] = expand_template_parts( + expanded.get("extra_args", ""), extra_arg_templates, " " + ) + expanded["env_vars"] = expand_template_parts( + expanded.get("env_vars", ""), env_var_templates, "\n" + ) + models.append(expanded) + return models + + catalog = load_model_catalog(root / ".github" / "benchmark" / "sglang_benchmark_models.json") def slugify(text: str) -> str: return re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-") or "unknown" diff --git a/.github/workflows/atom-sglang-test.yaml b/.github/workflows/atom-sglang-test.yaml index ebd6b3922..d18452685 100644 --- a/.github/workflows/atom-sglang-test.yaml +++ b/.github/workflows/atom-sglang-test.yaml @@ -235,20 +235,28 @@ jobs: include: - model_name: "DeepSeek-R1-FP8 TP4" model_path: "deepseek-ai/DeepSeek-R1-0528" - extra_args: "--tensor-parallel-size 4" + extra_args: "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache" env_vars: | + SGLANG_DEFAULT_SERVER_ARGS= SGLANG_AITER_FP8_PREFILL_ATTN=0 SGLANG_USE_AITER=1 ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 + SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models + 
SGLANG_ENABLE_TORCH_COMPILE=1 + TORCHINDUCTOR_COMPILE_THREADS=128 accuracy_test_threshold: 0.91 runner: linux-atom-mi35x-4 - model_name: "DeepSeek-R1-FP4 TP4" - model_path: "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4" - extra_args: "--tensor-parallel-size 4" + model_path: "amd/DeepSeek-R1-0528-MXFP4-v2" + extra_args: "--trust-remote-code --tensor-parallel-size 4 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.85 --page-size 1 --disable-radix-cache" env_vars: | + SGLANG_DEFAULT_SERVER_ARGS= SGLANG_AITER_FP8_PREFILL_ATTN=0 SGLANG_USE_AITER=1 ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 + SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models + SGLANG_ENABLE_TORCH_COMPILE=1 + TORCHINDUCTOR_COMPILE_THREADS=128 accuracy_test_threshold: 0.91 runner: linux-atom-mi35x-4 - model_name: "Qwen3.5-35B-A3B-FP8 TP2" diff --git a/recipes/atom_sglang/DeepSeek-R1.md b/recipes/atom_sglang/DeepSeek-R1.md index ba52c47ca..206faf507 100644 --- a/recipes/atom_sglang/DeepSeek-R1.md +++ b/recipes/atom_sglang/DeepSeek-R1.md @@ -1,6 +1,6 @@ # DeepSeek-R1 with ATOM SGLang Backend -This recipe shows how to run `deepseek-ai/DeepSeek-R1-0528` or `amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4` with the SGLang-ATOM backend. For background on the SGLang-ATOM integration, see [Introduce ATOM as external model package of SGLang](https://github.com/ROCm/ATOM/issues/359). +This recipe shows how to run `deepseek-ai/DeepSeek-R1-0528` or `amd/DeepSeek-R1-0528-MXFP4-v2` with the SGLang-ATOM backend. For background on the SGLang-ATOM integration, see [Introduce ATOM as external model package of SGLang](https://github.com/ROCm/ATOM/issues/359). ## Step 1: Pull the SGLang-ATOM Docker @@ -14,7 +14,9 @@ Launch a container from this image and run the remaining commands inside the con The SGLang-ATOM backend keeps the standard SGLang CLI, server APIs, and general usage flow compatible with upstream SGLang. 
For general server options and API usage, users can refer to the [official SGLang documentation](https://docs.sglang.ai/). -Before launching the server, export the same SGLang-ATOM settings used by the benchmark workflow: +### DeepSeek with FP8 (TP=8) + +Users can use this command to launch the FP8 server with the same settings as the SGLang benchmark workflow. ```bash export AITER_QUICK_REDUCE_QUANTIZATION=INT4 @@ -24,13 +26,7 @@ export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 # Introduce ATOM as external model package of SGLang export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models export SGLANG_ENABLE_TORCH_COMPILE=1 -``` - -### DeepSeek with FP8 (TP=8) - -Users can use this command to launch the FP8 server with the same settings as the SGLang benchmark workflow. -```bash TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ --model-path deepseek-ai/DeepSeek-R1-0528 \ @@ -48,6 +44,13 @@ python3 -m sglang.launch_server \ ### DeepSeek with FP8 (TP=4) ```bash +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 +# Introduce ATOM as external model package of SGLang +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models +export SGLANG_ENABLE_TORCH_COMPILE=1 TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ --model-path deepseek-ai/DeepSeek-R1-0528 \ @@ -64,12 +67,19 @@ python3 -m sglang.launch_server \ ### DeepSeek with MXFP4 (TP=8) -AMD Instinct MI355X GPU supports MXFP4 computation instructions, and users can use the following command to launch the MXFP4 server on MI355X. For MXFP4 model weight, we suggest using the checkpoint quantized from AMD Quark. +AMD Instinct MI355X GPU supports MXFP4 computation instructions, and users can use the following command to launch the MXFP4 server on MI355X. For MXFP4 model weights, we suggest using the checkpoint quantized from AMD Quark. 
```bash +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 +# Introduce ATOM as external model package of SGLang +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models +export SGLANG_ENABLE_TORCH_COMPILE=1 TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ - --model-path amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4 \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ --host localhost \ --port 8000 \ --trust-remote-code \ @@ -84,9 +94,16 @@ python3 -m sglang.launch_server \ ### DeepSeek with MXFP4 (TP=4) ```bash +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 +# Introduce ATOM as external model package of SGLang +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models +export SGLANG_ENABLE_TORCH_COMPILE=1 TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ - --model-path amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4 \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ --host localhost \ --port 8000 \ --trust-remote-code \ @@ -101,11 +118,17 @@ python3 -m sglang.launch_server \ ### DeepSeek with MXFP4 (TP=4, DP=4, EP=4) ```bash +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export SGLANG_ENABLE_TORCH_COMPILE=1 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 export MORI_SHMEM_MODE=ISOLATION +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ - --model-path amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4 \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ --host localhost \ --port 8000 \ --trust-remote-code \ @@ -119,30 +142,58 @@ python3 -m sglang.launch_server \ --page-size 1 \ --disable-radix-cache ``` + +### DeepSeek with MXFP4 (TP=8, DP=8, EP=8) + +```bash +export 
AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export SGLANG_ENABLE_TORCH_COMPILE=1 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 +export MORI_SHMEM_MODE=ISOLATION +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models + +TORCHINDUCTOR_COMPILE_THREADS=128 \ +python3 -m sglang.launch_server \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ + --host localhost \ + --port 8000 \ + --trust-remote-code \ + --tensor-parallel-size 8 \ + --expert-parallel-size 8 \ + --data-parallel-size 8 \ + --enable-dp-attention \ + --attention-backend aiter \ + --kv-cache-dtype fp8_e4m3 \ + --mem-fraction-static 0.8 \ + --page-size 1 \ + --disable-radix-cache +``` In Addition, To align with mori prefill, you need to add --chunked-prefill-size 65536. This is also included in mori's startup command. -### DeepSeek with MXFP4 + MTP (TP=8) +### DeepSeek with MXFP4 + MTP (TP=8, MTP=3) -The `amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4` checkpoint includes MTP weights. To enable MTP decoding, launch SGLang with the `NEXTN` speculative decoding options. The example below follows `launch_deepseek_mtp_fp4.sh` and uses one draft step by default. +Use the `amd/DeepSeek-R1-0528-MXFP4-v2` checkpoint together with `SGLang/DeepSeek-R1-NextN`, which includes the MTP weights. To enable MTP decoding, launch SGLang with the `NEXTN` speculative decoding options. The example below uses three draft steps by default. 
```bash export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export ATOM_ENABLE_DS_INPUT_RMSNORM_QUANT_FUSION=0 export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 export SGLANG_AITER_FP8_PREFILL_ATTN=0 export SGLANG_USE_AITER=1 +export SGLANG_ENABLE_SPEC_V2=1 +export SGLANG_ENABLE_TORCH_COMPILE=1 export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -export SGLANG_ENABLE_SPEC_V2=1 TP_SIZE=8 -MTP=${MTP:-1} +MTP=${MTP:-3} SPECULATIVE_NUM_DRAFT_TOKENS=${SPECULATIVE_NUM_DRAFT_TOKENS:-$((MTP + 1))} TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ - --model-path amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4 \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ --host localhost \ --port 8000 \ --trust-remote-code \ @@ -152,6 +203,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 \ --page-size 1 \ --disable-radix-cache \ + --speculative-draft-model-path SGLang/DeepSeek-R1-NextN \ --speculative-algorithm NEXTN \ --speculative-num-steps "${MTP}" \ --speculative-eagle-topk 1 \ @@ -162,19 +214,26 @@ python3 -m sglang.launch_server \ For a 4-GPU run, set `CUDA_VISIBLE_DEVICES` to the target devices and change `TP_SIZE=4`. 
-### DeepSeek with MXFP4 + MTP (TP=4, DP=4, EP=4) +### DeepSeek with MXFP4 + MTP (MTP=3, TP=4, DP=4, EP=4) ```bash +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export SGLANG_ENABLE_SPEC_V2=1 export MORI_SHMEM_MODE=ISOLATION -export ATOM_DUAL_STREAM_MOE_TOKEN_THRESHOLD=0 +export SGLANG_ENABLE_TORCH_COMPILE=1 + +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models export CUDA_VISIBLE_DEVICES="0,1,2,3" -MTP=${MTP:-1} +MTP=${MTP:-3} SPECULATIVE_NUM_DRAFT_TOKENS=${SPECULATIVE_NUM_DRAFT_TOKENS:-$((MTP + 1))} TORCHINDUCTOR_COMPILE_THREADS=128 \ python3 -m sglang.launch_server \ - --model-path amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4 \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ --host localhost \ --port 8000 \ --trust-remote-code \ @@ -187,14 +246,56 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 \ --page-size 1 \ --disable-radix-cache \ + --speculative-draft-model-path SGLang/DeepSeek-R1-NextN \ --speculative-algorithm NEXTN \ --speculative-num-steps "${MTP}" \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens "${SPECULATIVE_NUM_DRAFT_TOKENS}" \ - --max-running-requests 256 \ + --max-running-requests 4096 \ --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256 ``` -In Addition, To align with mori prefill, you need to add --chunked-prefill-size 65536. This is also included in mori's startup command. 
+ +### DeepSeek with MXFP4 + MTP (MTP=3, TP=8, DP=8, EP=8) + +```bash +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export ATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1 +export SGLANG_AITER_FP8_PREFILL_ATTN=0 +export SGLANG_USE_AITER=1 +export SGLANG_ENABLE_SPEC_V2=1 +export MORI_SHMEM_MODE=ISOLATION +export SGLANG_ENABLE_TORCH_COMPILE=1 + +export SGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models + +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +MTP=${MTP:-3} +SPECULATIVE_NUM_DRAFT_TOKENS=${SPECULATIVE_NUM_DRAFT_TOKENS:-$((MTP + 1))} + +TORCHINDUCTOR_COMPILE_THREADS=128 \ +python3 -m sglang.launch_server \ + --model-path amd/DeepSeek-R1-0528-MXFP4-v2 \ + --host localhost \ + --port 8000 \ + --trust-remote-code \ + --tensor-parallel-size 8 \ + --expert-parallel-size 8 \ + --data-parallel-size 8 \ + --enable-dp-attention \ + --attention-backend aiter \ + --kv-cache-dtype fp8_e4m3 \ + --mem-fraction-static 0.8 \ + --page-size 1 \ + --disable-radix-cache \ + --speculative-draft-model-path SGLang/DeepSeek-R1-NextN \ + --speculative-algorithm NEXTN \ + --speculative-num-steps "${MTP}" \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens "${SPECULATIVE_NUM_DRAFT_TOKENS}" \ + --max-running-requests 4096 \ + --cuda-graph-bs 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 160 192 224 256 +``` +In addition, to align with MORI prefill, add `--chunked-prefill-size 65536` to the launch command; MORI's own startup command includes this flag as well. 
@@ -213,7 +314,7 @@ RESULT_DIR=./benchmark-results RESULT_FILENAME=deepseek-r1-fp4-tp4-${ISL}-${OSL}-${CONC}-${RANDOM_RANGE_RATIO}.json python3 /tmp/bench_serving/benchmark_serving.py \ - --model=amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4 \ + --model=amd/DeepSeek-R1-0528-MXFP4-v2 \ --backend=sglang \ --base-url=http://127.0.0.1:8000 \ --dataset-name=random \ @@ -235,7 +336,7 @@ python3 /tmp/bench_serving/benchmark_serving.py \ For FP8 or TP8 cases, keep the same benchmark command and replace `--model` with the checkpoint used in Step 2. ### Optional: Enable Profiling -If you want to collect profiling trace, set the SGLang profiling environment variables before launching the server, and add `--profile` to the benchmark client command. +If you want to collect profiling traces, set the SGLang profiling environment variables before launching the server, and add `--profile` to the benchmark client command. ```bash export SGLANG_PROFILE_RECORD_SHAPES=1 @@ -249,7 +350,7 @@ Then append `--profile` to the `benchmark_serving.py` command in Step 3. ```bash lm_eval --model local-completions \ - --model_args model=amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4,base_url=http://localhost:8000/v1/completions,num_concurrent=65,max_retries=1,tokenized_requests=False,trust_remote_code=True \ + --model_args model=amd/DeepSeek-R1-0528-MXFP4-v2,base_url=http://localhost:8000/v1/completions,num_concurrent=65,max_retries=1,tokenized_requests=False,trust_remote_code=True \ --tasks gsm8k \ --num_fewshot 3 ```