diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d3ef03291..14eec1583 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -794,10 +794,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - model: amd/DeepSeek-R1-0528-MXFP4 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 + model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg precision: fp4 @@ -1003,9 +1002,10 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - model: amd/DeepSeek-R1-0528-MXFP4 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 + model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg precision: fp4 diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..0aa2d0c20 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -34,7 +34,6 @@ export IBDEVICES export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -set +x export NCCL_IB_HCA=$IBDEVICES @@ -64,6 +63,8 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + +#TODO(billishyahao): The following IO env will be deprecated soon. export MORI_IO_QP_MAX_SEND_WR=16384 export MORI_IO_QP_MAX_CQE=32768 export MORI_IO_QP_MAX_SGE=4 @@ -89,17 +90,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p { if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then TC=$(( 4 * ND_DSCP )) export MORI_RDMA_SL=$ND_PRIO + export MORI_IO_SL=$ND_PRIO export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + export MORI_IO_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" else echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." # Fall back to hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." @@ -110,9 +115,11 @@ else NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." @@ -124,3 +131,4 @@ fi export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} +set +x diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 2bbdd91d6..eed59bdab 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -222,3 +222,34 @@ DeepSeek-R1-0528-MXFP4: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +DeepSeek-R1-0528-MXFP4-v2: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 16384 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7f174b760..7340ef51c 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -187,18 +187,8 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3dbc5eccc..1cd22211a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1213,3 +1213,13 @@ - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization" - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Bump SGL mori image to March 27" + - "Add more low latency sweep configs" + - "Enable v2 mxfp4 DSR1 0528 model" + - "Enable fp4 disp feature on mori" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983