Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp:
- { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }

dsr1-fp8-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hi @billishyahao

In early March, you said that, after consulting with @HaiShaw and others in the org, you would be using upstream images by the end of March. Can you please update this to use upstream nightly images instead of second-class forks?

Let's ensure that we work towards AMD being a first-class platform on SGLang instead of continuing to submit second-class forks.

model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi355x-disagg
Expand Down Expand Up @@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg:


dsr1-fp8-mi355x-sglang-disagg-mtp:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi355x-disagg
Expand Down Expand Up @@ -794,10 +794,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"


dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
precision: fp4
Expand Down Expand Up @@ -1003,9 +1002,10 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


dsr1-fp4-mi355x-sglang-disagg-mtp:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
precision: fp4
Expand Down
12 changes: 10 additions & 2 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ export IBDEVICES
export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)

set +x

export NCCL_IB_HCA=$IBDEVICES

Expand Down Expand Up @@ -64,6 +63,8 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160
export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))

export MORI_EP_LAUNCH_CONFIG_MODE=AUTO

# TODO(billishyahao): The following MORI_IO_* environment variables will be deprecated soon.
export MORI_IO_QP_MAX_SEND_WR=16384
export MORI_IO_QP_MAX_CQE=32768
export MORI_IO_QP_MAX_SGE=4
Expand All @@ -89,17 +90,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
TC=$(( 4 * ND_DSCP ))
export MORI_RDMA_SL=$ND_PRIO
export MORI_IO_SL=$ND_PRIO
export MORI_RDMA_TC=$TC
echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
export MORI_IO_TC=$TC
echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
else
echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
# Fall back to hostname-based detection
NODENAME=$(hostname -s)
if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
export MORI_RDMA_TC=96
export MORI_IO_TC=96
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
export MORI_IO_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
Expand All @@ -110,9 +115,11 @@ else
NODENAME=$(hostname -s)
if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
export MORI_RDMA_TC=96
export MORI_IO_TC=96
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
export MORI_IO_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
Expand All @@ -124,3 +131,4 @@ fi
export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}


set +x
43 changes: 37 additions & 6 deletions benchmarks/multi_node/amd_utils/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
# cuda_graph_bs_range: str

DeepSeek-V3:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
Expand Down Expand Up @@ -69,7 +69,7 @@ DeepSeek-V3:
cuda_graph_bs_range: "1-128"

DeepSeek-V3-0324:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
Expand Down Expand Up @@ -100,7 +100,7 @@ DeepSeek-V3-0324:
cuda_graph_bs_range: "1-128"

DeepSeek-R1:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
Expand Down Expand Up @@ -131,7 +131,7 @@ DeepSeek-R1:
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
Expand Down Expand Up @@ -162,7 +162,7 @@ DeepSeek-R1-0528:
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528-MXFP4-Preview:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
Expand Down Expand Up @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528-MXFP4:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
Expand Down Expand Up @@ -222,3 +222,34 @@ DeepSeek-R1-0528-MXFP4:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528-MXFP4-v2:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
dp:
max_running_requests: 24
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
cuda_graph_bs: "1 2 3"
no_dp:
max_running_requests: 128
chunked_prefill_size: 16384
cuda_graph_bs_range: "1-128"
decode:
mem_fraction_static: 0.85
prefill_round_robin_balance: true
dp:
max_running_requests: 4096
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-160"
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
no_dp:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"
12 changes: 1 addition & 11 deletions benchmarks/multi_node/amd_utils/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -187,18 +187,8 @@ else
decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
fi

# Use Decode configuration to configure different TP/DP size between P and D
PREFILL_DECODE_DIFFERENT_TP=""
if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then
if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}"
else
PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1"
fi
fi

# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}"
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
fi
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1213,3 +1213,13 @@
- "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization"
- "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973

- config-keys:
- dsr1-fp4-mi355x-sglang-disagg
- dsr1-fp4-mi355x-sglang-disagg-mtp
description:
- "Bump SGL mori image to March 27"
- "Add more low latency sweep configs"
- "Enable v2 mxfp4 DSR1 0528 model"
- "Enable fp4 disp feature on mori"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983