Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 301 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1231,3 +1231,304 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsr1-fp8-mi325x-sglang-disagg:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're missing the perf changelog YAML entry too.

image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
- spec-decoding: "none"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
- spec-decoding: "none"
conc-list: [ 768, 512, 256 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure that TP4 is on the Pareto frontier here? Do you have a graph?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image just the initial sweep

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You only have the TP4 curve, and you have "hide non-optimal" enabled? Can you run the rest of the 24 datapoints?

- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're missing the MTP submissions; you only have non-MTP results so far.

- isl: 8192
osl: 1024
search-space:
# "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
- spec-decoding: "none"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


# Benchmark entry: deepseek-ai/DeepSeek-R1-0528 at fp8 precision on the
# mi325x-disagg runner, using the sglang-disagg framework (multinode, disagg).
# Every search-space item in this entry sets spec-decoding: "mtp".
# Within this entry, DECODE_MTP_SIZE is 1 on the decode configs with
# dp-attn: true (ep: 8) and 2 on the decode configs with dp-attn: false (ep: 1).
dsr1-fp8-mi325x-sglang-disagg-mtp:
image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
# Sequence-length config 1 of 2: isl 1024 / osl 1024 (four search-space items).
- isl: 1024
osl: 1024
search-space:
# MTP configurations
# "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
- spec-decoding: "mtp"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
# NOTE(review): the heading says 1 decode worker and num-worker is 1, yet
# DECODE_NODES=2 here, while the 8192/1024 "Top of curve" item below uses
# DECODE_NODES=1 for the same decode shape — confirm this is intended.
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"

# "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
- spec-decoding: "mtp"
conc-list: [ 768, 512, 256 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=2"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"

# Sequence-length config 2 of 2: isl 8192 / osl 1024 (three search-space items;
# there is no "Middle of curve" item for this sequence length).
- isl: 8192
osl: 1024
search-space:
# MTP configurations
# "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
- spec-decoding: "mtp"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
# NOTE(review): this conc-list ends at 2, whereas the other "Bottom of curve"
# lists in this file stop at 4 — confirm the extra point is intended.
conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=2"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ mi325x:
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
mi325x-disagg:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
mi355x:
- 'mi355x-amds_0'
- 'mi355x-amds_1'
Expand Down
13 changes: 13 additions & 0 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then
export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
elif [[ $NODENAME == mia1* ]]; then
export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
else
echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
exit 1
Expand Down Expand Up @@ -101,6 +104,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
fi
Expand All @@ -114,6 +122,11 @@ else
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
echo " This is normal for clusters without QoS or outside Docker containers."
Expand Down
Loading
Loading