Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

qwen3.5-bf16-mi355x-sglang:
image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327
model: Qwen/Qwen3.5-397B-A17B
model-prefix: qwen3.5
runner: mi355x
Expand All @@ -125,11 +125,13 @@ qwen3.5-bf16-mi355x-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 256 }
- { tp: 8, ep: 8, conc-start: 64, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 256 }
- { tp: 8, ep: 8, conc-start: 64, conc-end: 256 }

qwen3.5-bf16-mi300x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
Expand Down Expand Up @@ -186,7 +188,7 @@ qwen3.5-fp8-mi325x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

qwen3.5-fp8-mi355x-sglang:
image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi355x
Expand All @@ -198,10 +200,12 @@ qwen3.5-fp8-mi355x-sglang:
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 256 }

qwen3.5-fp8-mi300x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
Expand Down
16 changes: 14 additions & 2 deletions benchmarks/single_node/qwen3.5_bf16_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ check_env_vars \
ISL \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME
RESULT_FILENAME \
EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -19,11 +20,14 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
CONTEXT_LENGTH=$((ISL + OSL + 20))
MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
Expand All @@ -34,8 +38,16 @@ python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--ep-size $EP_SIZE \
--data-parallel-size 1 \
--trust-remote-code \
--mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
--tokenizer-worker-num 6 \
--enable-aiter-allreduce-fusion \
--cuda-graph-max-bs $CONC \
--disable-radix-cache \
--max-prefill-tokens $MAX_PREFILL_TOKENS \
--scheduler-recv-interval 30 \
--mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
16 changes: 14 additions & 2 deletions benchmarks/single_node/qwen3.5_fp8_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ check_env_vars \
ISL \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME
RESULT_FILENAME \
EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -19,11 +20,14 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
CONTEXT_LENGTH=$((ISL + OSL + 20))
MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
Expand All @@ -34,8 +38,16 @@ python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--ep-size $EP_SIZE \
--data-parallel-size 1 \
--trust-remote-code \
--mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
--tokenizer-worker-num 6 \
--enable-aiter-allreduce-fusion \
--cuda-graph-max-bs $CONC \
--disable-radix-cache \
--max-prefill-tokens $MAX_PREFILL_TOKENS \
--scheduler-recv-interval 30 \
--mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1220,3 +1220,11 @@
- "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization"
- "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973
- config-keys:
- qwen3.5-bf16-mi355x-sglang
- qwen3.5-fp8-mi355x-sglang
description:
- "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance"
- "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980

Loading