
Commit e1e8cb2

github-actions[bot] committed
Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config
- Add qwen3.5-fp4-b200-sglang config to nvidia-master.yaml (1k1k, 8k1k)
- Add launch script following updated conventions (dynamic scheduler interval, EVAL_CONTEXT_ARGS, tokenizer-worker-num, conditional allreduce fusion)
- Add perf-changelog entry

Co-authored-by: functionstackx <functionstackx@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2542b74

3 files changed: 128 additions, 0 deletions


.github/configs/nvidia-master.yaml

18 additions, 0 deletions

@@ -1789,6 +1789,24 @@ qwen3.5-fp8-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 }
         - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 }
 
+qwen3.5-fp4-b200-sglang:
+  image: lmsysorg/sglang:v0.5.9-cu129-amd64
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+
 glm5-fp8-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
   model: zai-org/GLM-5-FP8
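Each seq-len-configs row above pairs an input/output length (isl/osl) with a search-space swept over a concurrency range. As a rough illustration only, the 1k1k row could expand into runs of the launch script like the loop below, assuming the harness steps concurrency in powers of two from conc-start to conc-end; the real expansion logic lives in the repo's CI, and the ./launch.sh path and RANDOM_RANGE_RATIO value are placeholders, not part of this commit.

# Hypothetical expansion of the 1k1k row (tp: 4, ep: 1, conc 4-128).
# The power-of-two sweep and the ./launch.sh path are assumptions for illustration.
for CONC in 4 8 16 32 64 128; do
    MODEL=nvidia/Qwen3.5-397B-A17B-NVFP4 \
    TP=4 EP_SIZE=1 ISL=1024 OSL=1024 CONC=$CONC \
    RANDOM_RANGE_RATIO=1.0 \
    RESULT_FILENAME="qwen3.5-fp4-tp4ep1-1k1k-conc${CONC}.json" \
    bash ./launch.sh
done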
Launch script (new file): 101 additions, 0 deletions

#!/usr/bin/env bash

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
    MODEL \
    TP \
    CONC \
    ISL \
    OSL \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME \
    EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

nvidia-smi

hf download "$MODEL"

export NCCL_NVLS_ENABLE=1
export SGL_ENABLE_JIT_DEEPGEMM=false
export SGLANG_ENABLE_FLASHINFER_GEMM=true
export PYTHONUNBUFFERED=1

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Default: recv every ~10 requests; if CONC >= 16, relax to ~30 requests between scheduler recv polls.
if [[ $CONC -ge 16 ]]; then
    SCHEDULER_RECV_INTERVAL=30
else
    SCHEDULER_RECV_INTERVAL=10
fi

MEM_FRAC_STATIC=0.85
CHUNKED_PREFILL_SIZE=32768
MAX_PREFILL_TOKENS=32768
CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
MAX_RUNNING_REQUESTS=128
CONTEXT_LENGTH=$((ISL + OSL + 20))
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
fi

if [[ $TP -eq 8 ]]; then
    EXTRA_ARGS="--enable-flashinfer-allreduce-fusion"
else
    EXTRA_ARGS=""
fi

echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
    --trust-remote-code \
    --tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
    --quantization modelopt_fp4 --fp4-gemm-backend flashinfer_cutlass \
    --kv-cache-dtype fp8_e4m3 \
    --mamba-ssm-dtype bfloat16 \
    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
    --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
    --context-length $CONTEXT_LENGTH --disable-radix-cache \
    --attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
    $EXTRA_ARGS --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
    --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
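For a single 8k1k point, a minimal invocation sketch follows, with every value illustrative: RANDOM_RANGE_RATIO and RESULT_FILENAME are placeholders, and ./launch.sh stands in for the script's path, which this view does not show. At ISL=8192/OSL=1024 the script derives --context-length as 8192 + 1024 + 20 = 9236, and CONC=32 (>= 16) selects the relaxed scheduler recv interval of 30.

# Illustrative single run; values are assumptions, not repo defaults.
MODEL=nvidia/Qwen3.5-397B-A17B-NVFP4 \
TP=4 EP_SIZE=1 \
ISL=8192 OSL=1024 CONC=32 \
RANDOM_RANGE_RATIO=1.0 \
RESULT_FILENAME=qwen3.5-fp4-tp4ep1-8k1k-conc32.json \
RUN_EVAL=false \
bash ./launch.sh   # placeholder path for the new launch script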

perf-changelog.yaml

9 additions, 0 deletions

@@ -1198,3 +1198,12 @@
   description:
     - "Add --disable-radix-cache to SGLang server launch command for qwen3.5 MI300X and MI325X benchmark scripts"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970
+
+- config-keys:
+    - qwen3.5-fp4-b200-sglang
+  description:
+    - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script"
+    - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64"
+    - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4"
+    - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820
