Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
734c1ba
Add Qwen3.5 h200 MTP
hshrivastava-droid Mar 20, 2026
3d80687
extend conc
hshrivastava-droid Mar 20, 2026
96a8f02
adding flag
hshrivastava-droid Mar 20, 2026
db6e75d
Merge branch 'main' into nv/h200-qwen35
hshrivastava-droid Mar 23, 2026
0d13e55
Update perf-changelog.yaml
hshrivastava-droid Mar 23, 2026
0ce06cf
add new line
hshrivastava-droid Mar 23, 2026
5dd2308
Update perf-changelog.yaml
hshrivastava-droid Mar 23, 2026
14a584f
add new line
hshrivastava-droid Mar 23, 2026
55a6d03
Update perf-changelog.yaml
hshrivastava-droid Mar 23, 2026
2e489b9
fix: perf bug
hshrivastava-droid Mar 23, 2026
cbe069d
fix:perf
hshrivastava-droid Mar 23, 2026
250e850
Delete docs/accuracy_evals_slides.html
hshrivastava-droid Mar 23, 2026
af9bda1
change env variable
hshrivastava-droid Mar 24, 2026
571512b
fix: max seq len
hshrivastava-droid Mar 24, 2026
cc42d88
remove extra flag
hshrivastava-droid Mar 25, 2026
4a35e5c
fix:perf
hshrivastava-droid Mar 25, 2026
1e96dc3
Merge branch 'main' into nv/h200-qwen35
hshrivastava-droid Mar 25, 2026
61d531d
add new line
hshrivastava-droid Mar 25, 2026
15443cd
Clean up perf-changelog.yaml by removing old entries
Ankur-singh Mar 25, 2026
75a4160
update:spec num
hshrivastava-droid Mar 25, 2026
1e93eae
update: spec token
hshrivastava-droid Mar 25, 2026
3ff815b
Merge branch 'main' into nv/h200-qwen35
hshrivastava-droid Mar 30, 2026
ea4e969
fix:perf log
hshrivastava-droid Mar 30, 2026
c76c817
fix:perf log delete
hshrivastava-droid Mar 30, 2026
63a71ae
fix:perf log add
hshrivastava-droid Mar 30, 2026
e374eb1
fix:perf log add
hshrivastava-droid Mar 30, 2026
1570da9
fix:perf log add
hshrivastava-droid Mar 30, 2026
de7febd
fix:perf log add
hshrivastava-droid Mar 30, 2026
315e1e8
add perf
hshrivastava-droid Mar 30, 2026
8205d84
Fix PR link formatting in perf-changelog.yaml
hshrivastava-droid Mar 30, 2026
82bf274
fix:eval error8k1k
hshrivastava-droid Mar 31, 2026
59200f2
Remove duplicate configuration entry in nvidia-master.yaml
hshrivastava-droid Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1998,6 +1998,24 @@ qwen3.5-fp8-h200-sglang:
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }

# Qwen3.5-397B-A17B-FP8 on H200 via SGLang with MTP (EAGLE speculative
# decoding). Mirrors the qwen3.5-fp8-h200-sglang entry above, but enables
# spec-decoding: mtp and sweeps concurrency up to 128 (vs 64 without MTP).
qwen3.5-fp8-h200-sglang-mtp:
  image: lmsysorg/sglang:v0.5.9-cu129-amd64
  model: Qwen/Qwen3.5-397B-A17B-FP8
  model-prefix: qwen3.5
  runner: h200
  precision: fp8
  framework: sglang
  multinode: false
  seq-len-configs:
    # Short-context case: 1K in / 1K out.
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
    # Long-prefill case: 8K in / 1K out.
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }

glm5-fp8-h200-sglang:
image: lmsysorg/sglang:glm5-hopper
model: zai-org/GLM-5-FP8
Expand Down
95 changes: 95 additions & 0 deletions benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Benchmark Qwen3.5 FP8 on H200 with SGLang + MTP (EAGLE speculative decoding).
#
# Required env vars (validated by check_env_vars below):
#   MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME EP_SIZE MAX_MODEL_LEN
# Optional env vars:
#   PORT (default 8888), RUN_EVAL, EVAL_CONCURRENT_REQUESTS, SLURM_JOB_ID

# Fail fast on command or pipeline errors; -u is intentionally omitted because
# later stages probe optional vars such as RUN_EVAL.
set -eo pipefail

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  MAX_MODEL_LEN

# Log scheduler placement when running under Slurm.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

nvidia-smi

# Pre-fetch model weights so server startup time excludes the download.
hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# MTP (Multi-Token Prediction) config — EAGLE speculative decoding.
readonly SPECULATIVE_NUM_STEPS=3
readonly SPECULATIVE_DRAFT_TOKENS=4
readonly SPECULATIVE_EAGLE_TOPK=1

echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN"

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
# Launch the SGLang server in the background with EAGLE speculative decoding.
python3 -m sglang.launch_server \
  --model "$MODEL" \
  --host 0.0.0.0 \
  --port "$PORT" \
  --tp "$TP" \
  --expert-parallel-size "$EP_SIZE" \
  --reasoning-parser qwen3 \
  --tool-call-parser qwen3_coder \
  --enable-flashinfer-allreduce-fusion \
  --max-running-requests 128 \
  --chunked-prefill-size 16384 \
  --mem-fraction-static 0.8 \
  --cuda-graph-max-bs "$CONC" \
  --context-length "$MAX_MODEL_LEN" \
  --kv-cache-dtype fp8_e4m3 \
  --quantization fp8 \
  --attention-backend flashinfer \
  --stream-interval 50 \
  --tokenizer-worker-num 6 \
  --mamba-ssm-dtype bfloat16 \
  --disable-radix-cache \
  --trust-remote-code \
  --speculative-algorithm EAGLE \
  --speculative-num-steps "$SPECULATIVE_NUM_STEPS" \
  --speculative-num-draft-tokens "$SPECULATIVE_DRAFT_TOKENS" \
  --speculative-eagle-topk "$SPECULATIVE_EAGLE_TOPK" \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Ensure the background server is torn down on every exit path (success,
# failure, or signal) so the job never leaves a server process running.
trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT

# Wait for server to be ready before sending any traffic.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Client-side deps for the benchmark harness.
pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --use-chat-template \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  export EVAL_CONCURRENT_REQUESTS="${EVAL_CONCURRENT_REQUESTS:-$CONC}"
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1220,3 +1220,10 @@
- "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization"
- "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973

# Changelog entry for the new H200 MTP config added in PR #921.
- config-keys:
    - qwen3.5-fp8-h200-sglang-mtp
  description:
    - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921

Loading