Skip to content
14 changes: 8 additions & 6 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3101,7 +3101,7 @@ gptoss-fp4-b200-vllm:
- { tp: 8, conc-start: 4, conc-end: 4 }

minimaxm2.5-fp8-b200-vllm:
image: vllm/vllm-openai:v0.17.0-cu130
image: vllm/vllm-openai:v0.18.0-cu130
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: b200
Expand All @@ -3112,13 +3112,15 @@ minimaxm2.5-fp8-b200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 2, conc-start: 4, conc-end: 256 }
- { tp: 4, conc-start: 4, conc-end: 256 }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 8 }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 2, conc-start: 4, conc-end: 256 }
- { tp: 4, conc-start: 4, conc-end: 256 }

gptoss-fp4-h100-vllm:
image: vllm/vllm-openai:v0.18.0
Expand Down Expand Up @@ -3345,7 +3347,7 @@ gptoss-fp4-h200-vllm:
- { tp: 8, conc-start: 4, conc-end: 32 }

minimaxm2.5-fp8-h200-vllm:
image: vllm/vllm-openai:v0.18.0
image: vllm/vllm-openai:nightly-5b8c30d62b754b575e043ce2fc0dcbf8a64f6306
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: h200
Expand Down
7 changes: 4 additions & 3 deletions benchmarks/single_node/minimaxm2.5_fp8_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ hf download "$MODEL"
SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export VLLM_USE_FLASHINFER_MOE_FP8=0
export VLLM_MOE_USE_DEEP_GEMM=0
export VLLM_USE_DEEP_GEMM=0
export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl

if [ "$EP_SIZE" -ge 1 ]; then
EP=" --enable-expert-parallel"
Expand All @@ -47,7 +47,8 @@ $EP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=32 \
--no-enable-prefix-caching \
--kv-cache-dtype fp8 \
--stream-interval 20 --no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand Down
11 changes: 10 additions & 1 deletion perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@
description:
- "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966

- config-keys:
# NVIDIA single-node
- dsr1-fp4-b200-sglang
Expand Down Expand Up @@ -1213,3 +1213,12 @@
- "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization"
- "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973

- config-keys:
- minimaxm2.5-fp8-b200-vllm
- minimaxm2.5-fp8-h200-vllm
description:
- "Update vLLM image from v0.17.0 to v0.18.0 for MiniMax-M2.5 FP8 B200; move H200 to nightly image (5b8c30d)"
- "Expand tp2/tp4 concurrency sweeps to conc-end 256; add tp4 ep4 (conc 64) and tp8 (conc 4-8) entries to the ISL 1024 / OSL 1024 config"
- "Remove ISL 1024 / OSL 8192 seq-len config"
- "B200 launch script: replace VLLM_USE_FLASHINFER_MOE_FP8/VLLM_MOE_USE_DEEP_GEMM with VLLM_USE_DEEP_GEMM=0, set VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl, enable fp8 KV cache and --stream-interval 20"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947

Loading