diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 157a9b54c..1ddeb4d31 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3101,7 +3101,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } minimaxm2.5-fp8-b200-vllm: - image: vllm/vllm-openai:v0.17.0-cu130 + image: vllm/vllm-openai:nightly-5b8c30d62b754b575e043ce2fc0dcbf8a64f6306 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b200 @@ -3112,13 +3112,16 @@ minimaxm2.5-fp8-b200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 } + # - { tp: 8, conc-start: 4, conc-end: 8 } + # - isl: 8192 + # osl: 1024 + # search-space: + # - { tp: 2, conc-start: 4, conc-end: 256 } + # - { tp: 4, conc-start: 4, conc-end: 256 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.18.0 @@ -3345,7 +3348,7 @@ gptoss-fp4-h200-vllm: - { tp: 8, conc-start: 4, conc-end: 32 } minimaxm2.5-fp8-h200-vllm: - image: vllm/vllm-openai:v0.18.0 + image: vllm/vllm-openai:nightly-5b8c30d62b754b575e043ce2fc0dcbf8a64f6306 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh index 5ea1b8657..5604b553c 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh @@ -24,10 +24,9 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -export VLLM_USE_FLASHINFER_MOE_FP8=0 -export VLLM_MOE_USE_DEEP_GEMM=0 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl -if [ "$EP_SIZE" -ge 1 
]; then +if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else EP=" " @@ -47,7 +46,8 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ ---no-enable-prefix-caching \ +--kv-cache-dtype fp8 \ +--stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a82882f61..d041d897e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1143,7 +1143,7 @@ description: - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 - + - config-keys: # NVIDIA single-node - dsr1-fp4-b200-sglang @@ -1235,3 +1235,13 @@ - "New model support on ATOM framework" - "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963 + +- config-keys: + - minimaxm2.5-fp8-b200-vllm + description: + - "Update vLLM image from v0.17.0 to nightly-5b8c30d for MiniMax-M2.5 FP8 B200" + - "Add tp2 ep2 (conc 4-256) and tp4 ep4 (conc 16-64) search-space entries; raise tp2/tp4 conc-end to 256 for ISL 1024 / OSL 1024" + - "Comment out ISL 8192 / OSL 1024 seq-len config" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 + +