From 0383696f7c6173378dee1ab115b4151f51f47ccf Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 16 Mar 2026 08:36:19 +0000 Subject: [PATCH 01/14] [AMD] add dsr1 mxfp4 v2 sweep points --- .github/configs/amd-master.yaml | 56 +++++++++++++++++++++ benchmarks/multi_node/amd_utils/models.yaml | 31 ++++++++++++ 2 files changed, 87 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5551860f2..61c842f58 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1446,6 +1446,62 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +dsr1-fp4-mi355x-sglang-disagg-mtp-v2: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + + + # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 2bbdd91d6..4c6611571 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -222,3 +222,34 @@ DeepSeek-R1-0528-MXFP4: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +DeepSeek-R1-0528-MXFP4-v2: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 16384 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" From 18e05b1cbb097497a63800291b6015e8cd37e250 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 17 Mar 2026 06:36:04 +0000 Subject: [PATCH 02/14] fix --- .github/configs/amd-master.yaml | 3 --- benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 61c842f58..f20ed38fd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1499,9 +1499,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=3" - - - # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 4c6611571..07668659d 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: From 32b5d3d00cce991eb9e7a3b298c69ee7b9cf28cd Mon Sep 17 00:00:00 2001 From: Zhai Feiyue Date: Tue, 24 Mar 2026 14:59:35 +0000 Subject: [PATCH 03/14] Fix tokenizer mismatch between benchmark client and sglang server on transformers v5 Transformers v5 incorrectly rebuilds pre_tokenizer/decoder components for models like DeepSeek-R1 that use LlamaTokenizerFast with a non-Llama tokenizer architecture. The sglang server fixes this at startup, but the benchmark client loads the tokenizer without these fixes, causing a ~5x token count inflation (e.g. 7000 tokens -> 35000 tokens) and false performance regressions in TTFT and throughput benchmarks. Apply the same tokenizer fixes (pre_tokenizer/decoder restoration and add_bos_token recovery) that sglang server applies, so client and server tokenize identically. No-op on transformers v4. Made-with: Cursor --- utils/bench_serving/backend_request_func.py | 72 ++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 32331a398..4990ef5fa 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -439,6 +439,75 @@ def get_model(pretrained_model_name_or_path: str) -> str: return pretrained_model_name_or_path +def _fix_tokenizer_for_sglang(tokenizer, model_path): + """Fix transformers v5 tokenizer to match sglang server-side behavior. + + Root cause: transformers v5 (>= 5.0) changed how tokenizers are loaded. + Specifically, LlamaTokenizerFast.__init__ in v5 rebuilds the pre_tokenizer + and decoder from scratch using class-specific components, discarding the + originals from tokenizer.json. For models like DeepSeek-R1 that declare + LlamaTokenizerFast but actually use a ByteLevel/Sequence tokenizer + architecture, v5 incorrectly replaces the original Sequence pre_tokenizer + with Metaspace, and the original ByteLevel decoder with Sequence. + + The sglang server applies fixes for this in hf_transformers_utils.py + (_fix_v5_tokenizer_components and _fix_v5_add_bos_eos_token), but the + benchmark client loads the tokenizer directly via AutoTokenizer without + these fixes. This mismatch causes the client to encode text differently + from the server -- e.g. a 7000-token prompt on the client becomes ~35000 + tokens on the server, leading to ~5x TTFT inflation and false performance + regressions in benchmarks. + + This function replicates the same fixes so the benchmark client tokenizes + identically to the sglang server. It is a no-op on transformers v4. + """ + import json + from pathlib import Path + + backend = getattr(tokenizer, "_tokenizer", None) + if backend is not None: + try: + from tokenizers import Tokenizer as RawTokenizer + tok_file = Path(model_path) / "tokenizer.json" + if tok_file.is_file(): + raw = RawTokenizer.from_file(str(tok_file)) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + except Exception: + pass + + try: + config_file = Path(model_path) / "tokenizer_config.json" + if config_file.is_file(): + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + tokenizer.update_post_processor() + except Exception: + pass + + return tokenizer + + def get_tokenizer( pretrained_model_name_or_path: str, tokenizer_mode: str = "auto", @@ -464,11 +533,12 @@ def get_tokenizer( return MistralTokenizer.from_pretrained( str(pretrained_model_name_or_path)) else: - return AutoTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs, ) + return _fix_tokenizer_for_sglang(tokenizer, pretrained_model_name_or_path) ASYNC_REQUEST_FUNCS = { From 0bd347fe71d6689269c81569b797985618ffad7f Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 25 Mar 2026 15:28:03 +0000 Subject: [PATCH 04/14] change mtp model to fp8 --- .github/configs/amd-master.yaml | 369 +++++++++++++++++++- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 369 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f20ed38fd..525595b7b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1446,8 +1446,218 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsr1-fp4-mi355x-sglang-disagg-v2: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1456,6 +1666,106 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2: multinode: true disagg: true seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + - isl: 8192 osl: 1024 search-space: @@ -1499,6 +1809,63 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=3" + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 07668659d..6bca6b52a 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From 754e53c00fd834dcc6093c8b164966b8019b0605 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 25 Mar 2026 15:32:42 +0000 Subject: [PATCH 05/14] change fp8 image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 525595b7b..2cea84d01 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -596,7 +596,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -751,7 +751,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg From f29f2d01ea990161dfc6bc79401a30a30bda9502 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 27 Mar 2026 11:20:18 +0000 Subject: [PATCH 06/14] bump image to 0327 --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/multi_node/amd_utils/env.sh | 4 +++- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2cea84d01..a0112d479 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1448,7 +1448,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg-v2: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1657,7 +1657,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2: dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..f4b631673 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -34,7 +34,6 @@ export IBDEVICES export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -set +x export NCCL_IB_HCA=$IBDEVICES @@ -123,4 +122,7 @@ fi # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} +export SGLANG_ENABLE_SPEC_V2=1 +export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 +set +x diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 6bca6b52a..eed59bdab 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From a44c7eb8759e8527d8af192e5aa2ffc7f7e65fb0 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 27 Mar 2026 14:09:11 +0000 Subject: [PATCH 07/14] remove specv2 --- benchmarks/multi_node/amd_utils/env.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index f4b631673..02cb77a91 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -122,7 +122,5 @@ fi # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} -export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 set +x From 25141364c930fc59e455aeca97eeeebd81e750fa Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 01:57:19 +0000 Subject: [PATCH 08/14] consolidate dsr1 fp4 configs --- .github/configs/amd-master.yaml | 422 +------------------------------- 1 file changed, 1 insertion(+), 421 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a0112d479..6a96a4af2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1027,427 +1027,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: # - "DECODE_NODES=2" # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D pure TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - -dsr1-fp4-mi355x-sglang-disagg-v2: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 @@ -1656,7 +1236,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2: - "DECODE_MTP_SIZE=0" -dsr1-fp4-mi355x-sglang-disagg-mtp-v2: +dsr1-fp4-mi355x-sglang-disagg-mtp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 From 682a4ab4ec3d42c73cd5c54b9aede2ba1fc33a54 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 02:03:58 +0000 Subject: [PATCH 09/14] bump fp8 image to 0327 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 27518d40b..a139ca560 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg From 64bf10078c4e4f9f19486dc0f6727dc6ef1902d2 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 04:36:27 +0000 Subject: [PATCH 10/14] fix crash --- benchmarks/multi_node/amd_utils/server.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7f174b760..7340ef51c 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -187,18 +187,8 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi From c44e1755ea6cc81f5e6f59b071ed20ddb7abefe4 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 15:26:57 +0000 Subject: [PATCH 11/14] fix env --- benchmarks/multi_node/amd_utils/env.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 02cb77a91..88ea2ac84 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -88,17 +88,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p { if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then TC=$(( 4 * ND_DSCP )) export MORI_RDMA_SL=$ND_PRIO + export MORI_IO_SL=$ND_PRIO export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + export MORI_IO_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" else echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." # Fall back to hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." @@ -109,9 +113,11 @@ else NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." From 0a41f8980559717d1e2544ac013048dbb85b8c94 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 31 Mar 2026 06:50:28 +0000 Subject: [PATCH 12/14] cleanup --- .github/configs/amd-master.yaml | 123 -------------------------------- 1 file changed, 123 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a139ca560..14eec1583 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -794,129 +794,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # FIXME(billishyahao): disable 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - # - spec-decoding: "none" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "none" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 From 7282748ed6da9d902f737b8843f0599f01546d26 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 31 Mar 2026 06:54:30 +0000 Subject: [PATCH 13/14] add perf change log --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3dbc5eccc..1cd22211a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1213,3 +1213,13 @@ - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization" - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Bump SGL mori image to March 27" + - "Add more low latency sweep configs" + - "Enable v2 mxfp4 DSR1 0528 model" + - "Enable fp4 disp feature on mori" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983 From e6d4b3255d079f7de7ad13367120d521cb5d02a7 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 31 Mar 2026 08:14:42 +0000 Subject: [PATCH 14/14] add deprecate comments --- benchmarks/multi_node/amd_utils/env.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 88ea2ac84..0aa2d0c20 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -63,6 +63,8 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + +#TODO(billishyahao): The following IO env will be deprecated soon. export MORI_IO_QP_MAX_SEND_WR=16384 export MORI_IO_QP_MAX_CQE=32768 export MORI_IO_QP_MAX_SGE=4