From 0383696f7c6173378dee1ab115b4151f51f47ccf Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 16 Mar 2026 08:36:19 +0000
Subject: [PATCH 01/14] [AMD] add dsr1 mxfp4 v2 sweep points

---
 .github/configs/amd-master.yaml             | 56 +++++++++++++++++++++
 benchmarks/multi_node/amd_utils/models.yaml | 31 ++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5551860f2..61c842f58 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1446,6 +1446,62 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D pure TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
+
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"
+
+    
+
+
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 2bbdd91d6..4c6611571 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -222,3 +222,34 @@ DeepSeek-R1-0528-MXFP4:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528-MXFP4-v2:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 16384
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"

From 18e05b1cbb097497a63800291b6015e8cd37e250 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 17 Mar 2026 06:36:04 +0000
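Subject: [PATCH 02/14] fix: drop --ep-dispatch-algorithm fake from base_flags

Remove `--ep-dispatch-algorithm fake` from the base_flags of the seven
DeepSeek model entries in models.yaml, and trim the stray blank lines
left after the dsr1-fp4-mi355x-sglang-disagg-mtp-v2 search space in
amd-master.yaml.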

---
 .github/configs/amd-master.yaml             |  3 ---
 benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++-------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 61c842f58..f20ed38fd 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1499,9 +1499,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=3"
 
-    
-
-
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 4c6611571..07668659d 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -38,7 +38,7 @@
 #         cuda_graph_bs_range: str
 
 DeepSeek-V3:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -69,7 +69,7 @@ DeepSeek-V3:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-V3-0324:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -100,7 +100,7 @@ DeepSeek-V3-0324:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -131,7 +131,7 @@ DeepSeek-R1:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -162,7 +162,7 @@ DeepSeek-R1-0528:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-Preview:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-v2:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:

From 32b5d3d00cce991eb9e7a3b298c69ee7b9cf28cd Mon Sep 17 00:00:00 2001
From: Zhai Feiyue <feiyue.zhai@amd.com>
Date: Tue, 24 Mar 2026 14:59:35 +0000
Subject: [PATCH 03/14] Fix tokenizer mismatch between benchmark client and
 sglang server on transformers v5

Transformers v5 incorrectly rebuilds pre_tokenizer/decoder components for
models like DeepSeek-R1 that use LlamaTokenizerFast with a non-Llama
tokenizer architecture. The sglang server fixes this at startup, but the
benchmark client loads the tokenizer without these fixes, causing a ~5x
token count inflation (e.g. 7000 tokens -> 35000 tokens) and false
performance regressions in TTFT and throughput benchmarks.

Apply the same tokenizer fixes (pre_tokenizer/decoder restoration and
add_bos_token recovery) that sglang server applies, so client and server
tokenize identically. No-op on transformers v4.

Made-with: Cursor
---
 utils/bench_serving/backend_request_func.py | 72 ++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py
index 32331a398..4990ef5fa 100644
--- a/utils/bench_serving/backend_request_func.py
+++ b/utils/bench_serving/backend_request_func.py
@@ -439,6 +439,75 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     return pretrained_model_name_or_path
 
 
+def _fix_tokenizer_for_sglang(tokenizer, model_path):
+    """Fix transformers v5 tokenizer to match sglang server-side behavior.
+
+    Root cause: transformers v5 (>= 5.0) changed how tokenizers are loaded.
+    Specifically, LlamaTokenizerFast.__init__ in v5 rebuilds the pre_tokenizer
+    and decoder from scratch using class-specific components, discarding the
+    originals from tokenizer.json. For models like DeepSeek-R1 that declare
+    LlamaTokenizerFast but actually use a ByteLevel/Sequence tokenizer
+    architecture, v5 incorrectly replaces the original Sequence pre_tokenizer
+    with Metaspace, and the original ByteLevel decoder with Sequence.
+
+    The sglang server applies fixes for this in hf_transformers_utils.py
+    (_fix_v5_tokenizer_components and _fix_v5_add_bos_eos_token), but the
+    benchmark client loads the tokenizer directly via AutoTokenizer without
+    these fixes. This mismatch causes the client to encode text differently
+    from the server -- e.g. a 7000-token prompt on the client becomes ~35000
+    tokens on the server, leading to ~5x TTFT inflation and false performance
+    regressions in benchmarks.
+
+    This function replicates the same fixes so the benchmark client tokenizes
+    identically to the sglang server. It is a no-op on transformers v4.
+    """
+    import json
+    from pathlib import Path
+
+    backend = getattr(tokenizer, "_tokenizer", None)
+    if backend is not None:
+        try:
+            from tokenizers import Tokenizer as RawTokenizer
+            tok_file = Path(model_path) / "tokenizer.json"
+            if tok_file.is_file():
+                raw = RawTokenizer.from_file(str(tok_file))
+                raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None
+                loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None
+                if raw_pre and loaded_pre and raw_pre != loaded_pre:
+                    backend.pre_tokenizer = raw.pre_tokenizer
+                    backend.decoder = raw.decoder
+        except Exception:
+            pass
+
+    try:
+        config_file = Path(model_path) / "tokenizer_config.json"
+        if config_file.is_file():
+            with open(config_file) as f:
+                config = json.load(f)
+            tok_class = config.get("tokenizer_class", "")
+            bos_eos_classes = {
+                "LlamaTokenizer", "LlamaTokenizerFast",
+                "CodeLlamaTokenizer", "CodeLlamaTokenizerFast",
+                "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast",
+            }
+            if tok_class in bos_eos_classes:
+                defaults = {"add_bos_token": True, "add_eos_token": False}
+                changed = False
+                for attr in ("add_bos_token", "add_eos_token"):
+                    val = config.get(attr)
+                    if val is None:
+                        val = defaults.get(attr, False)
+                    if getattr(tokenizer, attr, None) != val:
+                        setattr(tokenizer, f"_{attr}", val)
+                        changed = True
+                if changed and hasattr(tokenizer, "update_post_processor"):
+                    tokenizer.update_post_processor()
+    except Exception:
+        pass
+
+    return tokenizer
+
+
 def get_tokenizer(
     pretrained_model_name_or_path: str,
     tokenizer_mode: str = "auto",
@@ -464,11 +533,12 @@ def get_tokenizer(
         return MistralTokenizer.from_pretrained(
             str(pretrained_model_name_or_path))
     else:
-        return AutoTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path,
             trust_remote_code=trust_remote_code,
             **kwargs,
         )
+        return _fix_tokenizer_for_sglang(tokenizer, pretrained_model_name_or_path)
 
 
 ASYNC_REQUEST_FUNCS = {

From 0bd347fe71d6689269c81569b797985618ffad7f Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 25 Mar 2026 15:28:03 +0000
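Subject: [PATCH 04/14] change mtp model to fp8

- models.yaml: pass `--speculative-draft-model-path SGLang/DeepSeek-R1-NextN`
  in the DeepSeek-R1-0528-MXFP4-v2 mtp_flags.
- amd-master.yaml: add the non-MTP dsr1-fp4-mi355x-sglang-disagg-v2 config
  (1k/1k and 8k/1k sweeps), add a 1k/1k sweep and further 8k/1k points to
  dsr1-fp4-mi355x-sglang-disagg-mtp-v2, and move that config to the
  mori-0323 image.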

---
 .github/configs/amd-master.yaml             | 369 +++++++++++++++++++-
 benchmarks/multi_node/amd_utils/models.yaml |   2 +-
 2 files changed, 369 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f20ed38fd..525595b7b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1446,8 +1446,218 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+
+dsr1-fp4-mi355x-sglang-disagg-v2:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1P1D TP8
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP4
+    - spec-decoding: "none" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+    
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1P1D pure TP8
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP4
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
 dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1456,6 +1666,106 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
   multinode: true
   disagg: true
   seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
+    # 1P2D TP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"
+
+    # 1P2D TP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+
   - isl: 8192
     osl: 1024
     search-space:
@@ -1499,6 +1809,63 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=3"
 
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 07668659d..6bca6b52a 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4:
 
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
-  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8

From 754e53c00fd834dcc6093c8b164966b8019b0605 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 25 Mar 2026 15:32:42 +0000
Subject: [PATCH 05/14] change fp8 image

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 525595b7b..2cea84d01 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -596,7 +596,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -751,7 +751,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp8-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg

From f29f2d01ea990161dfc6bc79401a30a30bda9502 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 27 Mar 2026 11:20:18 +0000
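Subject: [PATCH 06/14] bump image to 0327

- amd-master.yaml: move both dsr1-fp4 v2 disagg configs to the mori-0327
  image.
- env.sh: move `set +x` to the end of the script and export
  SGLANG_ENABLE_SPEC_V2=1 and SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1.
- models.yaml: add `--speculative-attention-mode decode` to the
  DeepSeek-R1-0528-MXFP4-v2 mtp_flags.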

---
 .github/configs/amd-master.yaml             | 4 ++--
 benchmarks/multi_node/amd_utils/env.sh      | 4 +++-
 benchmarks/multi_node/amd_utils/models.yaml | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 2cea84d01..a0112d479 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1448,7 +1448,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-v2:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1657,7 +1657,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5565c5b3b..f4b631673 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -34,7 +34,6 @@ export IBDEVICES
 export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 
-set +x
 
 export NCCL_IB_HCA=$IBDEVICES
 
@@ -123,4 +122,7 @@ fi
 # FIXME: WA for latest upstream 0305 image
 export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
+export SGLANG_ENABLE_SPEC_V2=1
+export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
+set +x
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 6bca6b52a..eed59bdab 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4:
 
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
-  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8

From a44c7eb8759e8527d8af192e5aa2ffc7f7e65fb0 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 27 Mar 2026 14:09:11 +0000
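Subject: [PATCH 07/14] remove specv2 env exports

Drop the SGLANG_ENABLE_SPEC_V2 and SGLANG_ENABLE_OVERLAP_PLAN_STREAM
exports that the previous patch added to env.sh.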

---
 benchmarks/multi_node/amd_utils/env.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index f4b631673..02cb77a91 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -122,7 +122,5 @@ fi
 # FIXME: WA for latest upstream 0305 image
 export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
-export SGLANG_ENABLE_SPEC_V2=1
-export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
 set +x

From 25141364c930fc59e455aeca97eeeebd81e750fa Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 01:57:19 +0000
Subject: [PATCH 08/14] consolidate dsr1 fp4 configs

---
 .github/configs/amd-master.yaml | 422 +-------------------------------
 1 file changed, 1 insertion(+), 421 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a0112d479..6a96a4af2 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1027,427 +1027,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
   #       - "DECODE_NODES=2"
   #       - "DECODE_MTP_SIZE=0"
 
-
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 1P1D TP8
-    - spec-decoding: "none"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP4
-    - spec-decoding: "none" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-    
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 1P1D pure TP8
-    - spec-decoding: "none"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP4
-    - spec-decoding: "none"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 1P1D TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=3"
-
-    # 1P2D TP8
-    - spec-decoding: "mtp" 
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=3"
-
-    # 1P2D TP8
-    - spec-decoding: "mtp" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1P2D TP4
-    - spec-decoding: "mtp" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 1P1D pure TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=3"
-
-
-    # 1P2D TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=3"
-
-    # 1P2D TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1P2D TP4
-    - spec-decoding: "mtp"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-
-dsr1-fp4-mi355x-sglang-disagg-v2:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
@@ -1656,7 +1236,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2:
         - "DECODE_MTP_SIZE=0"
 
 
-dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
+dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1

From 682a4ab4ec3d42c73cd5c54b9aede2ba1fc33a54 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 02:03:58 +0000
Subject: [PATCH 09/14] bump fp8 image to 0327

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 27518d40b..a139ca560 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp8-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg

From 64bf10078c4e4f9f19486dc0f6727dc6ef1902d2 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 04:36:27 +0000
Subject: [PATCH 10/14] fix crash: drop PREFILL_DECODE_DIFFERENT_TP flags from PREFILL_MODE_FLAGS

---
 benchmarks/multi_node/amd_utils/server.sh | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 7f174b760..7340ef51c 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -187,18 +187,8 @@ else
     decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
 fi
 
-# Use Decode configuration to configure different TP/DP size between P and D
-PREFILL_DECODE_DIFFERENT_TP=""
-if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then
-    if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
-        PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}"
-    else
-        PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1"
-    fi
-fi
-
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}"
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi

From c44e1755ea6cc81f5e6f59b071ed20ddb7abefe4 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 15:26:57 +0000
Subject: [PATCH 11/14] fix env: export MORI_IO_SL/MORI_IO_TC alongside MORI_RDMA_SL/MORI_RDMA_TC

---
 benchmarks/multi_node/amd_utils/env.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 02cb77a91..88ea2ac84 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -88,17 +88,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
     if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
         TC=$(( 4 * ND_DSCP ))
         export MORI_RDMA_SL=$ND_PRIO
+        export MORI_IO_SL=$ND_PRIO
         export MORI_RDMA_TC=$TC
-        echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
+        export MORI_IO_TC=$TC
+        echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
     else
         echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
         # Fall back to hostname-based detection
         NODENAME=$(hostname -s)
         if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
             export MORI_RDMA_TC=96
+            export MORI_IO_TC=96
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
         elif [[ $NODENAME == mia1* ]]; then
             export MORI_RDMA_TC=104
+            export MORI_IO_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
         else
             echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
@@ -109,9 +113,11 @@ else
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         export MORI_RDMA_TC=96
+        export MORI_IO_TC=96
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
+        export MORI_IO_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     else
         echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."

From 0a41f8980559717d1e2544ac013048dbb85b8c94 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 31 Mar 2026 06:50:28 +0000
Subject: [PATCH 12/14] cleanup: remove commented-out 1k8k sweep configs

---
 .github/configs/amd-master.yaml | 123 --------------------------------
 1 file changed, 123 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a139ca560..14eec1583 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -794,129 +794,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=2"
 
-  # FIXME(billishyahao): disable 1k8k for now
-  # - isl: 1024
-  #   osl: 8192
-  #   search-space:
-  #   # MTP configurations
-  #   # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 2048 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 16
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=1"
-
-
-  #   # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8)
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 256, 512, 1024 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 2
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=1"
-
-
-  #   # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 32, 64, 128 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-
-  #     decode:
-  #       num-worker: 2
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=1"
-
-  #   # non-MTP configurations
-  #   # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
-  #   - spec-decoding: "none"
-  #     conc-list: [ 2048 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 16
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=0"
-
-  #   # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-  #   - spec-decoding: "none"
-  #     conc-list: [ 256, 512, 1024 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 2
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=0"
-
-  #   # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-  #   - spec-decoding: "none"
-  #     conc-list: [ 32, 64, 128 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 2
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=0"
-
 dsr1-fp4-mi355x-sglang-disagg:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2

From 7282748ed6da9d902f737b8843f0599f01546d26 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 31 Mar 2026 06:54:30 +0000
Subject: [PATCH 13/14] add perf change log

---
 perf-changelog.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3dbc5eccc..1cd22211a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1213,3 +1213,13 @@
     - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization"
     - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973
+
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Bump SGL mori image to March 27"
+    - "Add more low latency sweep configs"
+    - "Enable v2 mxfp4 DSR1 0528 model"
+    - "Enable fp4 dispatch feature on mori"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983

From e6d4b3255d079f7de7ad13367120d521cb5d02a7 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 31 Mar 2026 08:14:42 +0000
Subject: [PATCH 14/14] add deprecation comment for MORI_IO_QP_* env vars

---
 benchmarks/multi_node/amd_utils/env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 88ea2ac84..0aa2d0c20 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -63,6 +63,8 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 
 export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
+
+# TODO(billishyahao): The following MORI_IO_* env vars will be deprecated soon.
 export MORI_IO_QP_MAX_SEND_WR=16384
 export MORI_IO_QP_MAX_CQE=32768
 export MORI_IO_QP_MAX_SGE=4