From a1742411a53f9bdbb0c715ae3d2a83c576bbfd8a Mon Sep 17 00:00:00 2001
From: Mahdi Ghodsi <mahdi.ghodsi@amd.com>
Date: Mon, 22 Jun 2026 16:58:30 -0700
Subject: [PATCH] gfx950: disable AITER MoE for openai/gpt-oss models

On gfx950, AITER MoE kernels produce corrupted output for gpt-oss
models (coherent reasoning followed by word-salad, repetition, or
unicode junk) with ROCm 7.2 / vLLM 0.23.0. Setting
VLLM_ROCM_USE_AITER_MOE=0 fixes it; AITER attention and the
unified-attention backend stay enabled. Verified on both the 20b
and 120b variants.
---
 skills/serving-llms-on-instinct/data/gpu_overrides.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/skills/serving-llms-on-instinct/data/gpu_overrides.json b/skills/serving-llms-on-instinct/data/gpu_overrides.json
index 57709c9..c7ca5a6 100644
--- a/skills/serving-llms-on-instinct/data/gpu_overrides.json
+++ b/skills/serving-llms-on-instinct/data/gpu_overrides.json
@@ -22,6 +22,13 @@
         "unsupported": ["nvfp4"],
         "notes": "MXFP4/MXFP6 are hardware-native on gfx950. FP8 uses OCP (E4M3FN) standard, not FNUZ. NVFP4 is NVIDIA-specific and will not load on ROCm."
       },
+      "model_overrides": [
+        {
+          "match": "openai/gpt-oss",
+          "env_set": {"VLLM_ROCM_USE_AITER_MOE": "0"},
+          "reason": "AITER MoE kernels corrupt gpt-oss output on gfx950 (coherent reasoning but word-salad/repetition/unicode-junk final answer at any temperature) on ROCm 7.2 / vLLM 0.23.0. Only the MoE path needs to be off; AITER attention and the unified-attention backend remain enabled. AITER_FUSED_MOE_A16W4=1 does NOT fix it; AITER_MOE=0 does. Verified on 20b and 120b."
+        }
+      ],
       "workarounds": []
     },
     "gfx942": {