From a1742411a53f9bdbb0c715ae3d2a83c576bbfd8a Mon Sep 17 00:00:00 2001 From: Mahdi Ghodsi Date: Mon, 22 Jun 2026 16:58:30 -0700 Subject: [PATCH] gfx950: disable AITER MoE for openai/gpt-oss models AITER MoE kernels produce corrupted output on gpt-oss (coherent reasoning followed by word-salad/repetition/unicode-junk) on ROCm 7.2 / vLLM 0.23.0. Setting VLLM_ROCM_USE_AITER_MOE=0 fixes it. AITER attention and unified-attention backend stay on. Verified on both 20b and 120b variants. --- skills/serving-llms-on-instinct/data/gpu_overrides.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/skills/serving-llms-on-instinct/data/gpu_overrides.json b/skills/serving-llms-on-instinct/data/gpu_overrides.json index 57709c9..c7ca5a6 100644 --- a/skills/serving-llms-on-instinct/data/gpu_overrides.json +++ b/skills/serving-llms-on-instinct/data/gpu_overrides.json @@ -22,6 +22,13 @@ "unsupported": ["nvfp4"], "notes": "MXFP4/MXFP6 are hardware-native on gfx950. FP8 uses OCP (E4M3FN) standard, not FNUZ. NVFP4 is NVIDIA-specific and will not load on ROCm." }, + "model_overrides": [ + { + "match": "openai/gpt-oss", + "env_set": {"VLLM_ROCM_USE_AITER_MOE": "0"}, + "reason": "AITER MoE kernels corrupt gpt-oss output on gfx950 (coherent reasoning but word-salad/repetition/unicode-junk final answer at any temperature) on ROCm 7.2 / vLLM 0.23.0. Only the MoE path needs to be off; AITER attention and the unified-attention backend remain enabled. AITER_FUSED_MOE_A16W4=1 does NOT fix it; VLLM_ROCM_USE_AITER_MOE=0 does. Verified on 20b and 120b." + } + ], "workarounds": [] }, "gfx942": {