From fcfac5b017960d19654f624c95f2f6306cec253e Mon Sep 17 00:00:00 2001
From: rjckkkkk <59609580+rjckkkkk@users.noreply.github.com>
Date: Mon, 8 Jun 2026 15:25:38 +0000
Subject: [PATCH] Add Strix Halo llama.cpp GGUF model knowledge (aliases +
 verified perf)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On the AMD Strix Halo (Radeon 8060S iGPU) rig, scanned on-disk GGUF model
names (e.g. Qwen3.5-9B-Q4_K_M) did not match catalog metadata.name, so
deploy fell back to auto-detect instead of the curated config. Add
metadata.aliases so the local scanner matches, and record llama.cpp b9180
HIP verified decode perf (Q4_K_M, all layers offloaded via n_gpu_layers=999):

  qwen3.5-9b       33.8 tok/s   (alias + verified perf; variant already present)
  qwen3.5-27b      11.7 tok/s   (added universal llamacpp variant + GGUF source)
  glm-4.7-flash    58.9 tok/s   (added universal llamacpp variant + GGUF source)
  qwen3.5-35b-a3b  63.0 tok/s   (alias + verified perf; variant already present)

New llamacpp variants use gpu_arch "*" and vram_min_mib 0, so they apply on any
device that meets the variant's ram_min_mib (GGUF is the path for low-VRAM
hardware). No Go changes — knowledge-only (INV-1/2).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 catalog/models/glm-4.7-flash.yaml   | 27 ++++++++++++++++++++++++-
 catalog/models/qwen3.5-27b.yaml     | 31 ++++++++++++++++++++++++++++-
 catalog/models/qwen3.5-35b-a3b.yaml |  6 +++++-
 catalog/models/qwen3.5-9b.yaml      |  5 +++++
 4 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/catalog/models/glm-4.7-flash.yaml b/catalog/models/glm-4.7-flash.yaml
index d6404d2a..47038562 100644
--- a/catalog/models/glm-4.7-flash.yaml
+++ b/catalog/models/glm-4.7-flash.yaml
@@ -5,19 +5,44 @@ metadata:
   family: glm
   parameter_count: "30B"
   released_at: "2025-11"
+  # Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
+  aliases:
+    - GLM-4.7-Flash-Q4_K_M
 storage:
-  formats: [safetensors]
+  formats: [safetensors, gguf]
   default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
   sources:
     - type: huggingface
       repo: THUDM/GLM-4.7-Flash
       format: safetensors
+    - type: huggingface
+      repo: unsloth/GLM-4.7-Flash-GGUF
+      format: gguf
     - type: modelscope
       repo: ZhipuAI/GLM-4.7-Flash
       format: safetensors
     - type: local_path
       path: ""
 variants:
+  # --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) ---
+  # GLM-4.7-Flash 30B-A3B MoE (llama.cpp arch deepseek2), Q4_K_M ~18GB, all layers offloaded.
+  - name: glm-4.7-flash-universal-llamacpp-q4
+    hardware:
+      gpu_arch: "*"
+      vram_min_mib: 0
+      ram_min_mib: 20480
+    engine: llamacpp
+    format: gguf
+    default_config:
+      quantization: int4
+      n_gpu_layers: 999
+      ctx_size: 8192
+    expected_performance:
+      startup_time_s: 20
+      cold_start_time_s: 40
+      tokens_per_second: [48, 60]
+      latency_first_token_ms: [30, 200]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (deepseek2 arch, 30B-A3B MoE) all 999 layers offloaded: 58.9 tok/s decode, 83 tok/s prompt."
   # --- GB10 (Blackwell, 128GB unified, K3S) ---
   # BF16 ~59GB, 128GB unified memory 可容纳
   # Glm4MoeLiteForCausalLM 架构 (glm4_moe_lite model_type)
diff --git a/catalog/models/qwen3.5-27b.yaml b/catalog/models/qwen3.5-27b.yaml
index 13bb25aa..2b9aba6d 100644
--- a/catalog/models/qwen3.5-27b.yaml
+++ b/catalog/models/qwen3.5-27b.yaml
@@ -5,17 +5,46 @@ metadata:
   family: qwen
   parameter_count: "27B"
   released_at: "2025-09"
+  # Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
+  aliases:
+    - Qwen3.5-27B-Q4_K_M
 storage:
-  formats: [safetensors]
+  formats: [safetensors, gguf]
   default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
   sources:
     - type: huggingface
       repo: Qwen/Qwen3.5-27B
       format: safetensors
+    - type: huggingface
+      repo: unsloth/Qwen3.5-27B-GGUF
+      format: gguf
     - type: modelscope
       repo: Qwen/Qwen3.5-27B
       format: safetensors
+    - type: local_path
+      path: ""
 variants:
+  # --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) ---
+  # Q4_K_M ~16.5GB, all layers offloaded. gpu_arch "*" so it matches even when
+  # HAL cannot resolve a specific arch (e.g. Windows AMD APU reports no arch).
+  - name: qwen3.5-27b-universal-llamacpp-q4
+    hardware:
+      gpu_arch: "*"
+      vram_min_mib: 0
+      ram_min_mib: 18432
+    engine: llamacpp
+    format: gguf
+    default_config:
+      quantization: int4
+      n_gpu_layers: 999
+      ctx_size: 8192
+    expected_performance:
+      startup_time_s: 25
+      cold_start_time_s: 45
+      tokens_per_second: [11, 13]
+      latency_first_token_ms: [40, 250]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M all 999 layers offloaded: 11.7 tok/s decode, 57 tok/s prompt."
+
   # --- generic vllm-nightly qwen3_5 image is not a validated runtime for this model ---
   - name: qwen3.5-27b-vllm-nightly-unsupported
     hardware:
diff --git a/catalog/models/qwen3.5-35b-a3b.yaml b/catalog/models/qwen3.5-35b-a3b.yaml
index d0196a12..565bc874 100644
--- a/catalog/models/qwen3.5-35b-a3b.yaml
+++ b/catalog/models/qwen3.5-35b-a3b.yaml
@@ -5,6 +5,9 @@ metadata:
   family: qwen
   parameter_count: "35B"
   released_at: "2025-12"
+  # Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
+  aliases:
+    - Qwen3.5-35B-A3B-Q4_K_M
 storage:
   formats: [safetensors, gguf]
   default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
@@ -363,5 +366,6 @@ variants:
     expected_performance:
       startup_time_s: 30
       cold_start_time_s: 60
-      tokens_per_second: [3, 15]
+      tokens_per_second: [3, 63]
       latency_first_token_ms: [80, 400]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (35B-A3B MoE, ~3.5B active) all 999 layers offloaded: 63 tok/s decode, 101 tok/s prompt. Lower end of range is weak-device CPU-offload fallback."
diff --git a/catalog/models/qwen3.5-9b.yaml b/catalog/models/qwen3.5-9b.yaml
index 4be24dd0..f70beb81 100644
--- a/catalog/models/qwen3.5-9b.yaml
+++ b/catalog/models/qwen3.5-9b.yaml
@@ -5,6 +5,10 @@ metadata:
   family: qwen
   parameter_count: "9B"
   released_at: "2025-09"
+  # Scan-name aliases: on-disk GGUF dir/file names so the local model scanner
+  # matches this catalog asset instead of falling back to auto-detect.
+  aliases:
+    - Qwen3.5-9B-Q4_K_M
 openclaw:
   chat_provider: false
 http:
@@ -110,3 +114,4 @@ variants:
       cold_start_time_s: 30
       tokens_per_second: [10, 40]
       latency_first_token_ms: [30, 150]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU (RDNA3.5), llama.cpp b9180 HIP, Q4_K_M all 999 layers offloaded: 33.8 tok/s decode, 206 tok/s prompt."