Approaching-AI · rjckkkkk · Jun 8, 2026
diff --git a/catalog/models/glm-4.7-flash.yaml b/catalog/models/glm-4.7-flash.yaml
@@ -5,19 +5,44 @@ metadata:
   family: glm
   parameter_count: "30B"
   released_at: "2025-11"
+  # Scan-name alias: on-disk GGUF name, so the local model scanner matches this catalog asset instead of auto-detecting.
+  aliases:
+    - GLM-4.7-Flash-Q4_K_M
 storage:
-  formats: [safetensors]
+  formats: [safetensors, gguf]  # gguf is the format used by the llamacpp variant
   default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
   sources:
     - type: huggingface
       repo: THUDM/GLM-4.7-Flash
       format: safetensors
+    - type: huggingface  # GGUF source for the llamacpp variant
+      repo: unsloth/GLM-4.7-Flash-GGUF
+      format: gguf
     - type: modelscope
       repo: ZhipuAI/GLM-4.7-Flash
       format: safetensors
     - type: local_path
       path: ""
 variants:
+  # --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) ---
+  # GLM-4.7-Flash 30B-A3B MoE (llama.cpp arch deepseek2), Q4_K_M ~18GB, all layers offloaded.
+  - name: glm-4.7-flash-universal-llamacpp-q4
+    hardware:
+      gpu_arch: "*"  # wildcard: not tied to a specific GPU architecture
+      vram_min_mib: 0  # no dedicated-VRAM floor; the variant is gated on system RAM instead
+      ram_min_mib: 20480  # 20 GiB, above the ~18GB Q4_K_M weights
+    engine: llamacpp
+    format: gguf
+    default_config:
+      quantization: int4
+      n_gpu_layers: 999  # deliberately above the real layer count, i.e. offload every layer
+      ctx_size: 8192
+    expected_performance:
+      startup_time_s: 20
+      cold_start_time_s: 40
+      tokens_per_second: [48, 60]  # decode range; brackets the measured 58.9 tok/s
+      latency_first_token_ms: [30, 200]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (deepseek2 arch, 30B-A3B MoE) all layers offloaded (n_gpu_layers=999): 58.9 tok/s decode, 83 tok/s prompt."
   # --- GB10 (Blackwell, 128GB unified, K3S) ---
   # BF16 ~59GB, 128GB unified memory 可容纳
   # Glm4MoeLiteForCausalLM 架构 (glm4_moe_lite model_type)

diff --git a/catalog/models/qwen3.5-27b.yaml b/catalog/models/qwen3.5-27b.yaml
@@ -5,17 +5,46 @@ metadata:
   family: qwen
   parameter_count: "27B"
   released_at: "2025-09"
+  # Scan-name alias: on-disk GGUF name, so the local model scanner matches this catalog asset instead of auto-detecting.
+  aliases:
+    - Qwen3.5-27B-Q4_K_M
 storage:
-  formats: [safetensors]
+  formats: [safetensors, gguf]  # gguf is the format used by the llamacpp variant
   default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
   sources:
     - type: huggingface
       repo: Qwen/Qwen3.5-27B
       format: safetensors
+    - type: huggingface  # GGUF source for the llamacpp variant
+      repo: unsloth/Qwen3.5-27B-GGUF
+      format: gguf
     - type: modelscope
       repo: Qwen/Qwen3.5-27B
       format: safetensors
+    - type: local_path
+      path: ""
 variants:
+  # --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) ---
+  # Q4_K_M ~16.5GB, all layers offloaded. gpu_arch "*" so it matches even when
+  # HAL cannot resolve a specific arch (e.g. Windows AMD APU reports no arch).
+  - name: qwen3.5-27b-universal-llamacpp-q4
+    hardware:
+      gpu_arch: "*"  # wildcard: also matches hosts with no resolved arch
+      vram_min_mib: 0  # no dedicated-VRAM floor; the variant is gated on system RAM instead
+      ram_min_mib: 18432  # 18 GiB, above the ~16.5GB Q4_K_M weights
+    engine: llamacpp
+    format: gguf
+    default_config:
+      quantization: int4
+      n_gpu_layers: 999  # deliberately above the real layer count, i.e. offload every layer
+      ctx_size: 8192
+    expected_performance:
+      startup_time_s: 25
+      cold_start_time_s: 45
+      tokens_per_second: [11, 13]  # decode range; brackets the measured 11.7 tok/s
+      latency_first_token_ms: [40, 250]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M all layers offloaded (n_gpu_layers=999): 11.7 tok/s decode, 57 tok/s prompt."
+
   # --- generic vllm-nightly qwen3_5 image is not a validated runtime for this model ---
   - name: qwen3.5-27b-vllm-nightly-unsupported
     hardware:

diff --git a/catalog/models/qwen3.5-35b-a3b.yaml b/catalog/models/qwen3.5-35b-a3b.yaml
@@ -5,6 +5,9 @@ metadata:
   family: qwen
   parameter_count: "35B"
   released_at: "2025-12"
+  # Scan-name alias: on-disk GGUF name, so the local model scanner matches this catalog asset instead of auto-detecting.
+  aliases:
+    - Qwen3.5-35B-A3B-Q4_K_M
 storage:
   formats: [safetensors, gguf]
   default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
@@ -363,5 +366,6 @@ variants:
     expected_performance:
       startup_time_s: 30
       cold_start_time_s: 60
-      tokens_per_second: [3, 15]
+      tokens_per_second: [3, 63]
       latency_first_token_ms: [80, 400]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (35B-A3B MoE, ~3.5B active) all layers offloaded (n_gpu_layers=999): 63 tok/s decode, 101 tok/s prompt. Lower end of range is weak-device CPU-offload fallback."
diff --git a/catalog/models/qwen3.5-9b.yaml b/catalog/models/qwen3.5-9b.yaml
@@ -5,6 +5,10 @@ metadata:
   family: qwen
   parameter_count: "9B"
   released_at: "2025-09"
+  # Scan-name aliases: on-disk GGUF dir/file names so the local model scanner
+  # matches this catalog asset instead of falling back to auto-detect.
+  aliases:
+    - Qwen3.5-9B-Q4_K_M
 openclaw:
   chat_provider: false
 http:
@@ -110,3 +114,4 @@ variants:
       cold_start_time_s: 30
       tokens_per_second: [10, 40]
       latency_first_token_ms: [30, 150]
+      notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU (RDNA3.5), llama.cpp b9180 HIP, Q4_K_M all layers offloaded (n_gpu_layers=999): 33.8 tok/s decode, 206 tok/s prompt."