From fcfac5b017960d19654f624c95f2f6306cec253e Mon Sep 17 00:00:00 2001 From: rjckkkkk <59609580+rjckkkkk@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:25:38 +0000 Subject: [PATCH] Add Strix Halo llama.cpp GGUF model knowledge (aliases + verified perf) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the AMD Strix Halo (Radeon 8060S iGPU) rig, scanned on-disk GGUF model names (e.g. Qwen3.5-9B-Q4_K_M) did not match catalog metadata.name, so deploy fell back to auto-detect instead of the curated config. Add metadata.aliases so the local scanner matches, and record llama.cpp b9180 HIP verified decode perf (all 999 layers offloaded, Q4_K_M): qwen3.5-9b 33.8 tok/s (alias + verified-perf note; llamacpp variant already present) qwen3.5-27b 11.7 tok/s (added universal llamacpp variant + GGUF source) glm-4.7-flash 58.9 tok/s (added universal llamacpp variant + GGUF source) qwen3.5-35b-a3b 63.0 tok/s (alias + verified perf; variant already present) New llamacpp variants use gpu_arch "*" so they apply on any device (GGUF is the path for low-VRAM hardware). No Go changes — knowledge-only (INV-1/2). Co-Authored-By: Claude Opus 4.8 (1M context) --- catalog/models/glm-4.7-flash.yaml | 27 ++++++++++++++++++++++++- catalog/models/qwen3.5-27b.yaml | 31 ++++++++++++++++++++++++++++- catalog/models/qwen3.5-35b-a3b.yaml | 6 +++++- catalog/models/qwen3.5-9b.yaml | 5 +++++ 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/catalog/models/glm-4.7-flash.yaml b/catalog/models/glm-4.7-flash.yaml index d6404d2a..47038562 100644 --- a/catalog/models/glm-4.7-flash.yaml +++ b/catalog/models/glm-4.7-flash.yaml @@ -5,19 +5,44 @@ metadata: family: glm parameter_count: "30B" released_at: "2025-11" + # Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
+ aliases: + - GLM-4.7-Flash-Q4_K_M storage: - formats: [safetensors] + formats: [safetensors, gguf] default_path_pattern: "{{.DataDir}}/models/{{.Name}}" sources: - type: huggingface repo: THUDM/GLM-4.7-Flash format: safetensors + - type: huggingface + repo: unsloth/GLM-4.7-Flash-GGUF + format: gguf - type: modelscope repo: ZhipuAI/GLM-4.7-Flash format: safetensors - type: local_path path: "" variants: + # --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) --- + # GLM-4.7-Flash 30B-A3B MoE (llama.cpp arch deepseek2), Q4_K_M ~18GB, all layers offloaded. + - name: glm-4.7-flash-universal-llamacpp-q4 + hardware: + gpu_arch: "*" + vram_min_mib: 0 + ram_min_mib: 20480 + engine: llamacpp + format: gguf + default_config: + quantization: int4 + n_gpu_layers: 999 + ctx_size: 8192 + expected_performance: + startup_time_s: 20 + cold_start_time_s: 40 + tokens_per_second: [48, 60] + latency_first_token_ms: [30, 200] + notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (deepseek2 arch, 30B-A3B MoE) all 999 layers offloaded: 58.9 tok/s decode, 83 tok/s prompt." # --- GB10 (Blackwell, 128GB unified, K3S) --- # BF16 ~59GB, 128GB unified memory 可容纳 # Glm4MoeLiteForCausalLM 架构 (glm4_moe_lite model_type) diff --git a/catalog/models/qwen3.5-27b.yaml b/catalog/models/qwen3.5-27b.yaml index 13bb25aa..2b9aba6d 100644 --- a/catalog/models/qwen3.5-27b.yaml +++ b/catalog/models/qwen3.5-27b.yaml @@ -5,17 +5,46 @@ metadata: family: qwen parameter_count: "27B" released_at: "2025-09" + # Scan-name alias: on-disk GGUF name so the local scanner matches this asset. 
+ aliases: + - Qwen3.5-27B-Q4_K_M storage: - formats: [safetensors] + formats: [safetensors, gguf] default_path_pattern: "{{.DataDir}}/models/{{.Name}}" sources: - type: huggingface repo: Qwen/Qwen3.5-27B format: safetensors + - type: huggingface + repo: unsloth/Qwen3.5-27B-GGUF + format: gguf - type: modelscope repo: Qwen/Qwen3.5-27B format: safetensors + - type: local_path + path: "" variants: + # --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) --- + # Q4_K_M ~16.5GB, all layers offloaded. gpu_arch "*" so it matches even when + # HAL cannot resolve a specific arch (e.g. Windows AMD APU reports no arch). + - name: qwen3.5-27b-universal-llamacpp-q4 + hardware: + gpu_arch: "*" + vram_min_mib: 0 + ram_min_mib: 18432 + engine: llamacpp + format: gguf + default_config: + quantization: int4 + n_gpu_layers: 999 + ctx_size: 8192 + expected_performance: + startup_time_s: 25 + cold_start_time_s: 45 + tokens_per_second: [11, 13] + latency_first_token_ms: [40, 250] + notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M all 999 layers offloaded: 11.7 tok/s decode, 57 tok/s prompt." + # --- generic vllm-nightly qwen3_5 image is not a validated runtime for this model --- - name: qwen3.5-27b-vllm-nightly-unsupported hardware: diff --git a/catalog/models/qwen3.5-35b-a3b.yaml b/catalog/models/qwen3.5-35b-a3b.yaml index d0196a12..565bc874 100644 --- a/catalog/models/qwen3.5-35b-a3b.yaml +++ b/catalog/models/qwen3.5-35b-a3b.yaml @@ -5,6 +5,9 @@ metadata: family: qwen parameter_count: "35B" released_at: "2025-12" + # Scan-name alias: on-disk GGUF name so the local scanner matches this asset. 
+ aliases: + - Qwen3.5-35B-A3B-Q4_K_M storage: formats: [safetensors, gguf] default_path_pattern: "{{.DataDir}}/models/{{.Name}}" @@ -363,5 +366,6 @@ variants: expected_performance: startup_time_s: 30 cold_start_time_s: 60 - tokens_per_second: [3, 15] + tokens_per_second: [3, 63] latency_first_token_ms: [80, 400] + notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (35B-A3B MoE, ~3.5B active) all 999 layers offloaded: 63 tok/s decode, 101 tok/s prompt. Lower end of range is weak-device CPU-offload fallback." diff --git a/catalog/models/qwen3.5-9b.yaml b/catalog/models/qwen3.5-9b.yaml index 4be24dd0..f70beb81 100644 --- a/catalog/models/qwen3.5-9b.yaml +++ b/catalog/models/qwen3.5-9b.yaml @@ -5,6 +5,10 @@ metadata: family: qwen parameter_count: "9B" released_at: "2025-09" + # Scan-name aliases: on-disk GGUF dir/file names so the local model scanner + # matches this catalog asset instead of falling back to auto-detect. + aliases: + - Qwen3.5-9B-Q4_K_M openclaw: chat_provider: false http: @@ -110,3 +114,4 @@ variants: cold_start_time_s: 30 tokens_per_second: [10, 40] latency_first_token_ms: [30, 150] + notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU (RDNA3.5), llama.cpp b9180 HIP, Q4_K_M all 999 layers offloaded: 33.8 tok/s decode, 206 tok/s prompt."