Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion catalog/models/glm-4.7-flash.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,44 @@ metadata:
family: glm
parameter_count: "30B"
released_at: "2025-11"
# Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
aliases:
- GLM-4.7-Flash-Q4_K_M
storage:
formats: [safetensors]
formats: [safetensors, gguf]
default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
sources:
- type: huggingface
repo: THUDM/GLM-4.7-Flash
format: safetensors
- type: huggingface
repo: unsloth/GLM-4.7-Flash-GGUF
format: gguf
- type: modelscope
repo: ZhipuAI/GLM-4.7-Flash
format: safetensors
- type: local_path
path: ""
variants:
# --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) ---
# GLM-4.7-Flash 30B-A3B MoE (llama.cpp arch deepseek2), Q4_K_M ~18GB, all layers offloaded.
- name: glm-4.7-flash-universal-llamacpp-q4
hardware:
gpu_arch: "*"
vram_min_mib: 0
ram_min_mib: 20480
engine: llamacpp
format: gguf
default_config:
quantization: int4
n_gpu_layers: 999
ctx_size: 8192
expected_performance:
startup_time_s: 20
cold_start_time_s: 40
tokens_per_second: [48, 60]
latency_first_token_ms: [30, 200]
notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (deepseek2 arch, 30B-A3B MoE) all 999 layers offloaded: 58.9 tok/s decode, 83 tok/s prompt."
# --- GB10 (Blackwell, 128GB unified, K3S) ---
# BF16 ~59GB, fits in 128GB unified memory
# Glm4MoeLiteForCausalLM architecture (glm4_moe_lite model_type)
Expand Down
31 changes: 30 additions & 1 deletion catalog/models/qwen3.5-27b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,46 @@ metadata:
family: qwen
parameter_count: "27B"
released_at: "2025-09"
# Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
aliases:
- Qwen3.5-27B-Q4_K_M
storage:
formats: [safetensors]
formats: [safetensors, gguf]
default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
sources:
- type: huggingface
repo: Qwen/Qwen3.5-27B
format: safetensors
- type: huggingface
repo: unsloth/Qwen3.5-27B-GGUF
format: gguf
- type: modelscope
repo: Qwen/Qwen3.5-27B
format: safetensors
- type: local_path
path: ""
variants:
# --- universal llamacpp GGUF (verified on AMD Strix Halo Radeon 8060S iGPU) ---
# Q4_K_M ~16.5GB, all layers offloaded. gpu_arch "*" so it matches even when
# HAL cannot resolve a specific arch (e.g. Windows AMD APU reports no arch).
- name: qwen3.5-27b-universal-llamacpp-q4
hardware:
gpu_arch: "*"
vram_min_mib: 0
ram_min_mib: 18432
engine: llamacpp
format: gguf
default_config:
quantization: int4
n_gpu_layers: 999
ctx_size: 8192
expected_performance:
startup_time_s: 25
cold_start_time_s: 45
tokens_per_second: [11, 13]
latency_first_token_ms: [40, 250]
notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M all 999 layers offloaded: 11.7 tok/s decode, 57 tok/s prompt."

# --- generic vllm-nightly qwen3_5 image is not a validated runtime for this model ---
- name: qwen3.5-27b-vllm-nightly-unsupported
hardware:
Expand Down
6 changes: 5 additions & 1 deletion catalog/models/qwen3.5-35b-a3b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ metadata:
family: qwen
parameter_count: "35B"
released_at: "2025-12"
# Scan-name alias: on-disk GGUF name so the local scanner matches this asset.
aliases:
- Qwen3.5-35B-A3B-Q4_K_M
storage:
formats: [safetensors, gguf]
default_path_pattern: "{{.DataDir}}/models/{{.Name}}"
Expand Down Expand Up @@ -363,5 +366,6 @@ variants:
expected_performance:
startup_time_s: 30
cold_start_time_s: 60
tokens_per_second: [3, 15]
tokens_per_second: [3, 63]
latency_first_token_ms: [80, 400]
notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU, llama.cpp b9180 HIP, Q4_K_M (35B-A3B MoE, ~3.5B active) all 999 layers offloaded: 63 tok/s decode, 101 tok/s prompt. Lower end of range is weak-device CPU-offload fallback."
5 changes: 5 additions & 0 deletions catalog/models/qwen3.5-9b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ metadata:
family: qwen
parameter_count: "9B"
released_at: "2025-09"
# Scan-name aliases: on-disk GGUF dir/file names so the local model scanner
# matches this catalog asset instead of falling back to auto-detect.
aliases:
- Qwen3.5-9B-Q4_K_M
openclaw:
chat_provider: false
http:
Expand Down Expand Up @@ -110,3 +114,4 @@ variants:
cold_start_time_s: 30
tokens_per_second: [10, 40]
latency_first_token_ms: [30, 150]
notes: "Verified 2026-06-08 on AMD Strix Halo Radeon 8060S iGPU (RDNA3.5), llama.cpp b9180 HIP, Q4_K_M all 999 layers offloaded: 33.8 tok/s decode, 206 tok/s prompt."
Loading