Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CLAUDE.md

Large diffs are not rendered by default.

108 changes: 76 additions & 32 deletions backend_service/catalog/text_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,20 @@
"popularityLabel": "Featured family",
"likesLabel": "Qwen official",
"badges": ["Reasoning", "Coding", "Agents", "Long context"],
# FU-040 (2026-05-10): dropped ``vision`` from the family-level
# capabilities. Qwen3.6-27B (dense, Coder-Next branding) and
# Qwen3.6-35B-A3B (MoE) are both text-only — vision lives on a
# separate ``Qwen3.6-27B-VL`` variant we do not yet ship. The
# stale tag was promoting ``supportsVision: true`` for every
# community quant variant, which made ``ChatComposer`` render
# the "Attach image" affordance for a model that has no vision
# encoder. Add it back here only when an actual VL variant
# lands in the catalog.
"capabilities": ["reasoning", "coding", "tool-use"],
# FU-072 (2026-05-28): restored ``vision``. FU-040 (2026-05-10)
# had dropped it believing Qwen3.6 was text-only with vision on a
# separate ``Qwen3.6-27B-VL`` we don't ship. Upstream has since
# unified the family onto the multimodal ``Qwen3_5ForConditional
# Generation`` arch — every Qwen3.6 config.json now carries
# ``vision_config`` + ``image_token_id`` + ``vision_start/end``,
# mlx-vlm ships ``qwen3_5`` / ``qwen3_5_moe`` model support, and
# the ggml-org GGUF packs include an ``mmproj`` sibling. The base
# model IS the VL now. The composer "Attach image" button stays
# safe regardless: it reads the *runtime* ``supportsVision`` which
# ``catalog/capabilities.py`` demotes to False for the MLX worker
# (no image path) and gates on actual ``--mmproj`` resolution for
# GGUF, so a vision badge never produces a broken button.
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"defaultVariantId": "Qwen/Qwen3.6-27B",
"variants": [
{
Expand All @@ -124,8 +128,8 @@
"sizeGb": 54.0,
"format": "Transformers",
"quantization": "BF16",
# FU-040: text-only dense variant (Coder-Next branding).
"capabilities": ["reasoning", "coding", "tool-use"],
# FU-072: multimodal (vision_config present upstream).
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"note": "Dense 27B Qwen3.6 release with agentic coding tuning. Apache 2.0.",
"contextWindow": "262K",
"launchMode": "convert",
Expand All @@ -141,8 +145,8 @@
"sizeGb": 28.0,
"format": "Transformers",
"quantization": "FP8",
# FU-040: text-only dense variant.
"capabilities": ["reasoning", "coding", "tool-use"],
# FU-072: multimodal (vision_config present upstream).
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"note": "FP8 quantization of the 27B dense release for ~30 GB VRAM systems.",
"contextWindow": "262K",
"launchMode": "convert",
Expand All @@ -158,7 +162,7 @@
"sizeGb": 70.0,
"format": "Transformers",
"quantization": "BF16",
"capabilities": ["reasoning", "coding", "agents", "tool-use"],
"capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
"note": "MoE A3B variant — 35B total params, ~3B active per token. Apache 2.0.",
"contextWindow": "262K",
"launchMode": "convert",
Expand All @@ -174,8 +178,8 @@
"sizeGb": 15.5,
"format": "MLX",
"quantization": "4-bit",
# FU-040: text-only dense variant.
"capabilities": ["reasoning", "coding", "tool-use"],
# FU-072: multimodal (vision_config present upstream).
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"note": "Community MLX 4-bit conversion for Apple Silicon — fastest local launch path.",
"contextWindow": "262K",
"launchMode": "direct",
Expand All @@ -191,7 +195,7 @@
"sizeGb": 20.0,
"format": "MLX",
"quantization": "4-bit",
"capabilities": ["reasoning", "coding", "agents", "tool-use"],
"capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
"note": "MoE 4-bit MLX conversion — sparse activation keeps memory close to a 4B model.",
"contextWindow": "262K",
"launchMode": "direct",
Expand All @@ -207,7 +211,7 @@
"sizeGb": 16.5,
"format": "GGUF",
"quantization": "Q4_K_M",
"capabilities": ["reasoning", "coding", "tool-use"],
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"note": "Community GGUF pack quantized via llama.cpp b8883 for cross-platform llama.cpp runs.",
"contextWindow": "262K",
"launchMode": "direct",
Expand All @@ -223,7 +227,7 @@
"sizeGb": 21.0,
"format": "GGUF",
"quantization": "Q4_K_M",
"capabilities": ["reasoning", "coding", "agents", "tool-use"],
"capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
"note": "MoE GGUF (llama.cpp b8814) — runs the 35B sparse model through standard llama-server.",
"contextWindow": "262K",
"launchMode": "direct",
Expand All @@ -244,8 +248,8 @@
"sizeGb": 29.0,
"format": "GGUF",
"quantization": "Q8_0",
"capabilities": ["reasoning", "coding", "tool-use"],
"note": "Baked-in MTP heads. Pair with --spec-type draft-mtp for 1.8-2.2x speedup with zero quality loss.",
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"note": "Baked-in MTP heads + mmproj sibling for vision. Pair with --spec-type draft-mtp for 1.8-2.2x speedup with zero quality loss.",
"contextWindow": "262K",
"launchMode": "direct",
"backend": "llama.cpp",
Expand All @@ -260,8 +264,44 @@
"sizeGb": 37.0,
"format": "GGUF",
"quantization": "Q8_0",
"capabilities": ["reasoning", "coding", "agents", "tool-use"],
"note": "MoE with baked-in MTP heads. --spec-type draft-mtp speedup compounds with the sparse activation savings.",
"capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
"note": "MoE with baked-in MTP heads + mmproj sibling for vision. --spec-type draft-mtp speedup compounds with the sparse activation savings.",
"contextWindow": "262K",
"launchMode": "direct",
"backend": "llama.cpp",
"releaseDate": "2026-05",
},
{
# FU-064: ggml-org canonical non-MTP companion (2026-05-22).
# Same Q8_0 quality bar as the MTP variant but without the
# baked-in MTP heads — for users on llama.cpp builds that
# predate PR #22673 or who don't want speculative decoding.
"id": "ggml-org/Qwen3.6-27B-GGUF",
"name": "Qwen3.6 27B GGUF (ggml-org Q8_0)",
"repo": "ggml-org/Qwen3.6-27B-GGUF",
"link": "https://huggingface.co/ggml-org/Qwen3.6-27B-GGUF",
"paramsB": 27.0,
"sizeGb": 29.0,
"format": "GGUF",
"quantization": "Q8_0",
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"note": "ggml-org canonical Q8_0 pack + mmproj sibling for vision. No MTP heads — pick the -MTP-GGUF sibling for speculative decoding.",
"contextWindow": "262K",
"launchMode": "direct",
"backend": "llama.cpp",
"releaseDate": "2026-05",
},
{
"id": "ggml-org/Qwen3.6-35B-A3B-GGUF",
"name": "Qwen3.6 35B A3B GGUF (ggml-org Q8_0)",
"repo": "ggml-org/Qwen3.6-35B-A3B-GGUF",
"link": "https://huggingface.co/ggml-org/Qwen3.6-35B-A3B-GGUF",
"paramsB": 35.0,
"sizeGb": 37.0,
"format": "GGUF",
"quantization": "Q8_0",
"capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
"note": "ggml-org canonical Q8_0 MoE pack + mmproj sibling for vision. No MTP heads — pick the -MTP-GGUF sibling for speculative decoding.",
"contextWindow": "262K",
"launchMode": "direct",
"backend": "llama.cpp",
Expand All @@ -288,10 +328,14 @@
"popularityLabel": "Featured family",
"likesLabel": "Qwen official",
"badges": ["Reasoning", "Coding", "Long context"],
# FU-040: Qwen3.5 dense + MoE variants are text-only. The
# ``vision`` tag at family-level was promoting false positives
# in ``supportsVision`` for every community quant variant.
"capabilities": ["reasoning", "coding", "tool-use"],
# FU-072: Qwen3.5 is multimodal upstream (Qwen3_5ForConditional
# Generation + vision_config; mlx-vlm ships qwen3_5 support).
# FU-040 had marked the family text-only — now corrected. The
# runtime ``supportsVision`` is still demoted per-engine in
# catalog/capabilities.py (MLX worker carries no images; GGUF
# gates on mmproj), so the family vision tag drives badges only,
# never a broken composer button.
"capabilities": ["reasoning", "coding", "vision", "tool-use"],
"defaultVariantId": "Qwen/Qwen3.5-9B",
"variants": [
{
Expand All @@ -303,8 +347,8 @@
"sizeGb": 5.1,
"format": "Transformers",
"quantization": "BF16",
"capabilities": ["reasoning", "coding", "tool-use"],
"note": "Smaller Qwen 3.5 variant with strong utility for everyday local work.",
"capabilities": ["reasoning", "coding", "vision", "video", "tool-use"],
"note": "Smaller Qwen 3.5 variant with strong utility for everyday local work. Multimodal (image + video) like its 9B sibling.",
"contextWindow": "262K",
"launchMode": "convert",
"backend": "mlx",
Expand Down Expand Up @@ -348,8 +392,8 @@
"sizeGb": 5.8,
"format": "GGUF",
"quantization": "Q4_K_M",
"capabilities": ["reasoning", "coding", "tool-use"],
"note": "Community GGUF pack with ready-made quantizations for quick llama.cpp runs.",
"capabilities": ["reasoning", "coding", "vision", "video", "tool-use"],
"note": "Community GGUF pack with ready-made quantizations for quick llama.cpp runs. Vision needs the mmproj sibling on disk.",
"contextWindow": "262K",
"launchMode": "direct",
"backend": "llama.cpp",
Expand Down
182 changes: 182 additions & 0 deletions backend_service/helpers/hf_resolve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""Resolve an arbitrary Hugging Face repo into a loadable descriptor (#5).

Lets a user paste any GGUF / MLX repo and run it without a curated
catalog row. The previous behaviour (FU-041) fuzzy-matched off-catalog
repos against the nearest catalog entry, picking up the wrong context
window, capabilities, and DFlash drafter. This module instead reads the
repo's own file list + ``config.json`` and synthesises a descriptor, and
the caller passes ``canonicalRepo=<repo>`` to ``load_model`` so
``_resolve_canonical_repo`` returns it verbatim — no fuzzy match.

``resolve_hf_model`` is pure (no network): it takes the already-fetched
file list and optional parsed ``config.json``. The route layer fetches
those via ``_hub_repo_files`` + a best-effort ``config.json`` read.
"""

from __future__ import annotations

from typing import Any

# GGUF quantization preference when a repo ships several. Quality/size
# sweet spots first; everything unlisted sorts last but is still runnable.
# Tags are matched as lowercase substrings of the file path, and a tag's
# index in this tuple is its rank (lower index = preferred).
_GGUF_QUANT_PRIORITY = (
    "q4_k_m",
    "q5_k_m",
    "q4_k_s",
    "q5_k_s",
    "q8_0",
    "q6_k",
    "q4_0",
    "q3_k_m",
    "iq4_nl",
)

# Context length used when ``config.json`` yields no usable value.
_DEFAULT_CONTEXT = 8192
# Lower / upper bounds that a config-derived context length is clamped to.
_MIN_CONTEXT = 2048
_MAX_CONTEXT = 131072

# config.json keys that carry the trained context length, most-specific
# first. Keys are probed in this order and the first positive numeric
# value wins.
_CONTEXT_KEYS = ("max_position_embeddings", "n_positions", "max_seq_len", "n_ctx")


def _is_gguf(path: str) -> bool:
return path.lower().endswith(".gguf")


def _gguf_score(path: str) -> tuple[int, int]:
    """Rank a GGUF path for selection; smaller tuples are preferred.

    Returns ``(quant_rank, shard_penalty)``:

    * ``quant_rank`` is the index of the first ``_GGUF_QUANT_PRIORITY``
      tag found in the lowercased path, or ``len(_GGUF_QUANT_PRIORITY)``
      when no tag matches.
    * ``shard_penalty`` is 0 for an unsharded file, 1 for the first shard
      of a split set, and 2 for any later shard.
    """
    name = path.lower()

    # First matching tag in priority order decides the rank; unlisted
    # quantizations all share the worst rank.
    quant_rank = next(
        (rank for rank, tag in enumerate(_GGUF_QUANT_PRIORITY) if tag in name),
        len(_GGUF_QUANT_PRIORITY),
    )

    # Prefer a single-file model. Among shards only the first one is a
    # valid entry point for llama.cpp, so later shards rank worst.
    if "-of-" not in name:
        shard_penalty = 0
    elif "00001-of-" in name:
        shard_penalty = 1
    else:
        shard_penalty = 2

    return (quant_rank, shard_penalty)


def _pick_gguf(gguf_paths: list[str], requested_file: str | None) -> str | None:
if not gguf_paths:
return None
if requested_file and requested_file in gguf_paths:
return requested_file
# Drop non-first shards from contention; if every candidate is a
# non-first shard (unusual), fall back to the full list.
primary = [p for p in gguf_paths if "-of-" not in p.lower() or "00001-of-" in p.lower()]
pool = primary or gguf_paths
return sorted(pool, key=_gguf_score)[0]


def _context_from_config(config: dict[str, Any] | None) -> int | None:
if not isinstance(config, dict):
return None
# Some multimodal configs nest the LM config under text_config.
sources = [config]
text_cfg = config.get("text_config")
if isinstance(text_cfg, dict):
sources.append(text_cfg)
for src in sources:
for key in _CONTEXT_KEYS:
value = src.get(key)
if isinstance(value, (int, float)) and value > 0:
return int(value)
return None


def _infer_capabilities(config: dict[str, Any] | None, has_mmproj: bool) -> dict[str, bool]:
vision = has_mmproj
if isinstance(config, dict):
if config.get("vision_config") or config.get("image_token_id") is not None:
vision = True
return {"text": True, "vision": bool(vision)}


def _gguf_shard_group(picked: str, gguf_paths: list[str]) -> list[str]:
    """Return every path in ``gguf_paths`` that is a shard of ``picked``'s split set.

    Split GGUF files are named ``<stem>-<index>-of-<total>.gguf``. For an
    unsharded ``picked`` (or when nothing matches) the result is just
    ``[picked]``.
    """
    head, sep, tail = picked.lower().rpartition("-of-")
    if not sep:
        return [picked]
    # ``head`` ends with the shard index; strip it to get the shared stem.
    stem = head.rstrip("0123456789")
    suffix = sep + tail
    group: list[str] = []
    for path in gguf_paths:
        lowered = path.lower()
        if not (lowered.startswith(stem) and lowered.endswith(suffix)):
            continue
        index = lowered[len(stem):len(lowered) - len(suffix)]
        if index.isdigit():
            group.append(path)
    # De-duplicate while keeping order so repeated records are not
    # counted twice.
    return list(dict.fromkeys(group)) or [picked]


def resolve_hf_model(
    repo: str,
    *,
    files: list[dict[str, Any]],
    config: dict[str, Any] | None = None,
    requested_file: str | None = None,
) -> dict[str, Any]:
    """Synthesise a loadable descriptor for an arbitrary HF repo.

    ``files`` are records as produced by ``_hub_repo_files`` siblings:
    ``{"path", "sizeBytes", "kind"}``. ``config`` is the parsed
    ``config.json`` when available. Never raises for a well-formed file
    list; surfaces uncertainty via ``warnings``.

    Args:
        repo: Hugging Face repo id, e.g. ``"org/name"``.
        files: File records for the repo (see above).
        config: Parsed ``config.json``, or ``None`` when unavailable.
        requested_file: A specific GGUF file to load, honoured when it is
            present in the repo.

    Returns:
        A descriptor dict with keys ``repo``, ``ref``, ``label``,
        ``backend`` (``"llama.cpp"``, ``"mlx"``, ``"vllm"`` or
        ``"unknown"``), ``ggufFile``, ``contextTokens``, ``capabilities``,
        ``sizeBytes``, ``family``, ``custom`` and ``warnings``.
    """
    paths = [str(f.get("path") or "") for f in files if f.get("path")]
    size_by_path = {str(f.get("path") or ""): int(f.get("sizeBytes") or 0) for f in files}

    gguf_paths = [p for p in paths if _is_gguf(p)]
    safetensors_paths = [p for p in paths if p.lower().endswith(".safetensors")]
    has_mmproj = any("mmproj" in p.lower() for p in paths)

    warnings: list[str] = []
    gguf_file: str | None = None

    if gguf_paths:
        backend = "llama.cpp"
        gguf_file = _pick_gguf(gguf_paths, requested_file)
        # A sharded model is loaded through its first shard, but its real
        # footprint is the whole split set, so total every sibling shard
        # rather than reporting the entry-point file alone.
        shard_group = _gguf_shard_group(gguf_file or "", gguf_paths)
        size_bytes = sum(size_by_path.get(p, 0) for p in shard_group)
        if not size_bytes:
            size_bytes = sum(size_by_path.get(p, 0) for p in gguf_paths)
    elif repo.startswith("mlx-community/") or _looks_like_mlx(config):
        backend = "mlx"
        size_bytes = sum(size_by_path.get(p, 0) for p in safetensors_paths)
    elif safetensors_paths:
        # Raw (non-MLX) safetensors: runnable only via a CUDA backend or
        # after conversion. Surface it honestly rather than guessing.
        backend = "vllm"
        size_bytes = sum(size_by_path.get(p, 0) for p in safetensors_paths)
        warnings.append(
            "This repo ships raw safetensors weights (no GGUF, not an MLX conversion). "
            "On Apple Silicon, convert it to MLX or pick a GGUF mirror; the vLLM backend "
            "is CUDA-only."
        )
    else:
        backend = "unknown"
        size_bytes = sum(size_by_path.values())
        warnings.append("No GGUF or safetensors weights found in this repo.")

    ctx_from_config = _context_from_config(config)
    if ctx_from_config is not None:
        # Clamp whatever the config advertises into the supported range.
        context_tokens = max(_MIN_CONTEXT, min(_MAX_CONTEXT, ctx_from_config))
    else:
        context_tokens = _DEFAULT_CONTEXT
        if backend == "llama.cpp":
            warnings.append(
                f"Context length not read from metadata; defaulting to {_DEFAULT_CONTEXT}. "
                "Adjust in launch settings if the model supports more."
            )

    return {
        "repo": repo,
        "ref": repo,
        "label": repo.split("/")[-1],
        "backend": backend,
        "ggufFile": gguf_file,
        "contextTokens": context_tokens,
        "capabilities": _infer_capabilities(config, has_mmproj),
        "sizeBytes": size_bytes,
        "family": "custom",
        "custom": True,
        "warnings": warnings,
    }


def _looks_like_mlx(config: dict[str, Any] | None) -> bool:
"""Heuristic: an MLX-converted repo carries an MLX quantization stanza."""
if not isinstance(config, dict):
return False
if "quantization" in config and isinstance(config["quantization"], dict):
# mlx-lm writes {"group_size": N, "bits": M} under "quantization".
q = config["quantization"]
if "group_size" in q or "bits" in q:
return True
return False
Loading