cryptopoly · cryptopoly · Jun 2, 2026 · May 29, 2026 · Jun 1, 2026 · Jun 2, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py
@@ -103,16 +103,20 @@
         "popularityLabel": "Featured family",
         "likesLabel": "Qwen official",
         "badges": ["Reasoning", "Coding", "Agents", "Long context"],
-        # FU-040 (2026-05-10): dropped ``vision`` from the family-level
-        # capabilities. Qwen3.6-27B (dense, Coder-Next branding) and
-        # Qwen3.6-35B-A3B (MoE) are both text-only — vision lives on a
-        # separate ``Qwen3.6-27B-VL`` variant we do not yet ship. The
-        # stale tag was promoting ``supportsVision: true`` for every
-        # community quant variant, which made ``ChatComposer`` render
-        # the "Attach image" affordance for a model that has no vision
-        # encoder. Add it back here only when an actual VL variant
-        # lands in the catalog.
-        "capabilities": ["reasoning", "coding", "tool-use"],
+        # FU-072 (2026-05-28): restored ``vision``. FU-040 (2026-05-10)
+        # had dropped it believing Qwen3.6 was text-only with vision on a
+        # separate ``Qwen3.6-27B-VL`` we don't ship. Upstream has since unified
+        # the family onto the multimodal ``Qwen3_5ForConditionalGeneration``
+        # arch — every Qwen3.6 config.json now carries
+        # ``vision_config`` + ``image_token_id`` + ``vision_start/end``,
+        # mlx-vlm ships ``qwen3_5`` / ``qwen3_5_moe`` model support, and
+        # the ggml-org GGUF packs include an ``mmproj`` sibling. The base
+        # model IS the VL now. The composer "Attach image" button stays
+        # safe regardless: it reads the *runtime* ``supportsVision`` which
+        # ``catalog/capabilities.py`` demotes to False for the MLX worker
+        # (no image path) and gates on actual ``--mmproj`` resolution for
+        # GGUF, so a vision badge never produces a broken button.
+        "capabilities": ["reasoning", "coding", "vision", "tool-use"],
         "defaultVariantId": "Qwen/Qwen3.6-27B",
         "variants": [
             {
@@ -124,8 +128,8 @@
                 "sizeGb": 54.0,
                 "format": "Transformers",
                 "quantization": "BF16",
-                # FU-040: text-only dense variant (Coder-Next branding).
-                "capabilities": ["reasoning", "coding", "tool-use"],
+                # FU-072: multimodal (vision_config present upstream).
+                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
                 "note": "Dense 27B Qwen3.6 release with agentic coding tuning. Apache 2.0.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
@@ -141,8 +145,8 @@
                 "sizeGb": 28.0,
                 "format": "Transformers",
                 "quantization": "FP8",
-                # FU-040: text-only dense variant.
-                "capabilities": ["reasoning", "coding", "tool-use"],
+                # FU-072: multimodal (vision_config present upstream).
+                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
                 "note": "FP8 quantization of the 27B dense release for ~30 GB VRAM systems.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
@@ -158,7 +162,7 @@
                 "sizeGb": 70.0,
                 "format": "Transformers",
                 "quantization": "BF16",
-                "capabilities": ["reasoning", "coding", "agents", "tool-use"],
+                "capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
                 "note": "MoE A3B variant — 35B total params, ~3B active per token. Apache 2.0.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
@@ -174,8 +178,8 @@
                 "sizeGb": 15.5,
                 "format": "MLX",
                 "quantization": "4-bit",
-                # FU-040: text-only dense variant.
-                "capabilities": ["reasoning", "coding", "tool-use"],
+                # FU-072: multimodal (vision_config present upstream).
+                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
                 "note": "Community MLX 4-bit conversion for Apple Silicon — fastest local launch path.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
@@ -191,7 +195,7 @@
                 "sizeGb": 20.0,
                 "format": "MLX",
                 "quantization": "4-bit",
-                "capabilities": ["reasoning", "coding", "agents", "tool-use"],
+                "capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
                 "note": "MoE 4-bit MLX conversion — sparse activation keeps memory close to a 4B model.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
@@ -207,7 +211,7 @@
                 "sizeGb": 16.5,
                 "format": "GGUF",
                 "quantization": "Q4_K_M",
-                "capabilities": ["reasoning", "coding", "tool-use"],
+                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
                 "note": "Community GGUF pack quantized via llama.cpp b8883 for cross-platform llama.cpp runs.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
@@ -223,7 +227,7 @@
                 "sizeGb": 21.0,
                 "format": "GGUF",
                 "quantization": "Q4_K_M",
-                "capabilities": ["reasoning", "coding", "agents", "tool-use"],
+                "capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
                 "note": "MoE GGUF (llama.cpp b8814) — runs the 35B sparse model through standard llama-server.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
@@ -244,8 +248,8 @@
                 "sizeGb": 29.0,
                 "format": "GGUF",
                 "quantization": "Q8_0",
-                "capabilities": ["reasoning", "coding", "tool-use"],
-                "note": "Baked-in MTP heads. Pair with --spec-type draft-mtp for 1.8-2.2x speedup with zero quality loss.",
+                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+                "note": "Baked-in MTP heads + mmproj sibling for vision. Pair with --spec-type draft-mtp for 1.8-2.2x speedup with zero quality loss.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
                 "backend": "llama.cpp",
@@ -260,8 +264,44 @@
                 "sizeGb": 37.0,
                 "format": "GGUF",
                 "quantization": "Q8_0",
-                "capabilities": ["reasoning", "coding", "agents", "tool-use"],
-                "note": "MoE with baked-in MTP heads. --spec-type draft-mtp speedup compounds with the sparse activation savings.",
+                "capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
+                "note": "MoE with baked-in MTP heads + mmproj sibling for vision. --spec-type draft-mtp speedup compounds with the sparse activation savings.",
+                "contextWindow": "262K",
+                "launchMode": "direct",
+                "backend": "llama.cpp",
+                "releaseDate": "2026-05",
+            },
+            {
+                # FU-064: ggml-org canonical non-MTP companion (2026-05-22).
+                # Same Q8_0 quality bar as the MTP variant but without the
+                # baked-in MTP heads — for users on llama.cpp builds that
+                # predate PR #22673 or who don't want speculative decoding.
+                "id": "ggml-org/Qwen3.6-27B-GGUF",
+                "name": "Qwen3.6 27B GGUF (ggml-org Q8_0)",
+                "repo": "ggml-org/Qwen3.6-27B-GGUF",
+                "link": "https://huggingface.co/ggml-org/Qwen3.6-27B-GGUF",
+                "paramsB": 27.0,
+                "sizeGb": 29.0,
+                "format": "GGUF",
+                "quantization": "Q8_0",
+                "capabilities": ["reasoning", "coding", "vision", "tool-use"],
+                "note": "ggml-org canonical Q8_0 pack + mmproj sibling for vision. No MTP heads — pick the -MTP-GGUF sibling for speculative decoding.",
+                "contextWindow": "262K",
+                "launchMode": "direct",
+                "backend": "llama.cpp",
+                "releaseDate": "2026-05",
+            },
+            {
+                "id": "ggml-org/Qwen3.6-35B-A3B-GGUF",
+                "name": "Qwen3.6 35B A3B GGUF (ggml-org Q8_0)",
+                "repo": "ggml-org/Qwen3.6-35B-A3B-GGUF",
+                "link": "https://huggingface.co/ggml-org/Qwen3.6-35B-A3B-GGUF",
+                "paramsB": 35.0,
+                "sizeGb": 37.0,
+                "format": "GGUF",
+                "quantization": "Q8_0",
+                "capabilities": ["reasoning", "coding", "vision", "agents", "tool-use"],
+                "note": "ggml-org canonical Q8_0 MoE pack + mmproj sibling for vision. No MTP heads — pick the -MTP-GGUF sibling for speculative decoding.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
                 "backend": "llama.cpp",
@@ -288,10 +328,14 @@
         "popularityLabel": "Featured family",
         "likesLabel": "Qwen official",
         "badges": ["Reasoning", "Coding", "Long context"],
-        # FU-040: Qwen3.5 dense + MoE variants are text-only. The
-        # ``vision`` tag at family-level was promoting false positives
-        # in ``supportsVision`` for every community quant variant.
-        "capabilities": ["reasoning", "coding", "tool-use"],
+        # FU-072: Qwen3.5 is multimodal upstream
+        # (Qwen3_5ForConditionalGeneration + vision_config; mlx-vlm ships
+        # qwen3_5 support). FU-040 had marked the family text-only — now
+        # corrected. The runtime ``supportsVision`` is still demoted per-engine in
+        # catalog/capabilities.py (MLX worker carries no images; GGUF
+        # gates on mmproj), so the family vision tag drives badges only,
+        # never a broken composer button.
+        "capabilities": ["reasoning", "coding", "vision", "tool-use"],
         "defaultVariantId": "Qwen/Qwen3.5-9B",
         "variants": [
             {
@@ -303,8 +347,8 @@
                 "sizeGb": 5.1,
                 "format": "Transformers",
                 "quantization": "BF16",
-                "capabilities": ["reasoning", "coding", "tool-use"],
-                "note": "Smaller Qwen 3.5 variant with strong utility for everyday local work.",
+                "capabilities": ["reasoning", "coding", "vision", "video", "tool-use"],
+                "note": "Smaller Qwen 3.5 variant with strong utility for everyday local work. Multimodal (image + video) like its 9B sibling.",
                 "contextWindow": "262K",
                 "launchMode": "convert",
                 "backend": "mlx",
@@ -348,8 +392,8 @@
                 "sizeGb": 5.8,
                 "format": "GGUF",
                 "quantization": "Q4_K_M",
-                "capabilities": ["reasoning", "coding", "tool-use"],
-                "note": "Community GGUF pack with ready-made quantizations for quick llama.cpp runs.",
+                "capabilities": ["reasoning", "coding", "vision", "video", "tool-use"],
+                "note": "Community GGUF pack with ready-made quantizations for quick llama.cpp runs. Vision needs the mmproj sibling on disk.",
                 "contextWindow": "262K",
                 "launchMode": "direct",
                 "backend": "llama.cpp",

diff --git a/backend_service/helpers/hf_resolve.py b/backend_service/helpers/hf_resolve.py
@@ -0,0 +1,182 @@
+"""Resolve an arbitrary Hugging Face repo into a loadable descriptor (#5).
+
+Lets a user paste any GGUF / MLX repo and run it without a curated
+catalog row. The previous behaviour (FU-041) fuzzy-matched off-catalog
+repos against the nearest catalog entry, picking up the wrong context
+window, capabilities, and DFlash drafter. This module instead reads the
+repo's own file list + ``config.json`` and synthesises a descriptor, and
+the caller passes ``canonicalRepo=<repo>`` to ``load_model`` so
+``_resolve_canonical_repo`` returns it verbatim — no fuzzy match.
+
+``resolve_hf_model`` is pure (no network): it takes the already-fetched
+file list and optional parsed ``config.json``. The route layer fetches
+those via ``_hub_repo_files`` + a best-effort ``config.json`` read.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+# GGUF quantization preference when a repo ships several. Quality/size
+# sweet spots first; everything unlisted sorts last but is still runnable.
+# Tags are lowercase because ``_gguf_score`` substring-matches them against
+# the lowercased file path; the tuple index is the rank (lower is better).
+_GGUF_QUANT_PRIORITY = (
+    "q4_k_m",
+    "q5_k_m",
+    "q4_k_s",
+    "q5_k_s",
+    "q8_0",
+    "q6_k",
+    "q4_0",
+    "q3_k_m",
+    "iq4_nl",
+)
+
+# Context length ``resolve_hf_model`` falls back to when ``config.json``
+# yields no usable value.
+_DEFAULT_CONTEXT = 8192
+# Clamp bounds ``resolve_hf_model`` applies to a context length that was
+# read from ``config.json``.
+_MIN_CONTEXT = 2048
+_MAX_CONTEXT = 131072
+
+# config.json keys that carry the trained context length, most-specific
+# first.
+_CONTEXT_KEYS = ("max_position_embeddings", "n_positions", "max_seq_len", "n_ctx")
+
+
+def _is_gguf(path: str) -> bool:
+    """Return True when ``path`` ends in ``.gguf``, ignoring case."""
+    normalized = path.lower()
+    return normalized.endswith(".gguf")
+
+
+def _gguf_score(path: str) -> tuple[int, int]:
+    """Sort key for GGUF candidates: ``(quant_rank, shard_penalty)``.
+
+    Smaller tuples are preferred. ``quant_rank`` is the index of the first
+    ``_GGUF_QUANT_PRIORITY`` tag found in the lowercased path, or the
+    tuple's length when no tag matches. ``shard_penalty`` is 0 for an
+    unsharded file, 1 for the first shard and 2 for any later shard.
+    """
+    name = path.lower()
+    rank = next(
+        (pos for pos, tag in enumerate(_GGUF_QUANT_PRIORITY) if tag in name),
+        len(_GGUF_QUANT_PRIORITY),
+    )
+    # An unsharded file beats a sharded set; within a set only the first
+    # shard is a valid llama.cpp entry point, so later shards rank worst.
+    if "-of-" not in name:
+        penalty = 0
+    elif "00001-of-" in name:
+        penalty = 1
+    else:
+        penalty = 2
+    return (rank, penalty)
+
+
+def _pick_gguf(gguf_paths: list[str], requested_file: str | None) -> str | None:
+    """Choose the GGUF file to load as the main model.
+
+    Args:
+        gguf_paths: Every ``.gguf`` path listed in the repo.
+        requested_file: Caller's explicit choice; honoured only when it
+            appears verbatim in ``gguf_paths``.
+
+    Returns:
+        ``None`` for an empty list, otherwise ``requested_file`` when it is
+        listed, otherwise the candidate with the lowest ``_gguf_score``
+        (the earliest-listed candidate wins ties).
+    """
+    if not gguf_paths:
+        return None
+    if requested_file and requested_file in gguf_paths:
+        return requested_file
+    # An ``mmproj`` GGUF is the multimodal projector, not model weights. It
+    # frequently carries the same quant tag as the real model, so without
+    # this filter it can tie on score and be returned purely because of
+    # listing order. Keep the full list only if the repo ships nothing else.
+    weights = [p for p in gguf_paths if "mmproj" not in p.lower()] or gguf_paths
+    # Drop non-first shards from contention; if every candidate is a
+    # non-first shard (unusual), fall back to the unfiltered weights.
+    primary = [p for p in weights if "-of-" not in p.lower() or "00001-of-" in p.lower()]
+    pool = primary or weights
+    return sorted(pool, key=_gguf_score)[0]
+
+
+def _context_from_config(config: dict[str, Any] | None) -> int | None:
+    """Return the first positive context length found in ``config``.
+
+    Top-level keys are checked before the nested ``text_config`` dict (some
+    multimodal configs keep the LM settings there), and within each dict
+    the keys are tried in ``_CONTEXT_KEYS`` order. Returns ``None`` when
+    ``config`` is not a dict or no key holds a positive number.
+    """
+    if not isinstance(config, dict):
+        return None
+    nested = config.get("text_config")
+    sections = [config, nested] if isinstance(nested, dict) else [config]
+    for section in sections:
+        for key in _CONTEXT_KEYS:
+            raw = section.get(key)
+            if isinstance(raw, (int, float)) and raw > 0:
+                return int(raw)
+    return None
+
+
+def _infer_capabilities(config: dict[str, Any] | None, has_mmproj: bool) -> dict[str, bool]:
+    """Derive the ``text`` / ``vision`` capability flags for a repo.
+
+    ``text`` is always True. ``vision`` is True when an mmproj file was
+    seen, or when ``config`` is a dict with a truthy ``vision_config`` or
+    an ``image_token_id`` that is not ``None``.
+    """
+    config_has_vision = isinstance(config, dict) and (
+        bool(config.get("vision_config")) or config.get("image_token_id") is not None
+    )
+    return {"text": True, "vision": bool(has_mmproj or config_has_vision)}
+
+
+def resolve_hf_model(
+    repo: str,
+    *,
+    files: list[dict[str, Any]],
+    config: dict[str, Any] | None = None,
+    requested_file: str | None = None,
+) -> dict[str, Any]:
+    """Build a loadable descriptor for an arbitrary HF repo.
+
+    Args:
+        repo: Repo id. Echoed back as ``repo`` and ``ref``; its last
+            ``/``-separated segment becomes ``label``.
+        files: File records shaped like ``{"path", "sizeBytes", "kind"}``
+            (as produced by ``_hub_repo_files`` siblings). Only ``path``
+            and ``sizeBytes`` are read here.
+        config: Parsed ``config.json`` when available, else ``None``.
+        requested_file: GGUF path to prefer when the repo lists it.
+
+    Returns:
+        A dict with ``repo``, ``ref``, ``label``, ``backend`` (one of
+        ``"llama.cpp"``, ``"mlx"``, ``"vllm"``, ``"unknown"``),
+        ``ggufFile``, ``contextTokens``, ``capabilities``, ``sizeBytes``,
+        ``family``, ``custom`` and ``warnings``. Uncertainty is reported
+        through ``warnings`` rather than by raising, given a well-formed
+        file list.
+    """
+    # Single pass over the records: every record contributes a size entry
+    # (keyed by "" when it has no path), but only records with a path are
+    # considered as candidate files.
+    paths: list[str] = []
+    size_by_path: dict[str, int] = {}
+    for entry in files:
+        raw_path = entry.get("path")
+        size_by_path[str(raw_path or "")] = int(entry.get("sizeBytes") or 0)
+        if raw_path:
+            paths.append(str(raw_path))
+
+    def _total_size(selected: list[str]) -> int:
+        # Combined size of the given paths; unknown paths count as 0.
+        return sum(size_by_path.get(p, 0) for p in selected)
+
+    gguf_paths = [p for p in paths if _is_gguf(p)]
+    safetensors_paths = [p for p in paths if p.lower().endswith(".safetensors")]
+    has_mmproj = any("mmproj" in p.lower() for p in paths)
+
+    notes: list[str] = []
+    gguf_file: str | None = None
+
+    if gguf_paths:
+        backend = "llama.cpp"
+        gguf_file = _pick_gguf(gguf_paths, requested_file)
+        # Size of the chosen file; when that is unknown or zero, report the
+        # combined size of all GGUF files instead.
+        size_bytes = size_by_path.get(gguf_file or "", 0) or _total_size(gguf_paths)
+    elif repo.startswith("mlx-community/") or _looks_like_mlx(config):
+        backend = "mlx"
+        size_bytes = _total_size(safetensors_paths)
+    elif safetensors_paths:
+        # Plain safetensors that are not an MLX conversion: these need a
+        # CUDA backend or a conversion step, so say so instead of guessing.
+        backend = "vllm"
+        size_bytes = _total_size(safetensors_paths)
+        notes.append(
+            "This repo ships raw safetensors weights (no GGUF, not an MLX conversion). "
+            "On Apple Silicon, convert it to MLX or pick a GGUF mirror; the vLLM backend "
+            "is CUDA-only."
+        )
+    else:
+        backend = "unknown"
+        size_bytes = sum(size_by_path.values())
+        notes.append("No GGUF or safetensors weights found in this repo.")
+
+    trained_context = _context_from_config(config)
+    if trained_context is None:
+        context_tokens = _DEFAULT_CONTEXT
+        # The fallback is only called out for llama.cpp loads.
+        if backend == "llama.cpp":
+            notes.append(
+                f"Context length not read from metadata; defaulting to {_DEFAULT_CONTEXT}. "
+                "Adjust in launch settings if the model supports more."
+            )
+    else:
+        # Clamp the configured value into [_MIN_CONTEXT, _MAX_CONTEXT].
+        context_tokens = max(_MIN_CONTEXT, min(_MAX_CONTEXT, trained_context))
+
+    return {
+        "repo": repo,
+        "ref": repo,
+        "label": repo.split("/")[-1],
+        "backend": backend,
+        "ggufFile": gguf_file,
+        "contextTokens": context_tokens,
+        "capabilities": _infer_capabilities(config, has_mmproj),
+        "sizeBytes": size_bytes,
+        "family": "custom",
+        "custom": True,
+        "warnings": notes,
+    }
+
+
+def _looks_like_mlx(config: dict[str, Any] | None) -> bool:
+    """Heuristic check for an MLX-converted repo.
+
+    True only when ``config`` is a dict whose ``quantization`` entry is
+    itself a dict containing ``group_size`` or ``bits`` (the stanza mlx-lm
+    writes as ``{"group_size": N, "bits": M}``).
+    """
+    if not isinstance(config, dict):
+        return False
+    stanza = config.get("quantization")
+    return isinstance(stanza, dict) and ("group_size" in stanza or "bits" in stanza)