From a56c0b9c478cf22059127775a81e0db2e18b1412 Mon Sep 17 00:00:00 2001
From: Codex <codex@local>
Date: Tue, 9 Jun 2026 14:28:09 +0800
Subject: [PATCH] feat(model): mark speculative draft heads as non-standalone

AIMA's model scanner lists every weight artifact on disk as an
independently deployable model. Speculative draft heads (DFlash / MTP)
only make sense paired with their parent model for speculative decoding,
yet they showed up as standalone models with a deploy button
(e.g. Qwen3.6-35B-A3B-DFlash, Qwen3.6-35B-A3B-DFlash-Q4_K_M).

The catalog already names each draft via its parent variant's
speculative_config.model, so detection needs no new per-model YAML:

- knowledge.NormalizeModelKey: lowercases a model name and strips
  quantization/precision/layout suffixes (q4_k_m, bf16, ud, unfused, ...)
  while keeping role tokens like "dflash", so all on-disk artifacts of one
  logical draft share a key and stay distinct from the parent.
- Catalog.SpeculativeDraftModelKeys: harvests speculative_config.model
  across all variants into a set of normalized draft keys.
- annotateModelsFromCatalog: a scanned model whose normalized name is a
  draft key gets standalone_deploy=false + ui.role=draft (only when not
  already set); the embedded UI already hides the deploy button on
  standalone_deploy=false.

Knowledge-driven (INV-1/2): no hardcoded model names, derived entirely
from the catalog. Parent models and quantization variants are unaffected,
and DB rows / deploy-by-name paths are untouched (annotation is applied at
model.list time only).

Tests: NormalizeModelKey table cases (incl. "glm-4.7-flash" not stripped),
SpeculativeDraftModelKeys harvest + nil-catalog, and the annotate wiring
(drafts -> non-standalone draft, parent stays deployable).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cmd/aima/tooldeps_model.go       | 47 +++++++++++++--------
 cmd/aima/tooldeps_model_test.go  | 53 ++++++++++++++++++++++++
 internal/knowledge/draft.go      | 71 ++++++++++++++++++++++++++++++++
 internal/knowledge/draft_test.go | 59 ++++++++++++++++++++++++++
 4 files changed, 213 insertions(+), 17 deletions(-)
 create mode 100644 internal/knowledge/draft.go
 create mode 100644 internal/knowledge/draft_test.go

diff --git a/cmd/aima/tooldeps_model.go b/cmd/aima/tooldeps_model.go
index c604902..e537d07 100644
--- a/cmd/aima/tooldeps_model.go
+++ b/cmd/aima/tooldeps_model.go
@@ -153,28 +153,41 @@ func annotateModelsFromCatalog(models []*state.Model, cat *knowledge.Catalog) {
 			assetsByName[strings.ToLower(strings.TrimSpace(alias))] = ma
 		}
 	}
+	draftKeys := cat.SpeculativeDraftModelKeys()
+
 	for _, m := range models {
 		if m == nil {
 			continue
 		}
-		ma := assetsByName[strings.ToLower(strings.TrimSpace(m.Name))]
-		if ma == nil {
-			continue
-		}
-		if strings.TrimSpace(m.ModelClass) == "" {
-			m.ModelClass = strings.TrimSpace(ma.Metadata.ModelClass)
-		}
-		if strings.TrimSpace(m.UIRole) == "" {
-			m.UIRole = strings.TrimSpace(ma.UI.Role)
-		}
-		if strings.TrimSpace(m.UIDisplayNote) == "" {
-			m.UIDisplayNote = strings.TrimSpace(ma.UI.DisplayNote)
-		}
-		if strings.TrimSpace(m.UIDisplayNoteZh) == "" {
-			m.UIDisplayNoteZh = strings.TrimSpace(ma.UI.DisplayNoteZh)
+		if ma := assetsByName[strings.ToLower(strings.TrimSpace(m.Name))]; ma != nil {
+			if strings.TrimSpace(m.ModelClass) == "" {
+				m.ModelClass = strings.TrimSpace(ma.Metadata.ModelClass)
+			}
+			if strings.TrimSpace(m.UIRole) == "" {
+				m.UIRole = strings.TrimSpace(ma.UI.Role)
+			}
+			if strings.TrimSpace(m.UIDisplayNote) == "" {
+				m.UIDisplayNote = strings.TrimSpace(ma.UI.DisplayNote)
+			}
+			if strings.TrimSpace(m.UIDisplayNoteZh) == "" {
+				m.UIDisplayNoteZh = strings.TrimSpace(ma.UI.DisplayNoteZh)
+			}
+			if m.StandaloneDeploy == nil {
+				m.StandaloneDeploy = ma.Capabilities.StandaloneDeploy
+			}
 		}
-		if m.StandaloneDeploy == nil {
-			m.StandaloneDeploy = ma.Capabilities.StandaloneDeploy
+
+		// Speculative draft heads (e.g. DFlash/MTP) are companions of their
+		// parent model — the catalog names them via each variant's
+		// speculative_config.model — not independently deployable models.
+		if draftKeys[knowledge.NormalizeModelKey(m.Name)] {
+			if m.StandaloneDeploy == nil {
+				notStandalone := false
+				m.StandaloneDeploy = &notStandalone
+			}
+			if strings.TrimSpace(m.UIRole) == "" {
+				m.UIRole = "draft"
+			}
 		}
 	}
 }
diff --git a/cmd/aima/tooldeps_model_test.go b/cmd/aima/tooldeps_model_test.go
index 55d98f0..f61cc46 100644
--- a/cmd/aima/tooldeps_model_test.go
+++ b/cmd/aima/tooldeps_model_test.go
@@ -581,3 +581,56 @@ func writeScanModelFixture(dir string, weightSize int) error {
 	}
 	return os.WriteFile(filepath.Join(dir, "model.safetensors"), make([]byte, weightSize), 0o644)
 }
+
+// Draft heads (e.g. DFlash/MTP) are declared in the catalog only through a
+// variant's speculative_config.model. Their scanned artifacts must come out of
+// annotation as non-standalone with UIRole "draft"; the parent stays deployable.
+func TestAnnotateModelsFromCatalog_SpeculativeDraftNotStandalone(t *testing.T) {
+	cat := &knowledge.Catalog{
+		ModelAssets: []knowledge.ModelAsset{{
+			Metadata: knowledge.ModelMetadata{
+				Name:    "qwen3.6-35b-a3b",
+				Aliases: []string{"Qwen3.6-35B-A3B"},
+			},
+			Variants: []knowledge.ModelVariant{{
+				Name: "dflash",
+				DefaultConfig: map[string]any{
+					"speculative_config": map[string]any{
+						"method": "dflash",
+						"model":  "/models/Qwen3.6-35B-A3B-DFlash",
+					},
+				},
+			}},
+		}},
+	}
+	models := []*state.Model{
+		{Name: "Qwen3.6-35B-A3B"},               // parent: must not be flagged
+		{Name: "Qwen3.6-35B-A3B-DFlash"},        // draft: base name of the catalog reference
+		{Name: "Qwen3.6-35B-A3B-DFlash-Q4_K_M"}, // draft: same key once the quant suffix is stripped
+	}
+
+	annotateModelsFromCatalog(models, cat)
+
+	byName := make(map[string]*state.Model, len(models))
+	for _, m := range models {
+		byName[m.Name] = m
+	}
+
+	for _, name := range []string{"Qwen3.6-35B-A3B-DFlash", "Qwen3.6-35B-A3B-DFlash-Q4_K_M"} {
+		m := byName[name]
+		if m.StandaloneDeploy == nil || *m.StandaloneDeploy {
+			t.Errorf("%s: StandaloneDeploy = %v, want non-nil false", name, m.StandaloneDeploy)
+		}
+		if m.UIRole != "draft" {
+			t.Errorf("%s: UIRole = %q, want %q", name, m.UIRole, "draft")
+		}
+	}
+
+	parent := byName["Qwen3.6-35B-A3B"]
+	if parent.StandaloneDeploy != nil && !*parent.StandaloneDeploy {
+		t.Errorf("parent model must not be marked non-standalone")
+	}
+	if parent.UIRole == "draft" {
+		t.Errorf("parent model must not be tagged as a draft")
+	}
+}
diff --git a/internal/knowledge/draft.go b/internal/knowledge/draft.go
new file mode 100644
index 0000000..984172b
--- /dev/null
+++ b/internal/knowledge/draft.go
@@ -0,0 +1,71 @@
+package knowledge
+
+import (
+	"path"
+	"regexp"
+	"strings"
+)
+
+// quantSuffixToken matches one lowercased '-'-delimited token that denotes a
+// quantization, precision, or storage-layout variant rather than model identity
+// (e.g. "q4_k_m", "q4_k_xl", "iq3_m", "iq4_nl", "bf16", "ud", "unfused").
+// Role-bearing tokens such as "dflash"/"mtp"/"flash" are deliberately excluded
+// so a draft head keeps its identity.
+var quantSuffixToken = regexp.MustCompile(`^(?:q\d[\dkmslx_]*|iq\d[\dxsamnl_]*|bf16|fp16|fp32|fp8|f16|f32|int4|int8|nf4|mxfp4|ud|awq|gptq|gguf|mlx|unfused|fused)$`)
+
+// NormalizeModelKey reduces a model name to a comparison key: the name is
+// lowercased and trimmed, then trailing '-'-delimited tokens recognized by
+// quantSuffixToken are removed one at a time. Role tokens such as "dflash" do
+// not match, so a draft head ends up as "<base>-dflash" rather than "<base>".
+// A name with no '-' is never stripped, even if it looks like a quant tag.
+//
+//	"Qwen3.6-35B-A3B-UD-Q4_K_M"      -> "qwen3.6-35b-a3b"
+//	"qwen3.6-35b-a3b-bf16-unfused"   -> "qwen3.6-35b-a3b"
+//	"Qwen3.6-35B-A3B-DFlash-Q4_K_M"  -> "qwen3.6-35b-a3b-dflash"
+//	"glm-4.7-flash"                  -> "glm-4.7-flash"
+func NormalizeModelKey(name string) string {
+	key := strings.TrimSpace(strings.ToLower(name))
+	for {
+		cut := strings.LastIndex(key, "-")
+		if cut < 0 || !quantSuffixToken.MatchString(key[cut+1:]) {
+			return key
+		}
+		key = key[:cut]
+	}
+}
+
+// SpeculativeDraftModelKeys collects the speculative_config.model reference of
+// every variant of every model asset and returns the normalized keys as a set.
+// A scanned model whose NormalizeModelKey is in the set is a speculative draft
+// head (e.g. DFlash/MTP), a companion of its parent. A nil catalog yields an empty set.
+func (c *Catalog) SpeculativeDraftModelKeys() map[string]bool {
+	drafts := map[string]bool{}
+	if c == nil {
+		return drafts
+	}
+	for i := range c.ModelAssets {
+		variants := c.ModelAssets[i].Variants
+		for j := range variants {
+			ref := speculativeModelRef(variants[j].DefaultConfig)
+			if ref == "" {
+				continue
+			}
+			// Windows separators are folded to '/' so path.Base can isolate
+			// the last element of "/models/X", "D:\models\X" or a bare "X".
+			artifact := path.Base(strings.ReplaceAll(ref, `\`, "/"))
+			if key := NormalizeModelKey(artifact); key != "" {
+				drafts[key] = true
+			}
+		}
+	}
+	return drafts
+}
+
+// speculativeModelRef returns the trimmed speculative_config.model, or "".
+func speculativeModelRef(dc map[string]any) string {
+	if sc, ok := dc["speculative_config"].(map[string]any); ok {
+		ref, _ := sc["model"].(string)
+		return strings.TrimSpace(ref)
+	}
+	return ""
+}
diff --git a/internal/knowledge/draft_test.go b/internal/knowledge/draft_test.go
new file mode 100644
index 0000000..2164a50
--- /dev/null
+++ b/internal/knowledge/draft_test.go
@@ -0,0 +1,59 @@
+package knowledge
+
+import "testing"
+
+// TestNormalizeModelKey checks that quantization/precision/layout suffixes are
+// stripped while identity tokens ("dflash", "flash") survive normalization.
+func TestNormalizeModelKey(t *testing.T) {
+	// Each row: subtest name, raw model name, expected normalized key.
+	table := []struct{ name, in, want string }{
+		{"plain", "qwen3.6-35b-a3b", "qwen3.6-35b-a3b"},
+		{"draft safetensors", "Qwen3.6-35B-A3B-DFlash", "qwen3.6-35b-a3b-dflash"},
+		{"draft gguf quant", "Qwen3.6-35B-A3B-DFlash-Q4_K_M", "qwen3.6-35b-a3b-dflash"},
+		{"bf16", "qwen3.6-35b-a3b-bf16", "qwen3.6-35b-a3b"},
+		{"bf16 unfused layout", "qwen3.6-35b-a3b-bf16-unfused", "qwen3.6-35b-a3b"},
+		{"q4 quant", "qwen3.6-35b-a3b-q4_k_m", "qwen3.6-35b-a3b"},
+		{"unsloth dynamic quant", "Qwen3.6-35B-A3B-UD-Q4_K_M", "qwen3.6-35b-a3b"},
+		{"flash is identity not quant", "glm-4.7-flash", "glm-4.7-flash"},
+		{"embedding q8", "qwen3-embedding-4b-q8_0", "qwen3-embedding-4b"},
+	}
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			got := NormalizeModelKey(tc.in)
+			if got != tc.want {
+				t.Errorf("NormalizeModelKey(%q) = %q, want %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestSpeculativeDraftModelKeys(t *testing.T) {
+	const draftKey = "qwen3.6-35b-a3b-dflash"
+	specCfg := map[string]any{
+		"speculative_config": map[string]any{
+			"method": "dflash",
+			"model":  "/models/Qwen3.6-35B-A3B-DFlash",
+		},
+	}
+	cat := &Catalog{ModelAssets: []ModelAsset{{
+		Metadata: ModelMetadata{Name: "qwen3.6-35b-a3b"},
+		Variants: []ModelVariant{
+			{Name: "plain"}, // carries no speculative_config
+			{Name: "dflash", DefaultConfig: specCfg},
+		},
+	}}}
+	got := cat.SpeculativeDraftModelKeys()
+	if !got[draftKey] {
+		t.Fatalf("expected draft key %q in %v", draftKey, got)
+	}
+	if got["qwen3.6-35b-a3b"] {
+		t.Errorf("base model must not be a draft key: %v", got)
+	}
+}
+
+func TestSpeculativeDraftModelKeys_NilCatalog(t *testing.T) {
+	// A nil *Catalog receiver must be safe and produce an empty key set.
+	if keys := (*Catalog)(nil).SpeculativeDraftModelKeys(); len(keys) != 0 {
+		t.Errorf("nil catalog should yield no draft keys, got %v", keys)
+	}
+}