From a56c0b9c478cf22059127775a81e0db2e18b1412 Mon Sep 17 00:00:00 2001 From: Codex Date: Tue, 9 Jun 2026 14:28:09 +0800 Subject: [PATCH] feat(model): mark speculative draft heads as non-standalone AIMA's model scanner lists every weight artifact on disk as an independently deployable model. Speculative draft heads (DFlash / MTP) only make sense paired with their parent model for speculative decoding, yet they showed up as standalone models with a deploy button (e.g. Qwen3.6-35B-A3B-DFlash, Qwen3.6-35B-A3B-DFlash-Q4_K_M). The catalog already names each draft via its parent variant's speculative_config.model, so detection needs no new per-model YAML: - knowledge.NormalizeModelKey: lowercases a model name and strips quantization/precision/layout suffixes (q4_k_m, bf16, ud, unfused, ...) while keeping role tokens like "dflash", so all on-disk artifacts of one logical draft share a key and stay distinct from the parent. - Catalog.SpeculativeDraftModelKeys: harvests speculative_config.model across all variants into a set of normalized draft keys. - annotateModelsFromCatalog: a scanned model whose normalized name is a draft key gets standalone_deploy=false + ui.role=draft (only when not already set); the embedded UI already hides the deploy button on standalone_deploy=false. Knowledge-driven (INV-1/2): no hardcoded model names, derived entirely from the catalog. Parent models and quantization variants are unaffected, and DB rows / deploy-by-name paths are untouched (annotation is applied at model.list time only). Tests: NormalizeModelKey table cases (incl. "glm-4.7-flash" not stripped), SpeculativeDraftModelKeys harvest + nil-catalog, and the annotate wiring (drafts -> non-standalone draft, parent stays deployable). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/aima/tooldeps_model.go | 47 +++++++++++++-------- cmd/aima/tooldeps_model_test.go | 53 ++++++++++++++++++++++++ internal/knowledge/draft.go | 71 ++++++++++++++++++++++++++++++++ internal/knowledge/draft_test.go | 59 ++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 17 deletions(-) create mode 100644 internal/knowledge/draft.go create mode 100644 internal/knowledge/draft_test.go diff --git a/cmd/aima/tooldeps_model.go b/cmd/aima/tooldeps_model.go index c604902..e537d07 100644 --- a/cmd/aima/tooldeps_model.go +++ b/cmd/aima/tooldeps_model.go @@ -153,28 +153,41 @@ func annotateModelsFromCatalog(models []*state.Model, cat *knowledge.Catalog) { assetsByName[strings.ToLower(strings.TrimSpace(alias))] = ma } } + draftKeys := cat.SpeculativeDraftModelKeys() + for _, m := range models { if m == nil { continue } - ma := assetsByName[strings.ToLower(strings.TrimSpace(m.Name))] - if ma == nil { - continue - } - if strings.TrimSpace(m.ModelClass) == "" { - m.ModelClass = strings.TrimSpace(ma.Metadata.ModelClass) - } - if strings.TrimSpace(m.UIRole) == "" { - m.UIRole = strings.TrimSpace(ma.UI.Role) - } - if strings.TrimSpace(m.UIDisplayNote) == "" { - m.UIDisplayNote = strings.TrimSpace(ma.UI.DisplayNote) - } - if strings.TrimSpace(m.UIDisplayNoteZh) == "" { - m.UIDisplayNoteZh = strings.TrimSpace(ma.UI.DisplayNoteZh) + if ma := assetsByName[strings.ToLower(strings.TrimSpace(m.Name))]; ma != nil { + if strings.TrimSpace(m.ModelClass) == "" { + m.ModelClass = strings.TrimSpace(ma.Metadata.ModelClass) + } + if strings.TrimSpace(m.UIRole) == "" { + m.UIRole = strings.TrimSpace(ma.UI.Role) + } + if strings.TrimSpace(m.UIDisplayNote) == "" { + m.UIDisplayNote = strings.TrimSpace(ma.UI.DisplayNote) + } + if strings.TrimSpace(m.UIDisplayNoteZh) == "" { + m.UIDisplayNoteZh = strings.TrimSpace(ma.UI.DisplayNoteZh) + } + if m.StandaloneDeploy == nil { + m.StandaloneDeploy = ma.Capabilities.StandaloneDeploy + } } - if 
m.StandaloneDeploy == nil { - m.StandaloneDeploy = ma.Capabilities.StandaloneDeploy + + // Speculative draft heads (e.g. DFlash/MTP) are companions of their + // parent model — the catalog names them via each variant's + // speculative_config.model — not independently deployable models. + if draftKeys[knowledge.NormalizeModelKey(m.Name)] { + if m.StandaloneDeploy == nil { + notStandalone := false + m.StandaloneDeploy = &notStandalone + } + if strings.TrimSpace(m.UIRole) == "" { + m.UIRole = "draft" + } } } } diff --git a/cmd/aima/tooldeps_model_test.go b/cmd/aima/tooldeps_model_test.go index 55d98f0..f61cc46 100644 --- a/cmd/aima/tooldeps_model_test.go +++ b/cmd/aima/tooldeps_model_test.go @@ -581,3 +581,56 @@ func writeScanModelFixture(dir string, weightSize int) error { } return os.WriteFile(filepath.Join(dir, "model.safetensors"), make([]byte, weightSize), 0o644) } + +// A speculative draft head (e.g. DFlash/MTP), declared only as a variant's +// speculative_config.model in the catalog, must be marked non-standalone so the +// UI does not offer to deploy it on its own. Its parent model stays deployable. 
+func TestAnnotateModelsFromCatalog_SpeculativeDraftNotStandalone(t *testing.T) { + cat := &knowledge.Catalog{ + ModelAssets: []knowledge.ModelAsset{{ + Metadata: knowledge.ModelMetadata{ + Name: "qwen3.6-35b-a3b", + Aliases: []string{"Qwen3.6-35B-A3B"}, + }, + Variants: []knowledge.ModelVariant{{ + Name: "dflash", + DefaultConfig: map[string]any{ + "speculative_config": map[string]any{ + "method": "dflash", + "model": "/models/Qwen3.6-35B-A3B-DFlash", + }, + }, + }}, + }}, + } + models := []*state.Model{ + {Name: "Qwen3.6-35B-A3B"}, // parent: stays deployable + {Name: "Qwen3.6-35B-A3B-DFlash"}, // draft (safetensors) + {Name: "Qwen3.6-35B-A3B-DFlash-Q4_K_M"}, // draft (gguf quant) + } + + annotateModelsFromCatalog(models, cat) + + byName := make(map[string]*state.Model, len(models)) + for _, m := range models { + byName[m.Name] = m + } + + for _, name := range []string{"Qwen3.6-35B-A3B-DFlash", "Qwen3.6-35B-A3B-DFlash-Q4_K_M"} { + m := byName[name] + if m.StandaloneDeploy == nil || *m.StandaloneDeploy { + t.Errorf("%s: StandaloneDeploy = %v, want non-nil false", name, m.StandaloneDeploy) + } + if m.UIRole != "draft" { + t.Errorf("%s: UIRole = %q, want %q", name, m.UIRole, "draft") + } + } + + parent := byName["Qwen3.6-35B-A3B"] + if parent.StandaloneDeploy != nil && !*parent.StandaloneDeploy { + t.Errorf("parent model must not be marked non-standalone") + } + if parent.UIRole == "draft" { + t.Errorf("parent model must not be tagged as a draft") + } +} diff --git a/internal/knowledge/draft.go b/internal/knowledge/draft.go new file mode 100644 index 0000000..984172b --- /dev/null +++ b/internal/knowledge/draft.go @@ -0,0 +1,71 @@ +package knowledge + +import ( + "path" + "regexp" + "strings" +) + +// quantSuffixToken matches a single '-'-delimited trailing token that denotes a +// quantization, precision, or storage-layout variant rather than model identity +// (e.g. "q4_k_m", "bf16", "ud", "unfused"). 
Role-bearing tokens such as +// "dflash"/"mtp"/"flash" are deliberately excluded so a draft head keeps its +// identity. +var quantSuffixToken = regexp.MustCompile(`^(?:q\d[\dkmsl_]*|iq\d[\dxsa_]*|bf16|fp16|fp32|fp8|f16|f32|int4|int8|nf4|mxfp4|ud|awq|gptq|gguf|mlx|unfused|fused)$`) + +// NormalizeModelKey lowercases a model name and strips trailing +// quantization/precision/layout tokens so different on-disk artifacts of one +// logical model share a key. It keeps role-bearing tokens like "dflash" so a +// draft head normalizes to "<parent>-dflash", distinct from its parent "<parent>". +// +// "Qwen3.6-35B-A3B-UD-Q4_K_M" -> "qwen3.6-35b-a3b" +// "qwen3.6-35b-a3b-bf16-unfused" -> "qwen3.6-35b-a3b" +// "Qwen3.6-35B-A3B-DFlash-Q4_K_M" -> "qwen3.6-35b-a3b-dflash" +// "glm-4.7-flash" -> "glm-4.7-flash" +func NormalizeModelKey(name string) string { + name = strings.TrimSpace(strings.ToLower(name)) + if name == "" { + return "" + } + tokens := strings.Split(name, "-") + for len(tokens) > 1 && quantSuffixToken.MatchString(tokens[len(tokens)-1]) { + tokens = tokens[:len(tokens)-1] + } + return strings.Join(tokens, "-") +} + +// SpeculativeDraftModelKeys harvests every variant's speculative_config.model +// reference across all model assets and returns the set of normalized draft +// model keys. A scanned model whose NormalizeModelKey is in this set is a +// speculative draft head (e.g. DFlash/MTP) — a companion of its parent model, +// not an independently deployable model. +func (c *Catalog) SpeculativeDraftModelKeys() map[string]bool { + keys := make(map[string]bool) + if c == nil { + return keys + } + for i := range c.ModelAssets { + for _, v := range c.ModelAssets[i].Variants { + ref := speculativeModelRef(v.DefaultConfig) + if ref == "" { + continue + } + // The reference may be a path ("/models/X", "D:\models\X") or a + // bare name; reduce it to the artifact base name first. 
+ base := path.Base(strings.ReplaceAll(ref, `\`, "/")) + if key := NormalizeModelKey(base); key != "" { + keys[key] = true + } + } + } + return keys +} + +func speculativeModelRef(dc map[string]any) string { + sc, ok := dc["speculative_config"].(map[string]any) + if !ok { + return "" + } + model, _ := sc["model"].(string) + return strings.TrimSpace(model) +} diff --git a/internal/knowledge/draft_test.go b/internal/knowledge/draft_test.go new file mode 100644 index 0000000..2164a50 --- /dev/null +++ b/internal/knowledge/draft_test.go @@ -0,0 +1,59 @@ +package knowledge + +import "testing" + +func TestNormalizeModelKey(t *testing.T) { + cases := []struct { + name string + in string + want string + }{ + {"plain", "qwen3.6-35b-a3b", "qwen3.6-35b-a3b"}, + {"draft safetensors", "Qwen3.6-35B-A3B-DFlash", "qwen3.6-35b-a3b-dflash"}, + {"draft gguf quant", "Qwen3.6-35B-A3B-DFlash-Q4_K_M", "qwen3.6-35b-a3b-dflash"}, + {"bf16", "qwen3.6-35b-a3b-bf16", "qwen3.6-35b-a3b"}, + {"bf16 unfused layout", "qwen3.6-35b-a3b-bf16-unfused", "qwen3.6-35b-a3b"}, + {"q4 quant", "qwen3.6-35b-a3b-q4_k_m", "qwen3.6-35b-a3b"}, + {"unsloth dynamic quant", "Qwen3.6-35B-A3B-UD-Q4_K_M", "qwen3.6-35b-a3b"}, + {"flash is identity not quant", "glm-4.7-flash", "glm-4.7-flash"}, + {"embedding q8", "qwen3-embedding-4b-q8_0", "qwen3-embedding-4b"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := NormalizeModelKey(c.in); got != c.want { + t.Errorf("NormalizeModelKey(%q) = %q, want %q", c.in, got, c.want) + } + }) + } +} + +func TestSpeculativeDraftModelKeys(t *testing.T) { + cat := &Catalog{ + ModelAssets: []ModelAsset{{ + Metadata: ModelMetadata{Name: "qwen3.6-35b-a3b"}, + Variants: []ModelVariant{ + {Name: "plain"}, // no speculative_config + {Name: "dflash", DefaultConfig: map[string]any{ + "speculative_config": map[string]any{ + "method": "dflash", + "model": "/models/Qwen3.6-35B-A3B-DFlash", + }, + }}, + }, + }}, + } + keys := cat.SpeculativeDraftModelKeys() + if 
!keys["qwen3.6-35b-a3b-dflash"] { + t.Fatalf("expected draft key %q in %v", "qwen3.6-35b-a3b-dflash", keys) + } + if keys["qwen3.6-35b-a3b"] { + t.Errorf("base model must not be a draft key: %v", keys) + } +} + +func TestSpeculativeDraftModelKeys_NilCatalog(t *testing.T) { + var c *Catalog + if got := c.SpeculativeDraftModelKeys(); len(got) != 0 { + t.Errorf("nil catalog should yield no draft keys, got %v", got) + } +}