From fe7a0a4f1488517abe6f43b53596155cff969735 Mon Sep 17 00:00:00 2001 From: rjckkkkk <59609580+rjckkkkk@users.noreply.github.com> Date: Thu, 11 Jun 2026 09:43:57 +0000 Subject: [PATCH] Auto-wire llama.cpp multimodal projector (--mmproj) for VL models GGUF vision models ship a co-located mmproj-*.gguf projector and llama-server needs --mmproj to accept images, but the deploy never passed it, so VL models served text only unless the user manually added `--config mmproj=PATH`. On deploy, when the engine is llamacpp and the model format is gguf, look for a co-located mmproj-*.gguf next to the model file (preferring an f16 projector) and inject its path as the `mmproj` config (flows through configToFlags as --mmproj). Skipped when the caller already set mmproj or no projector is present, so plain LLMs are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/aima/tooldeps_deploy.go | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/cmd/aima/tooldeps_deploy.go b/cmd/aima/tooldeps_deploy.go index 8f0a955..01a1cb4 100644 --- a/cmd/aima/tooldeps_deploy.go +++ b/cmd/aima/tooldeps_deploy.go @@ -7,6 +7,7 @@ import ( "log/slog" "math" "os" + "path/filepath" "strconv" "strings" "time" @@ -83,6 +84,22 @@ func buildDeployDeps(ac *appContext, deps *mcp.ToolDeps, } } + // Auto-wire the multimodal projector for llama.cpp VL models. A GGUF vision + // model ships a co-located mmproj-*.gguf, and llama-server needs --mmproj to + // accept images. If the caller didn't set it, inject the projector path so + // vision works zero-config (it flows through configToFlags as --mmproj). 
+ if resolved.ModelFormat == "gguf" && strings.HasPrefix(strings.ToLower(resolved.Engine), "llamacpp") { + if _, set := resolved.Config["mmproj"]; !set { + if mm := findColocatedMMProj(modelPath); mm != "" { + if resolved.Config == nil { + resolved.Config = map[string]any{} + } + resolved.Config["mmproj"] = mm + slog.Info("auto-wired multimodal projector for vision", "model", modelName, "mmproj", mm) + } + } + } + req := &runtime.DeployRequest{ Name: modelName, Engine: resolved.Engine,
@@ -850,3 +867,38 @@ func normalizeServedModelName(modelName, raw string) string {
 	}
 	return served
 }
+
+// findColocatedMMProj looks in the model's directory for a multimodal projector:
+// a non-directory entry whose lowercased name contains "mmproj" and ends in
+// ".gguf". Names containing "f16" or "fp16" are preferred. It returns "" when no
+// entry matches or the directory cannot be read. modelPath may be a file or a directory.
+func findColocatedMMProj(modelPath string) string {
+	dir := modelPath // used as-is when Stat fails or reports a directory
+	if fi, err := os.Stat(modelPath); err == nil && !fi.IsDir() {
+		dir = filepath.Dir(modelPath)
+	}
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return ""
+	}
+	var f16, other string
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		lower := strings.ToLower(e.Name())
+		if !strings.HasSuffix(lower, ".gguf") || !strings.Contains(lower, "mmproj") {
+			continue
+		}
+		full := filepath.Join(dir, e.Name())
+		if strings.Contains(lower, "f16") || strings.Contains(lower, "fp16") { // substring test: "bf16" also matches
+			f16 = full // a later f16/fp16 match replaces an earlier one
+		} else if other == "" {
+			other = full // only the first non-f16 match is kept
+		}
+	}
+	if f16 != "" {
+		return f16
+	}
+	return other
+}