From fe7a0a4f1488517abe6f43b53596155cff969735 Mon Sep 17 00:00:00 2001
From: rjckkkkk <59609580+rjckkkkk@users.noreply.github.com>
Date: Thu, 11 Jun 2026 09:43:57 +0000
Subject: [PATCH] Auto-wire llama.cpp multimodal projector (--mmproj) for VL
 models

GGUF vision models ship a co-located mmproj-*.gguf projector and llama-server
needs --mmproj to accept images, but the deploy never passed it, so VL models
served text only unless the user manually added `--config mmproj=<path>`.

On deploy, when the engine name starts with "llamacpp" and the model format is
gguf, look in the model's directory for a .gguf file whose name contains "mmproj"
(preferring one tagged f16/fp16) and inject its path as the `mmproj` config, which
flows through configToFlags as --mmproj. This is skipped when the caller already
set mmproj or when no projector file is found, so plain LLMs stored without a
projector beside them are unaffected.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cmd/aima/tooldeps_deploy.go | 52 +++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
diff --git a/cmd/aima/tooldeps_deploy.go b/cmd/aima/tooldeps_deploy.go
index 8f0a955..01a1cb4 100644
--- a/cmd/aima/tooldeps_deploy.go
+++ b/cmd/aima/tooldeps_deploy.go
@@ -7,6 +7,7 @@ import (
 	"log/slog"
 	"math"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
@@ -83,6 +84,22 @@ func buildDeployDeps(ac *appContext, deps *mcp.ToolDeps,
 			}
 		}
 
+		// Auto-wire the multimodal projector for llama.cpp VL models. A GGUF vision
+		// model ships a co-located mmproj-*.gguf, and llama-server needs --mmproj to
+		// accept images. If the caller didn't set it, inject the projector path so
+		// vision works zero-config (it flows through configToFlags as --mmproj).
+		if resolved.ModelFormat == "gguf" && strings.HasPrefix(strings.ToLower(resolved.Engine), "llamacpp") {
+			if _, set := resolved.Config["mmproj"]; !set {
+				if mm := findColocatedMMProj(modelPath); mm != "" {
+					if resolved.Config == nil {
+						resolved.Config = map[string]any{}
+					}
+					resolved.Config["mmproj"] = mm
+					slog.Info("auto-wired multimodal projector for vision", "model", modelName, "mmproj", mm)
+				}
+			}
+		}
+
 		req := &runtime.DeployRequest{
 			Name:             modelName,
 			Engine:           resolved.Engine,
@@ -850,3 +867,38 @@ func normalizeServedModelName(modelName, raw string) string {
 	}
 	return served
 }
+
+// findColocatedMMProj returns the path of a projector file located beside a GGUF
+// model: any non-directory entry whose lowercased name ends in ".gguf" and contains
+// "mmproj". A name containing "f16" or "fp16" wins over others. Returns "" if the
+// directory is unreadable or has no match. modelPath may be a file or a directory.
+func findColocatedMMProj(modelPath string) string {
+	dir := modelPath // used as-is when modelPath is a directory or cannot be stat'ed
+	if fi, err := os.Stat(modelPath); err == nil && !fi.IsDir() {
+		dir = filepath.Dir(modelPath) // modelPath is a file: scan its parent directory
+	}
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return "" // best-effort: a read failure is reported the same as "no projector"
+	}
+	var f16, other string
+	for _, e := range entries { // NOTE(review): every file in dir is a candidate; nothing ties a match to this model — confirm one model per directory
+		if e.IsDir() {
+			continue
+		}
+		lower := strings.ToLower(e.Name())
+		if !strings.HasSuffix(lower, ".gguf") || !strings.Contains(lower, "mmproj") {
+			continue
+		}
+		full := filepath.Join(dir, e.Name())
+		if strings.Contains(lower, "f16") || strings.Contains(lower, "fp16") { // NOTE(review): substring test, so "bf16" names match too — confirm intended
+			f16 = full
+		} else if other == "" { // fallback keeps its first match; the f16 slot above is overwritten, so its last match wins
+			other = full
+		}
+	}
+	if f16 != "" {
+		return f16
+	}
+	return other
+}