Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,17 +175,19 @@ If you want one control plane in front of multiple models, point the UI at LiteL
```powershell
$repoRoot = git rev-parse --show-toplevel
Set-Location $repoRoot
litellm --host 127.0.0.1 --port 4000
.\scripts\start-litellm.ps1
```

Then in the AG-Claw settings UI:

- Provider: `openai-compatible`
- API URL: `http://127.0.0.1:4000`
- API key: your LiteLLM bearer token if enabled
- API key: `agclaw-dev-key` by default, or your LiteLLM bearer token if you changed it

That same gateway URL also works with the local benchmark script below.

The starter config lives at `litellm/agclaw-config.local.yaml` and exposes `qwen2.5:3b`, `gemma3:1b`, `gemma3:4b`, `qwen2.5vl:3b`, and `qwen2.5vl:7b` — plus the `vision-*-local` task aliases — through one OpenAI-compatible endpoint.

## Governed Eval Assets

The `promptfoo` pack now includes an allowlisted Hugging Face ingestion path for evaluation assets.
Expand All @@ -201,6 +203,18 @@ Imported samples are written under `promptfoo/cases/hf/` and include governance

If Hugging Face traffic is intercepted by a corporate proxy, set `AGCLAW_HF_CA_FILE` to the proxy PEM bundle before running the importer. Use `AGCLAW_HF_ALLOW_INSECURE_TLS=1` only as a temporary fallback.

Build and run the multimodal promptfoo packs after importing both `rico-screen2words` and `ocr-vqa` samples:

```powershell
$repoRoot = git rev-parse --show-toplevel
Set-Location (Join-Path $repoRoot "promptfoo")
npm install
$env:AGCLAW_PROMPTFOO_VISION_PROVIDER = "ollama"
$env:AGCLAW_PROMPTFOO_VISION_BASE_URL = "http://127.0.0.1:11434"
$env:AGCLAW_PROMPTFOO_VISION_MODEL = "qwen2.5vl:3b"
npm run gate:vision-all
```

## Local Benchmark Pass

To compare the small local assistant defaults from this slice:
Expand Down
34 changes: 23 additions & 11 deletions backend/agclaw_backend/mes_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@
)


def _route_suffix(route_key: str) -> str:
normalized = "_".join(part for part in str(route_key or "").strip().upper().split("-") if part)
return f"_{normalized}" if normalized else ""


def _route_env(base_name: str, route_key: str = "") -> str:
    """Look up *base_name* in the environment, preferring a route-specific variant.

    A route-suffixed variable (e.g. ``FOO_HMI`` for route key ``"hmi"``) wins
    when it is set to a non-whitespace value; otherwise the unsuffixed
    variable is consulted. Returns ``""`` when neither is set.
    """
    for candidate in (f"{base_name}{_route_suffix(route_key)}", base_name):
        value = os.getenv(candidate, "").strip()
        if value:
            return value
    return ""


def _request_json(
url: str,
payload: dict[str, object],
Expand Down Expand Up @@ -104,24 +116,24 @@ def _extract_vision_summary(response: dict[str, object]) -> str:
return ""


def _vision_adapter() -> tuple[str, str, str, str]:
provider = os.getenv("AGCLAW_SCREEN_VISION_PROVIDER", "").strip().lower()
base_url = os.getenv("AGCLAW_SCREEN_VISION_BASE_URL", "").strip()
api_key = os.getenv("AGCLAW_SCREEN_VISION_API_KEY", "").strip()
model = os.getenv("AGCLAW_SCREEN_VISION_MODEL", "").strip()
def _vision_adapter(route_key: str = "") -> tuple[str, str, str, str]:
    """Resolve the vision backend settings for *route_key*.

    Returns ``(provider, base_url, api_key, model)`` from the
    ``AGCLAW_SCREEN_VISION_*`` environment variables, honoring any
    route-specific overrides. The provider name is lowercased for
    case-insensitive matching downstream.
    """
    provider_name = _route_env("AGCLAW_SCREEN_VISION_PROVIDER", route_key)
    return (
        provider_name.lower(),
        _route_env("AGCLAW_SCREEN_VISION_BASE_URL", route_key),
        _route_env("AGCLAW_SCREEN_VISION_API_KEY", route_key),
        _route_env("AGCLAW_SCREEN_VISION_MODEL", route_key),
    )


def _vision_timeout_seconds() -> float:
raw_value = os.getenv("AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS", "180").strip()
def _vision_timeout_seconds(route_key: str = "") -> float:
    """Return the vision request timeout in seconds for *route_key*.

    Reads ``AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS`` (optionally
    route-suffixed), enforces a 30-second floor, and falls back to 180
    seconds when the variable is unset or unparseable.
    """
    configured = _route_env("AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS", route_key)
    if not configured:
        configured = "180"
    try:
        seconds = float(configured)
    except ValueError:
        # Garbage input -> conservative default rather than crashing.
        return 180.0
    return seconds if seconds > 30.0 else 30.0


def _run_openai_vision(prompt: str, image_data_url: str) -> tuple[str, str]:
provider, base_url, api_key, model = _vision_adapter()
def _run_openai_vision(prompt: str, image_data_url: str, route_key: str = "") -> tuple[str, str]:
provider, base_url, api_key, model = _vision_adapter(route_key)
if provider not in {"github-models", "openai", "openai-compatible", "ollama", "vllm"} or not base_url or not model or not image_data_url:
return "heuristic", ""

Expand Down Expand Up @@ -149,7 +161,7 @@ def _run_openai_vision(prompt: str, image_data_url: str) -> tuple[str, str]:
target = f"{base_url.rstrip('/')}/chat/completions"
else:
target = f"{base_url.rstrip('/')}/v1/chat/completions"
response = _request_json(target, payload, headers, timeout_seconds=_vision_timeout_seconds())
response = _request_json(target, payload, headers, timeout_seconds=_vision_timeout_seconds(route_key))
return provider, _extract_vision_summary(response)


Expand Down Expand Up @@ -226,7 +238,7 @@ def interpret_screen(request: ScreenInterpretRequest) -> ScreenInterpretResponse
"and any operator prompts. Do not suggest control actions."
)
try:
adapter, vision_summary = _run_openai_vision(vision_prompt, request.image_data_url)
adapter, vision_summary = _run_openai_vision(vision_prompt, request.image_data_url, route_key="hmi")
if vision_summary.strip():
observations.append(f"Vision summary: {vision_summary.strip()}")
except RuntimeError as error:
Expand Down
14 changes: 12 additions & 2 deletions backend/tests/test_live_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
import unittest
from pathlib import Path
from urllib.parse import urljoin
from urllib.request import Request, urlopen

from agclaw_backend.http_api import create_server
Expand All @@ -14,6 +15,10 @@ def _enabled() -> bool:
return os.getenv("AGCLAW_LIVE_VISION_TESTS") == "1"


def _vision_env(base_name: str) -> str:
return os.getenv(f"{base_name}_HMI") or os.getenv(base_name, "")


@unittest.skipUnless(_enabled(), "Set AGCLAW_LIVE_VISION_TESTS=1 to run live local vision checks.")
class LiveVisionTests(unittest.TestCase):
@classmethod
Expand All @@ -23,6 +28,10 @@ def setUpClass(cls) -> None:
"AGCLAW_SCREEN_VISION_BASE_URL": os.getenv("AGCLAW_SCREEN_VISION_BASE_URL"),
"AGCLAW_SCREEN_VISION_API_KEY": os.getenv("AGCLAW_SCREEN_VISION_API_KEY"),
"AGCLAW_SCREEN_VISION_MODEL": os.getenv("AGCLAW_SCREEN_VISION_MODEL"),
"AGCLAW_SCREEN_VISION_PROVIDER_HMI": os.getenv("AGCLAW_SCREEN_VISION_PROVIDER_HMI"),
"AGCLAW_SCREEN_VISION_BASE_URL_HMI": os.getenv("AGCLAW_SCREEN_VISION_BASE_URL_HMI"),
"AGCLAW_SCREEN_VISION_API_KEY_HMI": os.getenv("AGCLAW_SCREEN_VISION_API_KEY_HMI"),
"AGCLAW_SCREEN_VISION_MODEL_HMI": os.getenv("AGCLAW_SCREEN_VISION_MODEL_HMI"),
}

os.environ["AGCLAW_SCREEN_VISION_PROVIDER"] = os.getenv("AGCLAW_SCREEN_VISION_PROVIDER", "ollama")
Expand All @@ -31,10 +40,11 @@ def setUpClass(cls) -> None:
os.environ["AGCLAW_SCREEN_VISION_MODEL"] = os.getenv("AGCLAW_SCREEN_VISION_MODEL", "qwen2.5vl:3b")
os.environ["AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS"] = os.getenv("AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS", "180")

with urlopen("http://127.0.0.1:11434/api/tags", timeout=10) as response:
tags_url = urljoin(_vision_env("AGCLAW_SCREEN_VISION_BASE_URL").rstrip("/") + "/", "api/tags")
with urlopen(tags_url, timeout=10) as response:
payload = json.loads(response.read().decode("utf-8"))
models = {item.get("name", "") for item in payload.get("models", [])}
required_model = os.environ["AGCLAW_SCREEN_VISION_MODEL"]
required_model = _vision_env("AGCLAW_SCREEN_VISION_MODEL")
if required_model not in models:
raise unittest.SkipTest(f"Required local vision model is not installed: {required_model}")

Expand Down
20 changes: 20 additions & 0 deletions docker/docker-compose.litellm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Compose service for the local LiteLLM gateway; pairs with
# litellm/agclaw-config.local.yaml mounted below.
services:
  litellm:
    # Upstream proxy image; rolling "main-latest" tag, not pinned to a release.
    image: ghcr.io/berriai/litellm:main-latest
    ports:
      # Host port is overridable via LITELLM_PORT; the container always listens on 4000.
      - "${LITELLM_PORT:-4000}:4000"
    environment:
      # HuggingFace token — required for vision-*-hosted routes.
      - HF_TOKEN=${HF_TOKEN:-}
      # Override the master key if needed; keep the default only for local dev.
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-agclaw-dev-key}
    volumes:
      # Starter config mounted read-only from the repo checkout.
      - ../litellm/agclaw-config.local.yaml:/app/config.yaml:ro
    command: ["--config", "/app/config.yaml", "--port", "4000"]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:4000/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Allow the proxy time to load the model config before health checks count.
      start_period: 15s
47 changes: 47 additions & 0 deletions docs/agclaw-vision-runbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Validate that `/api/mes/interpret-screen` is using a real vision-capable endpoin
- `openai-compatible`
- `ollama`
- `vllm`
- `github-models`

## Required Environment

Expand All @@ -19,6 +20,15 @@ $env:AGCLAW_SCREEN_VISION_BASE_URL = "http://127.0.0.1:8000"
$env:AGCLAW_SCREEN_VISION_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
```

Task-routed local HMI setup on this workstation:

```powershell
$env:AGCLAW_SCREEN_VISION_PROVIDER_HMI = "ollama"
$env:AGCLAW_SCREEN_VISION_BASE_URL_HMI = "http://127.0.0.1:11500"
$env:AGCLAW_SCREEN_VISION_MODEL_HMI = "qwen2.5vl:7b"
$env:AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS_HMI = "360"
```

Optional:

```powershell
Expand Down Expand Up @@ -49,3 +59,40 @@ python -m agclaw_backend.server --host 127.0.0.1 --port 8008

- Treat all output as advisory-only.
- Do not allow the vision path to generate control commands or bypass approval gates.

## Verified Local Result

This repository has now been validated against real local multimodal endpoints using:

```powershell
$env:AGCLAW_SCREEN_VISION_PROVIDER = "ollama"
$env:AGCLAW_SCREEN_VISION_BASE_URL = "http://127.0.0.1:11500"
$env:AGCLAW_SCREEN_VISION_MODEL = "qwen2.5vl:7b"
```

The validated path covers:

- backend live vision unit test against `backend/tests/fixtures/hmi-sample.png`
- browser E2E upload flow through `Research Workbench -> HMI Review`
- promptfoo `gate:vision-caption`

Expected verified outcome:

- `Adapter: ollama`
- visible `Vision summary:` content
- batch or recipe context called out without control-action instructions

Observed local limitation:

- `qwen2.5vl:3b` on the default local Ollama port fell back to `Adapter: heuristic` on the HMI fixture because the request exceeded available system memory on this workstation.
- `qwen2.5vl:7b` on the alternate E-backed Ollama instance handled backend HMI interpretation and the promptfoo caption pack successfully.
- `gemma3:4b` on the alternate E-backed Ollama instance handled the sampled OCR workload successfully.
- A single local model still does not pass every pack on this workstation, so the most reliable local split is `qwen2.5vl:7b` for screenshot captioning and HMI review, and `gemma3:4b` for the sampled OCR suite.

## Routed LiteLLM Option

If you want a single OpenAI-compatible endpoint with task aliases, use the starter LiteLLM config and route these model names through it:

- `vision-caption-local` -> `qwen2.5vl:7b`
- `vision-hmi-local` -> `qwen2.5vl:7b`
- `vision-ocr-local` -> `gemma3:4b`
61 changes: 61 additions & 0 deletions litellm/agclaw-config.local.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Change api_base to http://127.0.0.1:11500 if you route through the E-backed Ollama instance.
model_list:
  # --- Direct local Ollama models (default instance on port 11434) ---
  - model_name: qwen2.5:3b
    litellm_params:
      model: ollama/qwen2.5:3b
      api_base: http://127.0.0.1:11434

  - model_name: gemma3:1b
    litellm_params:
      model: ollama/gemma3:1b
      api_base: http://127.0.0.1:11434

  - model_name: gemma3:4b
    litellm_params:
      model: ollama/gemma3:4b
      api_base: http://127.0.0.1:11434

  - model_name: qwen2.5vl:3b
    litellm_params:
      model: ollama/qwen2.5vl:3b
      api_base: http://127.0.0.1:11434

  # Larger vision model served by the alternate Ollama instance on port 11500.
  - model_name: qwen2.5vl:7b
    litellm_params:
      model: ollama/qwen2.5vl:7b
      api_base: http://127.0.0.1:11500

  # --- Task-alias routes backed by local models ---
  - model_name: vision-caption-local
    litellm_params:
      model: ollama/qwen2.5vl:7b
      api_base: http://127.0.0.1:11500

  - model_name: vision-hmi-local
    litellm_params:
      model: ollama/qwen2.5vl:7b
      api_base: http://127.0.0.1:11500

  - model_name: vision-ocr-local
    litellm_params:
      model: ollama/gemma3:4b
      api_base: http://127.0.0.1:11500

  # HuggingFace hosted models — requires HF_TOKEN env var.
  - model_name: vision-caption-hosted
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-VL-7B-Instruct
      api_key: os.environ/HF_TOKEN

  - model_name: vision-hmi-hosted
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-VL-7B-Instruct
      api_key: os.environ/HF_TOKEN

  - model_name: vision-ocr-hosted
    litellm_params:
      model: huggingface/google/gemma-3-4b-it
      api_key: os.environ/HF_TOKEN

general_settings:
  # Default bearer token for local dev only; override before any shared deployment.
  master_key: agclaw-dev-key
  completion_model: qwen2.5:3b
49 changes: 49 additions & 0 deletions litellm/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# LiteLLM launcher for the AG-Claw multimodal gateway.
#
# Usage:
#   ./litellm/start.sh [--config <path>] [--port <port>]
#
# Environment:
#   HF_TOKEN        HuggingFace token for hosted-model routes (required for
#                   vision-caption-hosted, vision-hmi-hosted, vision-ocr-hosted).
#   LITELLM_PORT    Listening port (default: 4000).
#   LITELLM_CONFIG  Path to the LiteLLM config file
#                   (default: <this script's directory>/agclaw-config.local.yaml).

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Resolve the directory containing this script so relative defaults work
# regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Environment variables supply the defaults; CLI flags below take precedence.
CONFIG="${LITELLM_CONFIG:-${SCRIPT_DIR}/agclaw-config.local.yaml}"
PORT="${LITELLM_PORT:-4000}"

# Parse optional CLI overrides.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --config)
      CONFIG="$2"
      shift 2
      ;;
    --port)
      PORT="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Refuse to start without a readable config file.
if [[ ! -f "${CONFIG}" ]]; then
  echo "LiteLLM config not found: ${CONFIG}" >&2
  exit 1
fi

# Hosted HuggingFace routes need HF_TOKEN, so warn up front when it is absent.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "Warning: HF_TOKEN is not set — hosted HuggingFace routes will be unavailable." >&2
fi

echo "Starting LiteLLM proxy config=${CONFIG} port=${PORT}"
# exec replaces this shell so signals are delivered directly to litellm.
exec litellm --config "${CONFIG}" --port "${PORT}"
Loading
Loading