Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,17 +175,19 @@ If you want one control plane in front of multiple models, point the UI at LiteL
```powershell
$repoRoot = git rev-parse --show-toplevel
Set-Location $repoRoot
litellm --host 127.0.0.1 --port 4000
.\scripts\start-litellm.ps1
```

Then in the AG-Claw settings UI:

- Provider: `openai-compatible`
- API URL: `http://127.0.0.1:4000`
- API key: your LiteLLM bearer token if enabled
- API key: `agclaw-dev-key` by default, or your LiteLLM bearer token if you changed it

That same gateway URL also works with the local benchmark script below.

The starter config lives at `litellm/agclaw-config.local.yaml` and exposes `qwen2.5:3b`, `gemma3:1b`, `gemma3:4b`, `qwen2.5vl:3b`, and `qwen2.5vl:7b` — plus the `vision-*-local` task aliases — through one OpenAI-compatible endpoint.

## Governed Eval Assets

The `promptfoo` pack now includes an allowlisted Hugging Face ingestion path for evaluation assets.
Expand All @@ -201,6 +203,18 @@ Imported samples are written under `promptfoo/cases/hf/` and include governance

If Hugging Face traffic is intercepted by a corporate proxy, set `AGCLAW_HF_CA_FILE` to the proxy PEM bundle before running the importer. Use `AGCLAW_HF_ALLOW_INSECURE_TLS=1` only as a temporary fallback.

Build and run the multimodal promptfoo packs after importing both `rico-screen2words` and `ocr-vqa` samples:

```powershell
$repoRoot = git rev-parse --show-toplevel
Set-Location (Join-Path $repoRoot "promptfoo")
npm install
$env:AGCLAW_PROMPTFOO_VISION_PROVIDER = "ollama"
$env:AGCLAW_PROMPTFOO_VISION_BASE_URL = "http://127.0.0.1:11434"
$env:AGCLAW_PROMPTFOO_VISION_MODEL = "qwen2.5vl:3b"
npm run gate:vision-all
```

## Local Benchmark Pass

To compare the small local assistant defaults from this slice:
Expand Down
34 changes: 23 additions & 11 deletions backend/agclaw_backend/mes_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@
)


def _route_suffix(route_key: str) -> str:
normalized = "_".join(part for part in str(route_key or "").strip().upper().split("-") if part)
return f"_{normalized}" if normalized else ""


def _route_env(base_name: str, route_key: str = "") -> str:
    """Look up *base_name* in the environment, preferring a route-specific variant.

    A route-suffixed variable (e.g. ``FOO_HMI`` for route key ``"hmi"``) wins
    when it is set to a non-whitespace value; otherwise the unsuffixed
    variable is consulted. Returns ``""`` when neither is set.
    """
    for candidate in (f"{base_name}{_route_suffix(route_key)}", base_name):
        value = os.getenv(candidate, "").strip()
        if value:
            return value
    return ""


def _request_json(
url: str,
payload: dict[str, object],
Expand Down Expand Up @@ -104,24 +116,24 @@ def _extract_vision_summary(response: dict[str, object]) -> str:
return ""


def _vision_adapter() -> tuple[str, str, str, str]:
provider = os.getenv("AGCLAW_SCREEN_VISION_PROVIDER", "").strip().lower()
base_url = os.getenv("AGCLAW_SCREEN_VISION_BASE_URL", "").strip()
api_key = os.getenv("AGCLAW_SCREEN_VISION_API_KEY", "").strip()
model = os.getenv("AGCLAW_SCREEN_VISION_MODEL", "").strip()
def _vision_adapter(route_key: str = "") -> tuple[str, str, str, str]:
    """Resolve the vision backend settings for *route_key*.

    Returns ``(provider, base_url, api_key, model)`` from the
    ``AGCLAW_SCREEN_VISION_*`` environment variables, honoring any
    route-specific overrides. The provider name is lowercased for
    case-insensitive matching downstream.
    """
    provider_name = _route_env("AGCLAW_SCREEN_VISION_PROVIDER", route_key)
    return (
        provider_name.lower(),
        _route_env("AGCLAW_SCREEN_VISION_BASE_URL", route_key),
        _route_env("AGCLAW_SCREEN_VISION_API_KEY", route_key),
        _route_env("AGCLAW_SCREEN_VISION_MODEL", route_key),
    )


def _vision_timeout_seconds() -> float:
raw_value = os.getenv("AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS", "180").strip()
def _vision_timeout_seconds(route_key: str = "") -> float:
    """Return the vision request timeout in seconds for *route_key*.

    Reads ``AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS`` (optionally
    route-suffixed), enforces a 30-second floor, and falls back to 180
    seconds when the variable is unset or unparseable.
    """
    configured = _route_env("AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS", route_key)
    if not configured:
        configured = "180"
    try:
        seconds = float(configured)
    except ValueError:
        # Garbage input -> conservative default rather than crashing.
        return 180.0
    return seconds if seconds > 30.0 else 30.0


def _run_openai_vision(prompt: str, image_data_url: str) -> tuple[str, str]:
provider, base_url, api_key, model = _vision_adapter()
def _run_openai_vision(prompt: str, image_data_url: str, route_key: str = "") -> tuple[str, str]:
provider, base_url, api_key, model = _vision_adapter(route_key)
if provider not in {"github-models", "openai", "openai-compatible", "ollama", "vllm"} or not base_url or not model or not image_data_url:
return "heuristic", ""

Expand Down Expand Up @@ -149,7 +161,7 @@ def _run_openai_vision(prompt: str, image_data_url: str) -> tuple[str, str]:
target = f"{base_url.rstrip('/')}/chat/completions"
else:
target = f"{base_url.rstrip('/')}/v1/chat/completions"
response = _request_json(target, payload, headers, timeout_seconds=_vision_timeout_seconds())
response = _request_json(target, payload, headers, timeout_seconds=_vision_timeout_seconds(route_key))
return provider, _extract_vision_summary(response)


Expand Down Expand Up @@ -226,7 +238,7 @@ def interpret_screen(request: ScreenInterpretRequest) -> ScreenInterpretResponse
"and any operator prompts. Do not suggest control actions."
)
try:
adapter, vision_summary = _run_openai_vision(vision_prompt, request.image_data_url)
adapter, vision_summary = _run_openai_vision(vision_prompt, request.image_data_url, route_key="hmi")
if vision_summary.strip():
observations.append(f"Vision summary: {vision_summary.strip()}")
except RuntimeError as error:
Expand Down
14 changes: 12 additions & 2 deletions backend/tests/test_live_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
import unittest
from pathlib import Path
from urllib.parse import urljoin
from urllib.request import Request, urlopen

from agclaw_backend.http_api import create_server
Expand All @@ -14,6 +15,10 @@ def _enabled() -> bool:
return os.getenv("AGCLAW_LIVE_VISION_TESTS") == "1"


def _vision_env(base_name: str) -> str:
return os.getenv(f"{base_name}_HMI") or os.getenv(base_name, "")


@unittest.skipUnless(_enabled(), "Set AGCLAW_LIVE_VISION_TESTS=1 to run live local vision checks.")
class LiveVisionTests(unittest.TestCase):
@classmethod
Expand All @@ -23,6 +28,10 @@ def setUpClass(cls) -> None:
"AGCLAW_SCREEN_VISION_BASE_URL": os.getenv("AGCLAW_SCREEN_VISION_BASE_URL"),
"AGCLAW_SCREEN_VISION_API_KEY": os.getenv("AGCLAW_SCREEN_VISION_API_KEY"),
"AGCLAW_SCREEN_VISION_MODEL": os.getenv("AGCLAW_SCREEN_VISION_MODEL"),
"AGCLAW_SCREEN_VISION_PROVIDER_HMI": os.getenv("AGCLAW_SCREEN_VISION_PROVIDER_HMI"),
"AGCLAW_SCREEN_VISION_BASE_URL_HMI": os.getenv("AGCLAW_SCREEN_VISION_BASE_URL_HMI"),
"AGCLAW_SCREEN_VISION_API_KEY_HMI": os.getenv("AGCLAW_SCREEN_VISION_API_KEY_HMI"),
"AGCLAW_SCREEN_VISION_MODEL_HMI": os.getenv("AGCLAW_SCREEN_VISION_MODEL_HMI"),
}

os.environ["AGCLAW_SCREEN_VISION_PROVIDER"] = os.getenv("AGCLAW_SCREEN_VISION_PROVIDER", "ollama")
Expand All @@ -31,10 +40,11 @@ def setUpClass(cls) -> None:
os.environ["AGCLAW_SCREEN_VISION_MODEL"] = os.getenv("AGCLAW_SCREEN_VISION_MODEL", "qwen2.5vl:3b")
os.environ["AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS"] = os.getenv("AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS", "180")

with urlopen("http://127.0.0.1:11434/api/tags", timeout=10) as response:
tags_url = urljoin(_vision_env("AGCLAW_SCREEN_VISION_BASE_URL").rstrip("/") + "/", "api/tags")
with urlopen(tags_url, timeout=10) as response:
payload = json.loads(response.read().decode("utf-8"))
models = {item.get("name", "") for item in payload.get("models", [])}
required_model = os.environ["AGCLAW_SCREEN_VISION_MODEL"]
required_model = _vision_env("AGCLAW_SCREEN_VISION_MODEL")
if required_model not in models:
raise unittest.SkipTest(f"Required local vision model is not installed: {required_model}")

Expand Down
20 changes: 20 additions & 0 deletions docker/docker-compose.litellm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Compose service for the local LiteLLM gateway; pairs with
# litellm/agclaw-config.local.yaml mounted below.
services:
  litellm:
    # Upstream proxy image; rolling "main-latest" tag, not pinned to a release.
    image: ghcr.io/berriai/litellm:main-latest
    ports:
      # Host port is overridable via LITELLM_PORT; the container always listens on 4000.
      - "${LITELLM_PORT:-4000}:4000"
    environment:
      # HuggingFace token — required for vision-*-hosted routes.
      - HF_TOKEN=${HF_TOKEN:-}
      # Override the master key if needed; keep the default only for local dev.
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-agclaw-dev-key}
    volumes:
      # Starter config mounted read-only from the repo checkout.
      - ../litellm/agclaw-config.local.yaml:/app/config.yaml:ro
    command: ["--config", "/app/config.yaml", "--port", "4000"]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:4000/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Allow the proxy time to load the model config before health checks count.
      start_period: 15s
47 changes: 47 additions & 0 deletions docs/agclaw-vision-runbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Validate that `/api/mes/interpret-screen` is using a real vision-capable endpoin
- `openai-compatible`
- `ollama`
- `vllm`
- `github-models`

## Required Environment

Expand All @@ -19,6 +20,15 @@ $env:AGCLAW_SCREEN_VISION_BASE_URL = "http://127.0.0.1:8000"
$env:AGCLAW_SCREEN_VISION_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
```

Task-routed local HMI setup on this workstation:

```powershell
$env:AGCLAW_SCREEN_VISION_PROVIDER_HMI = "ollama"
$env:AGCLAW_SCREEN_VISION_BASE_URL_HMI = "http://127.0.0.1:11500"
$env:AGCLAW_SCREEN_VISION_MODEL_HMI = "qwen2.5vl:7b"
$env:AGCLAW_SCREEN_VISION_TIMEOUT_SECONDS_HMI = "360"
```

Optional:

```powershell
Expand Down Expand Up @@ -49,3 +59,40 @@ python -m agclaw_backend.server --host 127.0.0.1 --port 8008

- Treat all output as advisory-only.
- Do not allow the vision path to generate control commands or bypass approval gates.

## Verified Local Result

This repository has now been validated against real local multimodal endpoints using:

```powershell
$env:AGCLAW_SCREEN_VISION_PROVIDER = "ollama"
$env:AGCLAW_SCREEN_VISION_BASE_URL = "http://127.0.0.1:11500"
$env:AGCLAW_SCREEN_VISION_MODEL = "qwen2.5vl:7b"
```

The validated path covers:

- backend live vision unit test against `backend/tests/fixtures/hmi-sample.png`
- browser E2E upload flow through `Research Workbench -> HMI Review`
- promptfoo `gate:vision-caption`

Expected verified outcome:

- `Adapter: ollama`
- visible `Vision summary:` content
- batch or recipe context called out without control-action instructions

Observed local limitation:

- `qwen2.5vl:3b` on the default local Ollama port fell back to `Adapter: heuristic` on the HMI fixture because the request exceeded available system memory on this workstation.
- `qwen2.5vl:7b` on the alternate E-backed Ollama instance handled backend HMI interpretation and the promptfoo caption pack successfully.
- `gemma3:4b` on the alternate E-backed Ollama instance handled the sampled OCR workload successfully.
- A single local model still does not pass every pack on this workstation, so the most reliable local split is `qwen2.5vl:7b` for screenshot captioning and HMI review, and `gemma3:4b` for the sampled OCR suite.

## Routed LiteLLM Option

If you want a single OpenAI-compatible endpoint with task aliases, use the starter LiteLLM config and route these model names through it:

- `vision-caption-local` -> `qwen2.5vl:7b`
- `vision-hmi-local` -> `qwen2.5vl:7b`
- `vision-ocr-local` -> `gemma3:4b`
61 changes: 61 additions & 0 deletions litellm/agclaw-config.local.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Change api_base to http://127.0.0.1:11500 if you route through the E-backed Ollama instance.
model_list:
  # --- Direct local Ollama models (default instance on port 11434) ---
  - model_name: qwen2.5:3b
    litellm_params:
      model: ollama/qwen2.5:3b
      api_base: http://127.0.0.1:11434

  - model_name: gemma3:1b
    litellm_params:
      model: ollama/gemma3:1b
      api_base: http://127.0.0.1:11434

  - model_name: gemma3:4b
    litellm_params:
      model: ollama/gemma3:4b
      api_base: http://127.0.0.1:11434

  - model_name: qwen2.5vl:3b
    litellm_params:
      model: ollama/qwen2.5vl:3b
      api_base: http://127.0.0.1:11434

  # Larger vision model served by the alternate Ollama instance on port 11500.
  - model_name: qwen2.5vl:7b
    litellm_params:
      model: ollama/qwen2.5vl:7b
      api_base: http://127.0.0.1:11500

  # --- Task-alias routes backed by local models ---
  - model_name: vision-caption-local
    litellm_params:
      model: ollama/qwen2.5vl:7b
      api_base: http://127.0.0.1:11500

  - model_name: vision-hmi-local
    litellm_params:
      model: ollama/qwen2.5vl:7b
      api_base: http://127.0.0.1:11500

  - model_name: vision-ocr-local
    litellm_params:
      model: ollama/gemma3:4b
      api_base: http://127.0.0.1:11500

  # HuggingFace hosted models — requires HF_TOKEN env var.
  - model_name: vision-caption-hosted
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-VL-7B-Instruct
      api_key: os.environ/HF_TOKEN

  - model_name: vision-hmi-hosted
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-VL-7B-Instruct
      api_key: os.environ/HF_TOKEN

  - model_name: vision-ocr-hosted
    litellm_params:
      model: huggingface/google/gemma-3-4b-it
      api_key: os.environ/HF_TOKEN

general_settings:
  # Default bearer token for local dev only; override before any shared deployment.
  master_key: agclaw-dev-key
  completion_model: qwen2.5:3b
49 changes: 49 additions & 0 deletions litellm/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# LiteLLM launcher for the AG-Claw multimodal gateway.
#
# Usage:
#   ./litellm/start.sh [--config <path>] [--port <port>]
#
# Environment:
#   HF_TOKEN        HuggingFace token for hosted-model routes (required for
#                   vision-caption-hosted, vision-hmi-hosted, vision-ocr-hosted).
#   LITELLM_PORT    Listening port (default: 4000).
#   LITELLM_CONFIG  Path to the LiteLLM config file
#                   (default: <this script's directory>/agclaw-config.local.yaml).

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Resolve the directory containing this script so relative defaults work
# regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Environment variables supply the defaults; CLI flags below take precedence.
CONFIG="${LITELLM_CONFIG:-${SCRIPT_DIR}/agclaw-config.local.yaml}"
PORT="${LITELLM_PORT:-4000}"

# Parse optional CLI overrides.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --config)
      CONFIG="$2"
      shift 2
      ;;
    --port)
      PORT="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Refuse to start without a readable config file.
if [[ ! -f "${CONFIG}" ]]; then
  echo "LiteLLM config not found: ${CONFIG}" >&2
  exit 1
fi

# Hosted HuggingFace routes need HF_TOKEN, so warn up front when it is absent.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "Warning: HF_TOKEN is not set — hosted HuggingFace routes will be unavailable." >&2
fi

echo "Starting LiteLLM proxy config=${CONFIG} port=${PORT}"
# exec replaces this shell so signals are delivered directly to litellm.
exec litellm --config "${CONFIG}" --port "${PORT}"
Loading
Loading