Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions DEPLOY_AWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,31 @@ The endpoint starts but chat completions return 500. The error is generic; the *
2. **Model name** – Cloud Brain now auto-discovers the model from `/v1/models`. If RunPod uses `OPENAI_SERVED_MODEL_NAME_OVERRIDE`, the discovered name will be used automatically.

3. **Image size** – If logs show OOM during inference, reduce frame size on the Mac: in `webcam_stream.py`, lower `FRAME_SIZE` (e.g. `(384, 384)`) and/or `JPEG_QUALITY`.

---

## Backend Selection (GUARD_BACKEND)

The guard's model calls go through a `GuardBackend` interface so we can swap
between request/response and streaming servers without changing conversation
code.

| Value | Serving process | Transport | Status |
|-------|-----------------|-----------|--------|
| `vllm` (default) | vLLM's OpenAI-compatible HTTP server | HTTP `/v1/chat/completions` | Ready. |
| `realtime` | OpenBMB's MiniCPM-o realtime server | WebSocket | Not implemented — see follow-up plan; requires the realtime server to be running on the MiniCPM-o host. |

Set on the Mac side (or wherever `main.py` runs):

```bash
export GUARD_BACKEND=vllm # default
```

`vllm` uses `MINICPMO_URL` / `CLOUD_AI_URL` as today. `realtime` will use a
separate `MINICPMO_REALTIME_URL` (websocket) once that backend lands.

Tunable while on `vllm`:

- `VLLM_FRAME_BUFFER_SIZE` (default `1`) — how many recent frames the
backend buffers per encounter. Currently only the newest frame is sent per
turn; larger buffers are reserved for a future frame-burst experiment.
27 changes: 27 additions & 0 deletions backend_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Pick a GuardBackend based on the GUARD_BACKEND env var.

GUARD_BACKEND=vllm (default) — VllmBackend, request/response HTTP.
GUARD_BACKEND=realtime — RealtimeBackend (not implemented in this
plan; see follow-up plan). Requires
OpenBMB's realtime WebSocket server to
be running on the MiniCPM-o host.
"""
import os

from guard_backend import GuardBackend
from vllm_backend import VllmBackend


def get_backend() -> GuardBackend:
name = (os.environ.get("GUARD_BACKEND") or "vllm").strip().lower()
if name == "vllm":
return VllmBackend()
if name == "realtime":
raise NotImplementedError(
"GUARD_BACKEND=realtime requires the OpenBMB realtime server "
"to be running on the MiniCPM-o host. Not implemented in this "
"plan — see the follow-up plan for the RealtimeBackend."
)
raise ValueError(
f"Unknown GUARD_BACKEND: {name!r} (expected 'vllm' or 'realtime')"
)
22 changes: 18 additions & 4 deletions conversation_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from typing import Any, Callable, Awaitable

from db import init_db, create_conversation, add_turn, close_conversation
from minicpmo_client import chat as minicpmo_chat
from guard_backend import GuardBackend, GuardEncounter
from mic_listener import listen_for_response

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -142,6 +142,8 @@ def _broadcast(event: dict) -> None:
class ConversationManager:
def __init__(
self,
*,
backend: GuardBackend,
play_audio_fn: Callable[[bytes, str], Awaitable[None]],
get_frame_fn: Callable[[], bytes | None],
camera_id: str | None = None,
Expand All @@ -151,11 +153,13 @@ def __init__(
transcribe_fn: Callable[[bytes], Awaitable[str | None]] | None = None,
):
"""
backend: GuardBackend instance (opens GuardEncounter per conversation)
play_audio_fn: async (wav_bytes, format) → None (calls _guarded_play with force=True)
get_frame_fn: sync () → jpeg_bytes | None (called in thread)
speak_text_fn: async (text) → None (OpenAI TTS fallback when VLM returns no audio)
transcribe_fn: async (wav_bytes) → str | None (OpenAI Whisper transcription of person's audio)
"""
self.backend = backend
self.play_audio_fn = play_audio_fn
self.get_frame_fn = get_frame_fn
self.speak_text_fn = speak_text_fn
Expand All @@ -174,6 +178,7 @@ def __init__(
self._last_ended: float = 0.0
self._speaking_event = threading.Event() # set while TTS audio is playing
self._audio_task: asyncio.Task | None = None # background audio dispatch task
self._encounter: GuardEncounter | None = None

# ── Public API ────────────────────────────────────────────────────────────

Expand Down Expand Up @@ -237,6 +242,7 @@ async def _run_conversation(self, jpeg_bytes: bytes, initial_text: str | None =
self._set_state(ConversationState.WARNING)

try:
self._encounter = await self.backend.open_encounter()
if initial_text:
# The initial detection message was already spoken aloud by the Cloud AI.
# Log it as GUARD turn 1 and skip _do_turn so we don't speak a second
Expand Down Expand Up @@ -269,10 +275,12 @@ async def _do_turn(self, jpeg_bytes: bytes) -> None:
"""Send frame (+ optional prior context) to MiniCPM-o, play response."""
self._set_state(ConversationState.SPEAKING)
try:
text, wav_bytes = await minicpmo_chat(
jpeg_bytes=jpeg_bytes,
assert self._encounter is not None, "_do_turn called before _run_conversation opened an encounter"
await self._encounter.push_frame(jpeg_bytes)
text, wav_bytes = await self._encounter.turn(
audio_bytes=None,
system_prompt=self.system_prompt,
conversation_history=self.history,
history=self.history,
user_text=_TURN_ROLE_REMINDER,
)
except Exception as exc:
Expand Down Expand Up @@ -420,6 +428,12 @@ async def _end_conversation(self) -> None:
except asyncio.CancelledError:
pass
self._speaking_event.clear()
if self._encounter is not None:
try:
await self._encounter.close()
except Exception as exc:
logger.warning("[ConvMgr] encounter.close() failed: %s", exc)
self._encounter = None
ended_at = _now()
state = self.state

Expand Down
Loading