From e93b168e1748d6ee3adfad983e575fc6aec96709 Mon Sep 17 00:00:00 2001 From: Mickael Farina Date: Thu, 2 Jul 2026 16:21:11 +0200 Subject: [PATCH] feat(demo): wire Project auto-escalation, fix agent live-progress polling, harden vision-click screenshots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Demo-hardening from the 2026-07-02 dry-run: - Step 10 Q11 finally wired: after a task-shaped chat reply (regex prefilter: >=60 chars + action verb, so casual messages never pay the classifier cost), _should_escalate_to_project runs post-reply and the stream emits an escalate_project frame; the UI renders a suggestion chip ("Start as Project" pre-fills Project mode — user still hits send; "No thanks" → new POST /api/chat/escalate_silence, Q11 session silence). Emits agent_auto_escalated_from_chat audit event. - Agent live progress: _startAgentPoller now auto-resumes after a page reload while the bound agent is active (piggybacked on the existing lifecycle tick, idempotent); schedulePollAgentTimeline's type filter included only legacy types (agent_reply/agent_status) — the runner posts agent_update/agent_blocked/agent_question/agent_done/agent_aborted, so live updates were filtered out on the reply path. - mouse_control: screenshot capture 5s→10s timeout + one retry. Stress test (6 runs): UI-TARS coords stable within 2px, but 2/6 runs died on screencapture timeout — the screenshot, not the model, is the flaky part. Manifest regenerated. Tests: 4 new (escalation prefilter/verdict/never-raises). Full suite 2440 passed (4 pre-existing pilot-e2e failures, known-issues). 
Co-Authored-By: Claude Fable 5 --- codec_chat.html | 41 ++++++++++++++++++++++++-- routes/chat.py | 55 +++++++++++++++++++++++++++++++++++ skills/.manifest.json | 2 +- skills/mouse_control.py | 30 ++++++++++++------- tests/test_escalate_wiring.py | 50 +++++++++++++++++++++++++++++++ 5 files changed, 165 insertions(+), 13 deletions(-) create mode 100644 tests/test_escalate_wiring.py diff --git a/codec_chat.html b/codec_chat.html index 0091f23..118116f 100644 --- a/codec_chat.html +++ b/codec_chat.html @@ -499,6 +499,37 @@

CODEC

return ok; }catch(e){return false} } +// ── Step 10 Q11: Project-promotion suggestion chip (2026-07) ── +// Backend emits {escalate_project:{estimated_checkpoints,reason}} after a +// task-shaped chat reply. "Start as Project" pre-fills the input in Project +// mode (user still hits send — no surprise dispatch); "No thanks" silences +// the suggestion for this session. +var _lastEscalateText=''; +function renderEscalateChip(sugg,userText){ + _lastEscalateText=userText||''; + var n=sugg&&sugg.estimated_checkpoints?sugg.estimated_checkpoints:'several'; + var div=document.createElement('div');div.className='msg assistant'; + div.innerHTML='
'+ + '
\uD83D\uDCA1 This looks like a multi-step project (~'+escHtml(String(n))+' checkpoints)
'+ + '
CODEC can draft a plan, ask for your approval once, then run it autonomously in the background.
'+ + '
'+ + ''+ + ''+ + '
'; + document.getElementById('messages').appendChild(div);scrollBottom(); +} +function escalateStartProject(btn){ + setMode('project'); + var input=document.getElementById('chatInput'); + input.value=_lastEscalateText;input.focus(); + var card=btn.closest('.msg');if(card)card.remove(); + showToast('Project mode — hit send to draft the plan'); +} +function escalateDismiss(btn){ + try{fetch('/api/chat/escalate_silence',{method:'POST',headers:{'Content-Type':'application/json'}, + body:JSON.stringify({session_id:(typeof sessionId!=='undefined'&&sessionId)||''})})}catch(e){} + var card=btn.closest('.msg');if(card)card.remove(); +} function copyCodeBlock(btn){ // Per-code-block copy (2026-07): grabs the rendered code text (already // HTML-unescaped by innerText) and reuses copyMsgText's clipboard path. @@ -1135,7 +1166,7 @@

CODEC

if(!line.startsWith('data: '))continue; var payload=line.substring(6); if(payload==='[DONE]')break; - try{var j=JSON.parse(payload);if(j.token){if(firstToken){bubble.innerHTML='';firstToken=false}fullText+=j.token;bubble.innerHTML=formatMsg(fullText);scrollBottom()}if(j.error){fullText+='\n\nError: '+j.error}}catch(pe){} + try{var j=JSON.parse(payload);if(j.token){if(firstToken){bubble.innerHTML='';firstToken=false}fullText+=j.token;bubble.innerHTML=formatMsg(fullText);scrollBottom()}if(j.escalate_project){renderEscalateChip(j.escalate_project,text)}if(j.error){fullText+='\n\nError: '+j.error}}catch(pe){} } } if(fullText){div.remove();addMessage('assistant',fullText);chatHist.push({role:'assistant',content:fullText});saveMessages([{role:'assistant',content:fullText}])} @@ -1839,7 +1870,7 @@

CODEC

if (!r.ok) return; var data = await r.json(); var msgs = (data.messages || []).filter(function(m){ - return (m.ts || 0) > since && (m.type === 'agent_reply' || m.type === 'agent_status' || m.type === 'plan_revision'); + return (m.ts || 0) > since && ['agent_reply','agent_status','plan_revision','agent_update','agent_blocked','agent_question','agent_done','agent_aborted'].indexOf(m.type) >= 0; }); if (msgs.length){ clearInterval(poll); @@ -1868,6 +1899,12 @@

CODEC

if (!r.ok) return; var data = await r.json(); var status = (data.manifest && data.manifest.status) || data.status || ''; + // Resume live updates after a page reload while the agent is active + // (2026-07 demo fix — poller previously only started from the approve click). + var activeStates = ['approved','running','paused']; + if (activeStates.indexOf(status) >= 0 || status.indexOf('blocked_') === 0){ + _startAgentPoller(_activeAgentId); + } var terminal = ['done','complete','completed','failed','error','aborted','user_aborted']; if (terminal.indexOf(status) >= 0){ addMessage('assistant', '`'+_activeAgentId.slice(0,12)+'` reached `'+status+'`. Conversation closed — next message starts a fresh project.'); diff --git a/routes/chat.py b/routes/chat.py index 0538527..c90748c 100644 --- a/routes/chat.py +++ b/routes/chat.py @@ -40,6 +40,10 @@ import codec_llm # A-12 canonical LLM caller from codec_chat_stream import SkillTagBuffer, SKILL_TAG_RE # A-6 token machine from codec_chat_pipeline import _StepBudget, _is_conversational # B6-P2 +from codec_chat_pipeline import ( # Step 10 Q11 wiring (2026-07) + _should_escalate_to_project, + silence_session_autoescalate, +) from routes._shared import CONFIG_PATH router = APIRouter() @@ -616,12 +620,59 @@ def _build_chat_system_prompt(config: dict, budget, has_attachment: bool, +import re as _re_esc + +_ESCALATE_HINT_RE = _re_esc.compile( + r"\b(build|create|research|plan|organi[sz]e|automate|migrate|design|" + r"set\s?up|write me|make me|prepare|launch|develop)\b", _re_esc.IGNORECASE) + + +def _maybe_escalate_suggestion(user_text: str, session_id: str): + """Step 10 auto-escalation, finally wired (2026-07). Runs AFTER the reply + so it never adds latency to the answer itself. The regex prefilter keeps + the Qwen classifier call off casual messages — only task-shaped text + (>= 60 chars + an action verb) pays for classification. 
Returns the + suggestion dict for the UI chip, or None.""" + try: + if len(user_text or "") < 60 or not _ESCALATE_HINT_RE.search(user_text): + return None + verdict = _should_escalate_to_project(user_text, session_id) + if not verdict.get("escalate"): + return None + log_event("agent_auto_escalated_from_chat", "codec-dashboard", + f"Suggested Project promotion ({verdict.get('estimated_checkpoints')} checkpoints)", + extra={"session_id": session_id, + "estimated_checkpoints": verdict.get("estimated_checkpoints"), + "verdict": verdict.get("reason", "")[:200], + "silenced": False}) + return {"estimated_checkpoints": verdict.get("estimated_checkpoints"), + "reason": (verdict.get("reason") or "")[:200]} + except Exception as e: + log.debug(f"escalation check failed (non-fatal): {e}") + return None + + +@router.post("/api/chat/escalate_silence") +async def escalate_silence(request: Request): + """Q11: user said "No thanks" to a Project suggestion — silence the + prompt for the rest of this chat session (in-memory, resets on restart).""" + try: + body = await request.json() + except Exception: + body = {} + sid = str(body.get("session_id") or "") + if sid: + silence_session_autoescalate(sid) + return {"ok": True, "silenced": bool(sid)} + + @router.post("/api/chat") async def chat_completion(request: Request): """Direct LLM chat with full context window + tool calling""" from codec_metrics import metrics metrics.inc("codec_chat_requests_total") body = await request.json() + _session_id = request.query_params.get("s") or "" messages = body.get("messages", []) if not messages: return JSONResponse({"error": "No messages"}, status_code=400) @@ -868,6 +919,10 @@ def _resolve_skill_tag(raw_tag): "busy, restarting, or out of context. Please try " "again in a moment." ) + # Step 10 Q11 (2026-07): post-reply Project suggestion. 
+ _sugg = _maybe_escalate_suggestion(last_user_text, _session_id) + if _sugg: + yield f"data: {json.dumps({'escalate_project': _sugg})}\n\n" yield "data: [DONE]\n\n" except Exception as e: yield f"data: {json.dumps({'error': str(e)})}\n\n" diff --git a/skills/.manifest.json b/skills/.manifest.json index a969b53..3801c13 100644 --- a/skills/.manifest.json +++ b/skills/.manifest.json @@ -56,7 +56,7 @@ "memory_history.py": "a2762c03c325517d10907f8aa9511103a5716a29f9e956d48473b817105fb65c", "memory_save.py": "3d801338bfd0818aaf1e65e692e404af0a0fba6886d26150f0fb66e4b4fde424", "memory_search.py": "249e8644254e039cbdf7155fd372b85c234e18483e8e7ea7361ce28bf8fb875f", - "mouse_control.py": "85398544e83ecfc9d01c6ec9f9d55f50bf9fd9228c95a0d02f82da014f00f725", + "mouse_control.py": "d00f8b94d008b2f69a0ee1880039a53b46454a097f599b3cfbce88db06839ae4", "music.py": "c42bddc6414b11e8c4734cd826a824a30c4ff34a618d69da11a5d84b737b2f55", "network_info.py": "bd776b619cf7c18d67fe03cb0f0456cf9c4f9bf71475740a233a9ca1e6672fcd", "notes.py": "7d50d1544ea955f59917a1f0e7902d115e9dbccd3188b641057a55b6a5b2803a", diff --git a/skills/mouse_control.py b/skills/mouse_control.py index 32e5fa8..7e35330 100644 --- a/skills/mouse_control.py +++ b/skills/mouse_control.py @@ -147,19 +147,29 @@ def _get_screen_size(): return (1920, 1080) -def _take_screenshot(): - """Capture screen and return base64-encoded PNG.""" +def _take_screenshot(timeout_s=10, attempts=2): + """Capture screen and return base64-encoded PNG. + + 2026-07 hardening: under load screencapture intermittently exceeds the + old 5s cap (stress test: 2 of 6 vision-locate runs failed on screenshot + timeout while the model itself was stable). 
One retry + a 10s cap turn + a hard "Could not take screenshot" into a rare slow path.""" try: os.makedirs(os.path.dirname(_SCREENSHOT_PATH), exist_ok=True) - subprocess.run( - ["screencapture", "-x", "-C", _SCREENSHOT_PATH], - capture_output=True, timeout=5 - ) - if os.path.exists(_SCREENSHOT_PATH) and os.path.getsize(_SCREENSHOT_PATH) > 1000: - with open(_SCREENSHOT_PATH, "rb") as f: - return base64.b64encode(f.read()).decode() except Exception as e: - log.warning(f"Screenshot error: {e}") + log.warning(f"Screenshot dir error: {e}") + return None + for attempt in range(attempts): + try: + subprocess.run( + ["screencapture", "-x", "-C", _SCREENSHOT_PATH], + capture_output=True, timeout=timeout_s + ) + if os.path.exists(_SCREENSHOT_PATH) and os.path.getsize(_SCREENSHOT_PATH) > 1000: + with open(_SCREENSHOT_PATH, "rb") as f: + return base64.b64encode(f.read()).decode() + except Exception as e: + log.warning(f"Screenshot error (attempt {attempt + 1}/{attempts}): {e}") return None diff --git a/tests/test_escalate_wiring.py b/tests/test_escalate_wiring.py new file mode 100644 index 0000000..ec8d342 --- /dev/null +++ b/tests/test_escalate_wiring.py @@ -0,0 +1,50 @@ +"""Step 10 Q11 wiring (2026-07): post-reply Project-promotion suggestion. + +The regex prefilter must keep the Qwen classifier off casual messages; +the silence endpoint must mark the session.""" +from __future__ import annotations + +import sys +from pathlib import Path + +REPO = Path(__file__).resolve().parent.parent +if str(REPO) not in sys.path: + sys.path.insert(0, str(REPO)) + +import routes.chat as chat + + +def test_prefilter_skips_short_or_casual(monkeypatch): + def boom(*a, **kw): + raise AssertionError("classifier must not be called for casual text") + monkeypatch.setattr(chat, "_should_escalate_to_project", boom) + assert chat._maybe_escalate_suggestion("hi", "s1") is None + assert chat._maybe_escalate_suggestion("what's the weather like today?" 
* 3, "s1") is None + + +def test_prefilter_passes_task_shaped_text(monkeypatch): + calls = {} + def fake_gate(text, sid): + calls["hit"] = (text, sid) + return {"escalate": True, "estimated_checkpoints": 4, "reason": "multi-step"} + monkeypatch.setattr(chat, "_should_escalate_to_project", fake_gate) + monkeypatch.setattr(chat, "log_event", lambda *a, **kw: None) + out = chat._maybe_escalate_suggestion( + "research the top 5 competitors in my niche, build a comparison and prepare a report", + "sess42") + assert out == {"estimated_checkpoints": 4, "reason": "multi-step"} + assert calls["hit"][1] == "sess42" + + +def test_gate_negative_verdict_returns_none(monkeypatch): + monkeypatch.setattr(chat, "_should_escalate_to_project", + lambda t, s: {"escalate": False, "reason": "single-step"}) + assert chat._maybe_escalate_suggestion( + "build me one tiny thing that is actually simple but worded long enough", "s") is None + + +def test_gate_never_raises(monkeypatch): + monkeypatch.setattr(chat, "_should_escalate_to_project", + lambda t, s: (_ for _ in ()).throw(RuntimeError("qwen down"))) + assert chat._maybe_escalate_suggestion( + "research and build and prepare a giant multi step plan for my business", "s") is None