diff --git a/codec_chat.html b/codec_chat.html index abef1cd..0091f23 100644 --- a/codec_chat.html +++ b/codec_chat.html @@ -168,6 +168,11 @@ .empty-state img{width:140px;opacity:.25;margin-bottom:20px;filter:drop-shadow(0 0 30px rgba(232,113,26,0.4))} .empty-state p{color:var(--text-muted);font-size:16px;line-height:1.6} pre{background:var(--bg4);padding:10px;border-radius:8px;overflow-x:auto;font-family:var(--mono);font-size:12px;margin:8px 0} +.code-wrap{position:relative;margin:8px 0} +.code-wrap pre{margin:0;padding-right:38px} +.code-copy{position:absolute;top:6px;right:6px;background:var(--bg2,rgba(0,0,0,.35));border:1px solid var(--bg4,#444);border-radius:6px;color:var(--text-dim,#9aa);padding:3px 6px;cursor:pointer;opacity:.65;transition:opacity .15s,color .15s,border-color .15s;line-height:0} +.code-copy:hover{opacity:1;color:var(--accent);border-color:var(--accent)} +.code-copy.copied{opacity:1;color:var(--success,#4caf50);border-color:var(--success,#4caf50)} code{font-family:var(--mono);font-size:12px;background:var(--bg4);padding:2px 6px;border-radius:4px} input[type=file]{display:none} @@ -494,6 +499,12 @@

CODEC

return ok; }catch(e){return false} } +function copyCodeBlock(btn){ + // Per-code-block copy (2026-07): grabs the rendered code text (already + // HTML-unescaped by innerText) and reuses copyMsgText's clipboard path. + var code=btn.parentNode.querySelector('code'); + if(code)copyMsgText(code.innerText.replace(/\n$/,''),btn); +} function copyMsgText(text,btn){ function ok(){btn.classList.add('copied');showToast('Copied');setTimeout(function(){btn.classList.remove('copied')},1500)} function fail(){if(_copyFallback(text)){ok()}else{showToast('Copy failed')}} @@ -733,7 +744,7 @@

CODEC

function escHtml(s){var d=document.createElement('div');d.textContent=s||'';return d.innerHTML} function formatMsg(c){ var f=escHtml(c); - f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'
$2
'); + f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'
$2
'); f=f.replace(/`([^`]+)`/g,'$1'); f=f.replace(/\*\*(.+?)\*\*/g,'$1'); f=f.replace(/\*(.+?)\*/g,'$1'); diff --git a/codec_chat_stream.py b/codec_chat_stream.py index 7960d8c..5a9bb43 100644 --- a/codec_chat_stream.py +++ b/codec_chat_stream.py @@ -48,6 +48,10 @@ def __init__(self, resolve_skill_tag: Callable[[str], str]): self.skill_buf = "" self.buffering = False self.visible_chars = 0 + # Count of complete [SKILL:...] tags handed to the resolver — lets the + # caller's blank-bubble fallback distinguish "all output was dropped + # tool tags" from "the model produced nothing at all" (2026-07 fix). + self.tags_resolved = 0 def _count(self, text: str) -> str: """Account visible chars (only non-empty), return the text unchanged. @@ -97,6 +101,7 @@ def feed(self, token: str) -> Iterator[str]: # Tag complete? if self.skill_buf.endswith("]"): if SKILL_TAG_RE.search(self.skill_buf): + self.tags_resolved += 1 yield self._count(self._resolve(self.skill_buf)) else: yield self._count(self.skill_buf) @@ -126,6 +131,8 @@ def finish(self) -> Iterator[str]: """Flush any pending buffer at end-of-stream (resolve if it's a tag). Idempotent — a no-op once the buffer has been flushed.""" if self.skill_buf: + if SKILL_TAG_RE.search(self.skill_buf): + self.tags_resolved += 1 yield self._count(self._resolve(self.skill_buf)) self.skill_buf = "" self.buffering = False diff --git a/codec_llm.py b/codec_llm.py index 531540f..7a59b78 100644 --- a/codec_llm.py +++ b/codec_llm.py @@ -45,6 +45,17 @@ class LLMError(Exception): # (codec_session.qwen_stream) are unaffected. KEEPALIVE = object() +# Sentinels yielded by stream(error_sentinel=True) — 2026-07 chat-visibility +# fix. stream() never raises by contract, so before these existed a mid-reply +# connection drop / non-200 / read timeout was indistinguishable from a clean +# finish: the dashboard rendered an empty or silently-truncated bubble. +# STREAM_ERROR — the stream died abnormally (connect/HTTP/read error). 
+# FINISH_LENGTH — the model stopped at the max_tokens cap +# (finish_reason == "length"), i.e. the reply is truncated. +# Only yielded when error_sentinel=True; all existing callers are unaffected. +STREAM_ERROR = object() +FINISH_LENGTH = object() + def strip_think(text: str) -> str: """Remove reasoning blocks and surrounding whitespace.""" @@ -206,6 +217,7 @@ def stream( enable_thinking: bool = False, extra_kwargs: Optional[Dict[str, Any]] = None, keepalive: bool = False, + error_sentinel: bool = False, ) -> Iterator[Any]: """POST with `stream=True` and yield the RAW assistant content deltas in order. Centralizes the SSE plumbing: header/payload build (shared with @@ -222,6 +234,13 @@ def stream( `KEEPALIVE` sentinel every 10th empty (1st, 11th, …) so an SSE caller can emit a transport keepalive. Content-only callers leave it off and only ever see `str` deltas. + + `error_sentinel=True` (default off): yield `STREAM_ERROR` when the stream + dies abnormally (non-200, connect/read exception) and `FINISH_LENGTH` when + the model stops at the max_tokens cap — so a UI caller can tell the user + the reply was interrupted / truncated instead of rendering an empty or + silently-cut bubble. Callers that leave it off see the old behavior + (stream just ends).
""" import json as _json import requests @@ -242,6 +261,8 @@ def stream( if r.status_code != 200: log.warning("LLM stream %s returned %s: %s", url, r.status_code, getattr(r, "text", "")[:200]) + if error_sentinel: + yield STREAM_ERROR return for line in r.iter_lines(): if not line: @@ -254,8 +275,8 @@ def stream( if data.strip() == "[DONE]": return try: - delta = (_json.loads(data).get("choices", [{}])[0] - .get("delta", {}).get("content", "")) + choice = _json.loads(data).get("choices", [{}])[0] + delta = choice.get("delta", {}).get("content", "") except Exception as e: log.warning("LLM stream chunk parse failed: %s", e) continue @@ -265,8 +286,13 @@ def stream( _empty += 1 if _empty % 10 == 1: # 1st, 11th, 21st … (matches dashboard) yield KEEPALIVE + if error_sentinel and choice.get("finish_reason") == "length": + # Model hit the max_tokens cap — reply is truncated. + yield FINISH_LENGTH except Exception as e: log.warning("LLM stream call failed: %s", e) + if error_sentinel: + yield STREAM_ERROR return diff --git a/routes/chat.py b/routes/chat.py index c9652e7..0538527 100644 --- a/routes/chat.py +++ b/routes/chat.py @@ -753,9 +753,16 @@ async def _skill_stream(): # wins (matches the old chat_template_kwargs assignment after the update). _extra = {"top_p": 0.9, "frequency_penalty": 1.1, **{k: v for k, v in kwargs.items() if k != "chat_template_kwargs"}} + # 2026-07 chat-visibility fix: max_tokens + timeout are operator-tunable + # via ~/.codec/config.json:chat.{max_tokens, llm_timeout_s}. Note the + # cap includes reasoning tokens when thinking mode is on — deep answers + # that burn a lot of reasoning eat into the visible-reply budget.
+ _chat_cfg = config.get("chat", {}) if isinstance(config.get("chat"), dict) else {} _common = dict(base_url=base_url, model=model, api_key=api_key, - max_tokens=28000, temperature=0.7, enable_thinking=thinking, - extra_kwargs=_extra, timeout=300) + max_tokens=int(_chat_cfg.get("max_tokens", 28000)), + temperature=0.7, enable_thinking=thinking, + extra_kwargs=_extra, + timeout=float(_chat_cfg.get("llm_timeout_s", 300))) if stream_mode: # SSE streaming — keeps Cloudflare tunnel alive, sends tokens as they arrive @@ -807,24 +814,60 @@ def _resolve_skill_tag(raw_tag): buf = SkillTagBuffer(_resolve_skill_tag) try: # codec_llm.stream yields raw content deltas (it owns the SSE - # POST + data:/[DONE] parsing) and the KEEPALIVE sentinel on - # empty thinking-chunks (keepalive=True) to hold the tunnel. - for item in codec_llm.stream(messages, **_common, keepalive=True): + # POST + data:/[DONE] parsing), the KEEPALIVE sentinel on + # empty thinking-chunks (keepalive=True) to hold the tunnel, + # and — 2026-07 chat-visibility fix — STREAM_ERROR / + # FINISH_LENGTH sentinels so an interrupted or truncated + # reply is SAID to the user instead of silently rendering + # as an empty / mid-sentence bubble. + stream_died = False + hit_token_cap = False + for item in codec_llm.stream(messages, **_common, + keepalive=True, + error_sentinel=True): if item is codec_llm.KEEPALIVE: yield ": keepalive\n\n" continue + if item is codec_llm.STREAM_ERROR: + stream_died = True + continue + if item is codec_llm.FINISH_LENGTH: + hit_token_cap = True + continue for s in buf.feed(item): yield _frame(s) # Stream ended ([DONE] or close): flush, then blank-bubble net. for s in buf.finish(): yield _frame(s) - # Safety net: LLM emitted ONLY [SKILL:...] tags and we dropped - # them all → blank bubble; send a graceful fallback (2026-04-27). - if buf.visible_chars == 0: + if hit_token_cap: + yield _frame( + "\n\n⚠️ *Reply truncated — the model hit the " + "`chat.max_tokens` cap. 
Raise it in " + "`~/.codec/config.json` (chat → max_tokens) for " + "longer replies.*" + ) + if stream_died: yield _frame( - "I tried to use a tool that didn't apply here. " - "Could you rephrase, or just ask me to write it directly?" + "\n\n⚠️ *Reply interrupted — the connection to the " + "local model dropped mid-answer. Ask me to continue, " + "or retry. (If this repeats: `pm2 logs qwen3.6`.)*" ) + # Blank-bubble net. Distinguish the two empty cases + # (2026-07): dropped tool tags vs. the model producing + # nothing at all — the old single message blamed a "tool" + # even when the LLM was just down/overloaded. + if buf.visible_chars == 0 and not stream_died: + if buf.tags_resolved: + yield _frame( + "I tried to use a tool that didn't apply here. " + "Could you rephrase, or just ask me to write it directly?" + ) + else: + yield _frame( + "⚠️ The model returned an empty reply — it may be " + "busy, restarting, or out of context. Please try " + "again in a moment." + ) yield "data: [DONE]\n\n" except Exception as e: yield f"data: {json.dumps({'error': str(e)})}\n\n" diff --git a/tests/test_chat_stream.py b/tests/test_chat_stream.py index f067b59..f80dbf3 100644 --- a/tests/test_chat_stream.py +++ b/tests/test_chat_stream.py @@ -140,3 +140,27 @@ def resolve(tag): def test_skill_tag_re_matches(): m = SKILL_TAG_RE.search("prefix [SKILL:translate:hola] suffix") assert m and m.group(1) == "translate" and m.group(2) == "hola" + + +# ── tags_resolved counter (2026-07 chat-visibility fix) ────────────────────── + + +def test_tags_resolved_counts_complete_tags(): + buf = SkillTagBuffer(lambda tag: "RESULT") + out = list(buf.feed("before [SKILL:weather:paris] after")) + assert buf.tags_resolved == 1 + assert "RESULT" in "".join(out) + + +def test_tags_resolved_zero_for_plain_text(): + buf = SkillTagBuffer(lambda tag: "RESULT") + list(buf.feed("no tags here at all")) + list(buf.finish()) + assert buf.tags_resolved == 0 + + +def test_tags_resolved_counts_finish_flush(): + buf 
= SkillTagBuffer(lambda tag: "RESULT") + list(buf.feed("[SKILL:weather:paris]")) # complete in feed + list(buf.finish()) + assert buf.tags_resolved == 1 diff --git a/tests/test_llm_stream.py b/tests/test_llm_stream.py index 8df469d..fca2f7e 100644 --- a/tests/test_llm_stream.py +++ b/tests/test_llm_stream.py @@ -298,3 +298,61 @@ def fake_post(url, json=None, headers=None, timeout=None, stream=None): out = list(codec_llm.stream([{"role": "user", "content": "q"}], base_url="http://x/v1", model="m", keepalive=True)) assert out == [codec_llm.KEEPALIVE, codec_llm.KEEPALIVE] + + +# ── error_sentinel (2026-07 chat-visibility fix) ───────────────────────────── + + +def _sse_finish(reason): + return "data: " + json.dumps( + {"choices": [{"delta": {}, "finish_reason": reason}]}) + + +def test_stream_error_sentinel_on_non_200(monkeypatch): + monkeypatch.setattr("requests.post", + lambda *a, **kw: _StreamResp(503, text="busy")) + out = list(codec_llm.stream( + [{"role": "user", "content": "x"}], + base_url="http://x", model="m", error_sentinel=True)) + assert out == [codec_llm.STREAM_ERROR] + + +def test_stream_error_sentinel_on_exception(monkeypatch): + def _boom(*a, **kw): + raise ConnectionError("dropped") + monkeypatch.setattr("requests.post", _boom) + out = list(codec_llm.stream( + [{"role": "user", "content": "x"}], + base_url="http://x", model="m", error_sentinel=True)) + assert out == [codec_llm.STREAM_ERROR] + + +def test_stream_finish_length_sentinel(monkeypatch): + lines = [_sse("partial answer"), _sse_finish("length")] + monkeypatch.setattr("requests.post", + lambda *a, **kw: _StreamResp(200, lines)) + out = list(codec_llm.stream( + [{"role": "user", "content": "x"}], + base_url="http://x", model="m", error_sentinel=True)) + assert out == ["partial answer", codec_llm.FINISH_LENGTH] + + +def test_stream_clean_stop_no_sentinels(monkeypatch): + lines = [_sse("full answer"), _sse_finish("stop"), "data: [DONE]"] + monkeypatch.setattr("requests.post", + lambda *a, **kw: 
_StreamResp(200, lines)) + out = list(codec_llm.stream( + [{"role": "user", "content": "x"}], + base_url="http://x", model="m", error_sentinel=True)) + assert out == ["full answer"] + + +def test_stream_sentinels_off_by_default(monkeypatch): + """Existing callers (no error_sentinel) keep the old contract: errors and + length-stops just end the stream with no sentinel objects.""" + monkeypatch.setattr("requests.post", + lambda *a, **kw: _StreamResp(500, text="err")) + out = list(codec_llm.stream( + [{"role": "user", "content": "x"}], + base_url="http://x", model="m")) + assert out == []