diff --git a/codec_chat.html b/codec_chat.html
index abef1cd..0091f23 100644
--- a/codec_chat.html
+++ b/codec_chat.html
@@ -168,6 +168,11 @@
.empty-state img{width:140px;opacity:.25;margin-bottom:20px;filter:drop-shadow(0 0 30px rgba(232,113,26,0.4))}
.empty-state p{color:var(--text-muted);font-size:16px;line-height:1.6}
pre{background:var(--bg4);padding:10px;border-radius:8px;overflow-x:auto;font-family:var(--mono);font-size:12px;margin:8px 0}
+.code-wrap{position:relative;margin:8px 0}
+.code-wrap pre{margin:0;padding-right:38px}
+.code-copy{position:absolute;top:6px;right:6px;background:var(--bg2,rgba(0,0,0,.35));border:1px solid var(--bg4,#444);border-radius:6px;color:var(--text-dim,#9aa);padding:3px 6px;cursor:pointer;opacity:.65;transition:opacity .15s,color .15s,border-color .15s;line-height:0}
+.code-copy:hover{opacity:1;color:var(--accent);border-color:var(--accent)}
+.code-copy.copied{opacity:1;color:var(--success,#4caf50);border-color:var(--success,#4caf50)}
code{font-family:var(--mono);font-size:12px;background:var(--bg4);padding:2px 6px;border-radius:4px}
input[type=file]{display:none}
@@ -494,6 +499,12 @@
return ok;
}catch(e){return false}
}
+function copyCodeBlock(btn){
+ // Per-code-block copy (2026-07): finds the first <code> under the button's
+ // parent, strips one trailing newline from its innerText, then calls copyMsgText.
+ var code=btn.parentNode.querySelector('code');
+ if(code)copyMsgText(code.innerText.replace(/\n$/,''),btn); // no <code> found → nothing happens
+}
function copyMsgText(text,btn){
function ok(){btn.classList.add('copied');showToast('Copied');setTimeout(function(){btn.classList.remove('copied')},1500)}
function fail(){if(_copyFallback(text)){ok()}else{showToast('Copy failed')}}
@@ -733,7 +744,7 @@
function escHtml(s){var d=document.createElement('div');d.textContent=s||'';return d.innerHTML}
function formatMsg(c){
var f=escHtml(c);
- f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'$2
');
+ f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'');
f=f.replace(/`([^`]+)`/g,'$1');
f=f.replace(/\*\*(.+?)\*\*/g,'$1');
f=f.replace(/\*(.+?)\*/g,'$1');
diff --git a/codec_chat_stream.py b/codec_chat_stream.py
index 7960d8c..5a9bb43 100644
--- a/codec_chat_stream.py
+++ b/codec_chat_stream.py
@@ -48,6 +48,10 @@ def __init__(self, resolve_skill_tag: Callable[[str], str]):
self.skill_buf = ""
self.buffering = False
self.visible_chars = 0
+ # Count of complete [SKILL:...] tags handed to the resolver — lets the
+ # caller's blank-bubble fallback distinguish "all output was dropped
+ # tool tags" from "the model produced nothing at all" (2026-07 fix).
+ self.tags_resolved = 0
def _count(self, text: str) -> str:
"""Account visible chars (only non-empty), return the text unchanged.
@@ -97,6 +101,7 @@ def feed(self, token: str) -> Iterator[str]:
# Tag complete?
if self.skill_buf.endswith("]"):
if SKILL_TAG_RE.search(self.skill_buf):
+ self.tags_resolved += 1
yield self._count(self._resolve(self.skill_buf))
else:
yield self._count(self.skill_buf)
@@ -126,6 +131,8 @@ def finish(self) -> Iterator[str]:
"""Flush any pending buffer at end-of-stream (resolve if it's a tag).
Idempotent — a no-op once the buffer has been flushed."""
if self.skill_buf:
+ if SKILL_TAG_RE.search(self.skill_buf):
+ self.tags_resolved += 1
yield self._count(self._resolve(self.skill_buf))
self.skill_buf = ""
self.buffering = False
diff --git a/codec_llm.py b/codec_llm.py
index 531540f..7a59b78 100644
--- a/codec_llm.py
+++ b/codec_llm.py
@@ -45,6 +45,17 @@ class LLMError(Exception):
# (codec_session.qwen_stream) are unaffected.
KEEPALIVE = object()
+# Sentinels yielded by stream(error_sentinel=True) — 2026-07 chat-visibility
+# fix. stream() never raises by contract, so before these existed a mid-reply
+# connection drop / non-200 / read timeout was indistinguishable from a clean
+# finish: the dashboard rendered an empty or silently-truncated bubble.
+# STREAM_ERROR — the stream died abnormally (connect/HTTP/read error).
+# FINISH_LENGTH — the model stopped at the max_tokens cap
+# (finish_reason == "length"), i.e. the reply is truncated.
+# Only yielded when error_sentinel=True; all existing callers are unaffected.
+STREAM_ERROR = object()
+FINISH_LENGTH = object()
+
def strip_think(text: str) -> str:
"""Remove <think>…</think> reasoning blocks and surrounding whitespace."""
@@ -206,6 +217,7 @@ def stream(
enable_thinking: bool = False,
extra_kwargs: Optional[Dict[str, Any]] = None,
keepalive: bool = False,
+ error_sentinel: bool = False,
) -> Iterator[Any]:
"""POST with `stream=True` and yield the RAW assistant content deltas in
order. Centralizes the SSE plumbing: header/payload build (shared with
@@ -222,6 +234,13 @@ def stream(
`KEEPALIVE` sentinel every 10th empty (1st, 11th, …) so an SSE caller can
emit a transport keepalive. Content-only callers leave it off and only ever
see `str` deltas.
+
+ `error_sentinel=True` (default off): yield `STREAM_ERROR` when the stream
+ dies abnormally (non-200, connect/read exception) and `FINISH_LENGTH` when
+ the model stops at the max_tokens cap — so a UI caller can tell the user
+ the reply was interrupted / truncated instead of rendering an empty or
+ silently-cut bubble. Callers that leave it off see the old behavior
+ (stream just ends).
"""
import json as _json
import requests
@@ -242,6 +261,8 @@ def stream(
if r.status_code != 200:
log.warning("LLM stream %s returned %s: %s",
url, r.status_code, getattr(r, "text", "")[:200])
+ if error_sentinel:
+ yield STREAM_ERROR
return
for line in r.iter_lines():
if not line:
@@ -254,8 +275,8 @@ def stream(
if data.strip() == "[DONE]":
return
try:
- delta = (_json.loads(data).get("choices", [{}])[0]
- .get("delta", {}).get("content", ""))
+ choice = _json.loads(data).get("choices", [{}])[0]
+ delta = choice.get("delta", {}).get("content", "")
except Exception as e:
log.warning("LLM stream chunk parse failed: %s", e)
continue
@@ -265,8 +286,13 @@ def stream(
_empty += 1
if _empty % 10 == 1: # 1st, 11th, 21st … (matches dashboard)
yield KEEPALIVE
+ if error_sentinel and choice.get("finish_reason") == "length":
+ # Model hit the max_tokens cap — reply is truncated.
+ yield FINISH_LENGTH
except Exception as e:
log.warning("LLM stream call failed: %s", e)
+ if error_sentinel:
+ yield STREAM_ERROR
return
diff --git a/routes/chat.py b/routes/chat.py
index c9652e7..0538527 100644
--- a/routes/chat.py
+++ b/routes/chat.py
@@ -753,9 +753,16 @@ async def _skill_stream():
# wins (matches the old chat_template_kwargs assignment after the update).
_extra = {"top_p": 0.9, "frequency_penalty": 1.1,
**{k: v for k, v in kwargs.items() if k != "chat_template_kwargs"}}
+ # 2026-07 chat-visibility fix: max_tokens + timeout are operator-tunable
+ # via ~/.codec/config.json:chat.{max_tokens, llm_timeout_s}. Note the
+ # cap includes <think> tokens when thinking mode is on — deep answers
+ # that burn a lot of reasoning eat into the visible-reply budget.
+ _chat_cfg = config.get("chat", {}) if isinstance(config.get("chat"), dict) else {}
_common = dict(base_url=base_url, model=model, api_key=api_key,
- max_tokens=28000, temperature=0.7, enable_thinking=thinking,
- extra_kwargs=_extra, timeout=300)
+ max_tokens=int(_chat_cfg.get("max_tokens", 28000)),
+ temperature=0.7, enable_thinking=thinking,
+ extra_kwargs=_extra,
+ timeout=float(_chat_cfg.get("llm_timeout_s", 300)))
if stream_mode:
# SSE streaming — keeps Cloudflare tunnel alive, sends tokens as they arrive
@@ -807,24 +814,60 @@ def _resolve_skill_tag(raw_tag):
buf = SkillTagBuffer(_resolve_skill_tag)
try:
# codec_llm.stream yields raw content deltas (it owns the SSE
- # POST + data:/[DONE] parsing) and the KEEPALIVE sentinel on
- # empty thinking-chunks (keepalive=True) to hold the tunnel.
- for item in codec_llm.stream(messages, **_common, keepalive=True):
+ # POST + data:/[DONE] parsing), the KEEPALIVE sentinel on
+ # empty thinking-chunks (keepalive=True) to hold the tunnel,
+ # and — 2026-07 chat-visibility fix — STREAM_ERROR /
+ # FINISH_LENGTH sentinels so an interrupted or truncated
+ # reply is SAID to the user instead of silently rendering
+ # as an empty / mid-sentence bubble.
+ stream_died = False
+ hit_token_cap = False
+ for item in codec_llm.stream(messages, **_common,
+ keepalive=True,
+ error_sentinel=True):
if item is codec_llm.KEEPALIVE:
yield ": keepalive\n\n"
continue
+ if item is codec_llm.STREAM_ERROR:
+ stream_died = True
+ continue
+ if item is codec_llm.FINISH_LENGTH:
+ hit_token_cap = True
+ continue
for s in buf.feed(item):
yield _frame(s)
# Stream ended ([DONE] or close): flush, then blank-bubble net.
for s in buf.finish():
yield _frame(s)
- # Safety net: LLM emitted ONLY [SKILL:...] tags and we dropped
- # them all → blank bubble; send a graceful fallback (2026-04-27).
- if buf.visible_chars == 0:
+ if hit_token_cap:
+ yield _frame(
+ "\n\n⚠️ *Reply truncated — the model hit the "
+ "`chat.max_tokens` cap. Raise it in "
+ "`~/.codec/config.json` (chat → max_tokens) for "
+ "longer replies.*"
+ )
+ if stream_died:
yield _frame(
- "I tried to use a tool that didn't apply here. "
- "Could you rephrase, or just ask me to write it directly?"
+ "\n\n⚠️ *Reply interrupted — the connection to the "
+ "local model dropped mid-answer. Ask me to continue, "
+ "or retry. (If this repeats: `pm2 logs qwen3.6`.)*"
)
+ # Blank-bubble net (2026-07). Three empty cases, three outcomes: dropped
+ # tool tags → "tool" fallback; nothing at all → "empty reply" notice;
+ # token cap / dead stream → stay silent here, because the warning frame
+ # yielded just above already explains why no visible text arrived.
+ if buf.visible_chars == 0 and not stream_died and not hit_token_cap:
+ if buf.tags_resolved:
+ yield _frame(
+ "I tried to use a tool that didn't apply here. "
+ "Could you rephrase, or just ask me to write it directly?"
+ )
+ else:
+ yield _frame(
+ "⚠️ The model returned an empty reply — it may be "
+ "busy, restarting, or out of context. Please try "
+ "again in a moment."
+ )
yield "data: [DONE]\n\n"
except Exception as e:
yield f"data: {json.dumps({'error': str(e)})}\n\n"
diff --git a/tests/test_chat_stream.py b/tests/test_chat_stream.py
index f067b59..f80dbf3 100644
--- a/tests/test_chat_stream.py
+++ b/tests/test_chat_stream.py
@@ -140,3 +140,27 @@ def resolve(tag):
def test_skill_tag_re_matches():
m = SKILL_TAG_RE.search("prefix [SKILL:translate:hola] suffix")
assert m and m.group(1) == "translate" and m.group(2) == "hola"
+
+
+# ── tags_resolved counter (2026-07 chat-visibility fix) ──────────────────────
+
+
+def test_tags_resolved_counts_complete_tags():  # a tag completed inside feed() bumps the counter once
+ buf = SkillTagBuffer(lambda tag: "RESULT")  # stub resolver: every tag resolves to "RESULT"
+ out = list(buf.feed("before [SKILL:weather:paris] after"))
+ assert buf.tags_resolved == 1
+ assert "RESULT" in "".join(out)  # the resolver's output is part of what feed() yields
+
+
+def test_tags_resolved_zero_for_plain_text():  # text with no [SKILL:...] tag leaves the counter at 0
+ buf = SkillTagBuffer(lambda tag: "RESULT")
+ list(buf.feed("no tags here at all"))  # list(...) drains the generator so feed() actually runs
+ list(buf.finish())  # flush as well, so both counting sites have had their chance
+ assert buf.tags_resolved == 0
+
+
+def test_tags_resolved_counts_finish_flush():  # NOTE(review): name suggests the finish()-time increment — see comments below
+ buf = SkillTagBuffer(lambda tag: "RESULT")
+ list(buf.feed("[SKILL:weather:paris]")) # tag completes (and is counted) inside feed()
+ list(buf.finish())  # so this pins "finish() adds no second count"; the finish()-time increment itself is not exercised here
+ assert buf.tags_resolved == 1
diff --git a/tests/test_llm_stream.py b/tests/test_llm_stream.py
index 8df469d..fca2f7e 100644
--- a/tests/test_llm_stream.py
+++ b/tests/test_llm_stream.py
@@ -298,3 +298,61 @@ def fake_post(url, json=None, headers=None, timeout=None, stream=None):
out = list(codec_llm.stream([{"role": "user", "content": "q"}],
base_url="http://x/v1", model="m", keepalive=True))
assert out == [codec_llm.KEEPALIVE, codec_llm.KEEPALIVE]
+
+
+# ── error_sentinel (2026-07 chat-visibility fix) ─────────────────────────────
+
+
+def _sse_finish(reason):  # build one SSE "data: " line whose single choice has an empty delta and finish_reason=reason
+ return "data: " + json.dumps(
+ {"choices": [{"delta": {}, "finish_reason": reason}]})
+
+
+def test_stream_error_sentinel_on_non_200(monkeypatch):  # non-200 response + error_sentinel=True → STREAM_ERROR
+ monkeypatch.setattr("requests.post",
+ lambda *a, **kw: _StreamResp(503, text="busy"))  # every POST answers HTTP 503 via the file's fake response helper
+ out = list(codec_llm.stream(
+ [{"role": "user", "content": "x"}],
+ base_url="http://x", model="m", error_sentinel=True))
+ assert out == [codec_llm.STREAM_ERROR]  # exactly one sentinel and no content deltas
+
+
+def test_stream_error_sentinel_on_exception(monkeypatch):  # exception raised by the POST + error_sentinel=True → STREAM_ERROR
+ def _boom(*a, **kw):  # stand-in for requests.post that always fails
+ raise ConnectionError("dropped")
+ monkeypatch.setattr("requests.post", _boom)
+ out = list(codec_llm.stream(
+ [{"role": "user", "content": "x"}],
+ base_url="http://x", model="m", error_sentinel=True))
+ assert out == [codec_llm.STREAM_ERROR]  # stream() reports the failure as a sentinel instead of raising
+
+
+def test_stream_finish_length_sentinel(monkeypatch):  # finish_reason == "length" + error_sentinel=True → FINISH_LENGTH
+ lines = [_sse("partial answer"), _sse_finish("length")]  # one content chunk, then a length-stop chunk; no [DONE] line
+ monkeypatch.setattr("requests.post",
+ lambda *a, **kw: _StreamResp(200, lines))
+ out = list(codec_llm.stream(
+ [{"role": "user", "content": "x"}],
+ base_url="http://x", model="m", error_sentinel=True))
+ assert out == ["partial answer", codec_llm.FINISH_LENGTH]  # content first, sentinel after it
+
+
+def test_stream_clean_stop_no_sentinels(monkeypatch):  # finish_reason == "stop" then [DONE] → content only
+ lines = [_sse("full answer"), _sse_finish("stop"), "data: [DONE]"]  # a normal, complete reply
+ monkeypatch.setattr("requests.post",
+ lambda *a, **kw: _StreamResp(200, lines))
+ out = list(codec_llm.stream(
+ [{"role": "user", "content": "x"}],
+ base_url="http://x", model="m", error_sentinel=True))
+ assert out == ["full answer"]  # no sentinel objects even though error_sentinel is on
+
+
+def test_stream_sentinels_off_by_default(monkeypatch):
+ """Existing callers (no error_sentinel) keep the old contract: errors and
+ length-stops just end the stream with no sentinel objects."""
+ monkeypatch.setattr("requests.post",
+ lambda *a, **kw: _StreamResp(500, text="err"))  # every POST answers HTTP 500
+ out = list(codec_llm.stream(
+ [{"role": "user", "content": "x"}],
+ base_url="http://x", model="m"))  # error_sentinel left at its default (False)
+ assert out == []  # NOTE(review): only the non-200 path is exercised here; the length-stop case from the docstring is not