AVADSA25 · AVADSA25 · Jul 2, 2026 · Jul 2, 2026
diff --git a/codec_chat.html b/codec_chat.html
@@ -168,6 +168,11 @@
 .empty-state img{width:140px;opacity:.25;margin-bottom:20px;filter:drop-shadow(0 0 30px rgba(232,113,26,0.4))}
 .empty-state p{color:var(--text-muted);font-size:16px;line-height:1.6}
 pre{background:var(--bg4);padding:10px;border-radius:8px;overflow-x:auto;font-family:var(--mono);font-size:12px;margin:8px 0}
+.code-wrap{position:relative;margin:8px 0}
+.code-wrap pre{margin:0;padding-right:38px}
+.code-copy{position:absolute;top:6px;right:6px;background:var(--bg2,rgba(0,0,0,.35));border:1px solid var(--bg4,#444);border-radius:6px;color:var(--text-dim,#9aa);padding:3px 6px;cursor:pointer;opacity:.65;transition:opacity .15s,color .15s,border-color .15s;line-height:0}
+.code-copy:hover{opacity:1;color:var(--accent);border-color:var(--accent)}
+.code-copy.copied{opacity:1;color:var(--success,#4caf50);border-color:var(--success,#4caf50)}
 code{font-family:var(--mono);font-size:12px;background:var(--bg4);padding:2px 6px;border-radius:4px}
 input[type=file]{display:none}
 
@@ -494,6 +499,12 @@ <h1><a href="/" style="color:inherit;text-decoration:none">CODEC</a></h1>
     return ok;
   }catch(e){return false}
 }
+function copyCodeBlock(btn){
+  // Per-block copy (2026-07): innerText yields the rendered, HTML-unescaped code; copyMsgText does the clipboard work.
+  var codeEl=btn.parentNode.querySelector('code');
+  if(!codeEl)return;
+  copyMsgText(codeEl.innerText.replace(/\n$/,''),btn);
+}
 function copyMsgText(text,btn){
   function ok(){btn.classList.add('copied');showToast('Copied');setTimeout(function(){btn.classList.remove('copied')},1500)}
   function fail(){if(_copyFallback(text)){ok()}else{showToast('Copy failed')}}
@@ -733,7 +744,7 @@ <h1><a href="/" style="color:inherit;text-decoration:none">CODEC</a></h1>
 function escHtml(s){var d=document.createElement('div');d.textContent=s||'';return d.innerHTML}
 function formatMsg(c){
   var f=escHtml(c);
-  f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'<pre><code>$2</code></pre>');
+  f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'<div class="code-wrap"><button class="code-copy" onclick="copyCodeBlock(this)" title="Copy code"><svg class="ico ico-sm" viewBox="0 0 24 24"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 01-2-2V4a2 2 0 012-2h9a2 2 0 012 2v1"/></svg></button><pre><code>$2</code></pre></div>');
   f=f.replace(/`([^`]+)`/g,'<code>$1</code>');
   f=f.replace(/\*\*(.+?)\*\*/g,'<strong>$1</strong>');
   f=f.replace(/\*(.+?)\*/g,'<em>$1</em>');

diff --git a/codec_chat_stream.py b/codec_chat_stream.py
@@ -48,6 +48,10 @@ def __init__(self, resolve_skill_tag: Callable[[str], str]):
         self.skill_buf = ""
         self.buffering = False
         self.visible_chars = 0
+        # Count of complete [SKILL:...] tags handed to the resolver — lets the
+        # caller's blank-bubble fallback distinguish "all output was dropped
+        # tool tags" from "the model produced nothing at all" (2026-07 fix).
+        self.tags_resolved = 0
 
     def _count(self, text: str) -> str:
         """Account visible chars (only non-empty), return the text unchanged.
@@ -97,6 +101,7 @@ def feed(self, token: str) -> Iterator[str]:
                 # Tag complete?
                 if self.skill_buf.endswith("]"):
                     if SKILL_TAG_RE.search(self.skill_buf):
+                        self.tags_resolved += 1
                         yield self._count(self._resolve(self.skill_buf))
                     else:
                         yield self._count(self.skill_buf)
@@ -126,6 +131,8 @@ def finish(self) -> Iterator[str]:
         """Flush any pending buffer at end-of-stream (resolve if it's a tag).
         Idempotent — a no-op once the buffer has been flushed."""
         if self.skill_buf:
+            if SKILL_TAG_RE.search(self.skill_buf):
+                self.tags_resolved += 1
             yield self._count(self._resolve(self.skill_buf))
             self.skill_buf = ""
             self.buffering = False
diff --git a/codec_llm.py b/codec_llm.py
@@ -45,6 +45,17 @@ class LLMError(Exception):
 # (codec_session.qwen_stream) are unaffected.
 KEEPALIVE = object()
 
+# Sentinels yielded by stream(error_sentinel=True) — 2026-07 chat-visibility
+# fix. stream() never raises by contract, so before these existed a mid-reply
+# connection drop / non-200 / read timeout was indistinguishable from a clean
+# finish: the dashboard rendered an empty or silently-truncated bubble.
+#   STREAM_ERROR   — the stream died abnormally (connect/HTTP/read error).
+#   FINISH_LENGTH  — the model stopped at the max_tokens cap
+#                    (finish_reason == "length"), i.e. the reply is truncated.
+# Only yielded when error_sentinel=True; all existing callers are unaffected.
+STREAM_ERROR = object()
+FINISH_LENGTH = object()
+
 
 def strip_think(text: str) -> str:
     """Remove <think>…</think> reasoning blocks and surrounding whitespace."""
@@ -206,6 +217,7 @@ def stream(
     enable_thinking: bool = False,
     extra_kwargs: Optional[Dict[str, Any]] = None,
     keepalive: bool = False,
+    error_sentinel: bool = False,
 ) -> Iterator[Any]:
     """POST with `stream=True` and yield the RAW assistant content deltas in
     order. Centralizes the SSE plumbing: header/payload build (shared with
@@ -222,6 +234,13 @@ def stream(
     `KEEPALIVE` sentinel every 10th empty (1st, 11th, …) so an SSE caller can
     emit a transport keepalive. Content-only callers leave it off and only ever
     see `str` deltas.
+
+    `error_sentinel=True` (default off): yield `STREAM_ERROR` when the stream
+    dies abnormally (non-200, connect/read exception) and `FINISH_LENGTH` when
+    the model stops at the max_tokens cap — so a UI caller can tell the user
+    the reply was interrupted / truncated instead of rendering an empty or
+    silently-cut bubble. Callers that leave it off see the old behavior
+    (stream just ends).
     """
     import json as _json
     import requests
@@ -242,6 +261,8 @@ def stream(
             if r.status_code != 200:
                 log.warning("LLM stream %s returned %s: %s",
                             url, r.status_code, getattr(r, "text", "")[:200])
+                if error_sentinel:
+                    yield STREAM_ERROR
                 return
             for line in r.iter_lines():
                 if not line:
@@ -254,8 +275,8 @@ def stream(
                 if data.strip() == "[DONE]":
                     return
                 try:
-                    delta = (_json.loads(data).get("choices", [{}])[0]
-                             .get("delta", {}).get("content", ""))
+                    choice = _json.loads(data).get("choices", [{}])[0]
+                    delta = choice.get("delta", {}).get("content", "")
                 except Exception as e:
                     log.warning("LLM stream chunk parse failed: %s", e)
                     continue
@@ -265,8 +286,13 @@ def stream(
                     _empty += 1
                     if _empty % 10 == 1:   # 1st, 11th, 21st … (matches dashboard)
                         yield KEEPALIVE
+                if error_sentinel and choice.get("finish_reason") == "length":
+                    # Model hit the max_tokens cap — reply is truncated.
+                    yield FINISH_LENGTH
     except Exception as e:
         log.warning("LLM stream call failed: %s", e)
+        if error_sentinel:
+            yield STREAM_ERROR
         return
 
 

diff --git a/routes/chat.py b/routes/chat.py
@@ -753,9 +753,16 @@ async def _skill_stream():
         # wins (matches the old chat_template_kwargs assignment after the update).
         _extra = {"top_p": 0.9, "frequency_penalty": 1.1,
                   **{k: v for k, v in kwargs.items() if k != "chat_template_kwargs"}}
+        # 2026-07 chat-visibility fix: max_tokens + timeout are operator-tunable
+        # via ~/.codec/config.json:chat.{max_tokens, llm_timeout_s}. Note the
+        # cap includes <think> tokens when thinking mode is on — deep answers
+        # that burn a lot of reasoning eat into the visible-reply budget.
+        _chat_cfg = config.get("chat", {}) if isinstance(config.get("chat"), dict) else {}
         _common = dict(base_url=base_url, model=model, api_key=api_key,
-                       max_tokens=28000, temperature=0.7, enable_thinking=thinking,
-                       extra_kwargs=_extra, timeout=300)
+                       max_tokens=int(_chat_cfg.get("max_tokens", 28000)),
+                       temperature=0.7, enable_thinking=thinking,
+                       extra_kwargs=_extra,
+                       timeout=float(_chat_cfg.get("llm_timeout_s", 300)))
 
         if stream_mode:
             # SSE streaming — keeps Cloudflare tunnel alive, sends tokens as they arrive
@@ -807,24 +814,60 @@ def _resolve_skill_tag(raw_tag):
                 buf = SkillTagBuffer(_resolve_skill_tag)
                 try:
                     # codec_llm.stream yields raw content deltas (it owns the SSE
-                    # POST + data:/[DONE] parsing) and the KEEPALIVE sentinel on
-                    # empty thinking-chunks (keepalive=True) to hold the tunnel.
-                    for item in codec_llm.stream(messages, **_common, keepalive=True):
+                    # POST + data:/[DONE] parsing), the KEEPALIVE sentinel on
+                    # empty thinking-chunks (keepalive=True) to hold the tunnel,
+                    # and — 2026-07 chat-visibility fix — STREAM_ERROR /
+                    # FINISH_LENGTH sentinels so an interrupted or truncated
+                    # reply is SAID to the user instead of silently rendering
+                    # as an empty / mid-sentence bubble.
+                    stream_died = False
+                    hit_token_cap = False
+                    for item in codec_llm.stream(messages, **_common,
+                                                 keepalive=True,
+                                                 error_sentinel=True):
                         if item is codec_llm.KEEPALIVE:
                             yield ": keepalive\n\n"
                             continue
+                        if item is codec_llm.STREAM_ERROR:
+                            stream_died = True
+                            continue
+                        if item is codec_llm.FINISH_LENGTH:
+                            hit_token_cap = True
+                            continue
                         for s in buf.feed(item):
                             yield _frame(s)
                     # Stream ended ([DONE] or close): flush, then blank-bubble net.
                     for s in buf.finish():
                         yield _frame(s)
-                    # Safety net: LLM emitted ONLY [SKILL:...] tags and we dropped
-                    # them all → blank bubble; send a graceful fallback (2026-04-27).
-                    if buf.visible_chars == 0:
+                    if hit_token_cap:
+                        yield _frame(
+                            "\n\n⚠️ *Reply truncated — the model hit the "
+                            "`chat.max_tokens` cap. Raise it in "
+                            "`~/.codec/config.json` (chat → max_tokens) for "
+                            "longer replies.*"
+                        )
+                    if stream_died:
                         yield _frame(
-                            "I tried to use a tool that didn't apply here. "
-                            "Could you rephrase, or just ask me to write it directly?"
+                            "\n\n⚠️ *Reply interrupted — the connection to the "
+                            "local model dropped mid-answer. Ask me to continue, "
+                            "or retry. (If this repeats: `pm2 logs qwen3.6`.)*"
                         )
+                    # Blank-bubble net (2026-07): split "dropped tool tags" from
+                    # "model produced nothing" — the old single message blamed a
+                    # tool even when the LLM was down. Skipped when a notice above
+                    # (interrupted / truncated) already explains the empty bubble.
+                    if buf.visible_chars == 0 and not (stream_died or hit_token_cap):
+                        if buf.tags_resolved:
+                            yield _frame(
+                                "I tried to use a tool that didn't apply here. "
+                                "Could you rephrase, or just ask me to write it directly?"
+                            )
+                        else:
+                            yield _frame(
+                                "⚠️ The model returned an empty reply — it may be "
+                                "busy, restarting, or out of context. Please try "
+                                "again in a moment."
+                            )
                     yield "data: [DONE]\n\n"
                 except Exception as e:
                     yield f"data: {json.dumps({'error': str(e)})}\n\n"

diff --git a/tests/test_chat_stream.py b/tests/test_chat_stream.py
@@ -140,3 +140,27 @@ def resolve(tag):
 def test_skill_tag_re_matches():
     m = SKILL_TAG_RE.search("prefix [SKILL:translate:hola] suffix")
     assert m and m.group(1) == "translate" and m.group(2) == "hola"
+
+
+# ── tags_resolved counter (2026-07 chat-visibility fix) ──────────────────────
+
+
+def test_tags_resolved_counts_complete_tags():
+    """One complete tag inside a single chunk is counted once and resolved."""
+    tag_buffer = SkillTagBuffer(lambda tag: "RESULT")
+    emitted = "".join(tag_buffer.feed("before [SKILL:weather:paris] after"))
+    assert tag_buffer.tags_resolved == 1 and "RESULT" in emitted
+
+
+def test_tags_resolved_zero_for_plain_text():
+    """Tag-free text leaves the counter at zero through feed and finish."""
+    tag_buffer = SkillTagBuffer(lambda tag: "RESULT")
+    _ = [*tag_buffer.feed("no tags here at all"), *tag_buffer.finish()]
+    assert tag_buffer.tags_resolved == 0
+
+
+def test_tags_resolved_counts_finish_flush():
+    tag_buffer = SkillTagBuffer(lambda tag: "RESULT")
+    list(tag_buffer.feed("[SKILL:weather:paris]"))  # tag completes inside feed()
+    list(tag_buffer.finish())  # trailing flush must not count that tag again
+    assert tag_buffer.tags_resolved == 1
diff --git a/tests/test_llm_stream.py b/tests/test_llm_stream.py
@@ -298,3 +298,61 @@ def fake_post(url, json=None, headers=None, timeout=None, stream=None):
     out = list(codec_llm.stream([{"role": "user", "content": "q"}],
                                 base_url="http://x/v1", model="m", keepalive=True))
     assert out == [codec_llm.KEEPALIVE, codec_llm.KEEPALIVE]
+
+
+# ── error_sentinel (2026-07 chat-visibility fix) ─────────────────────────────
+
+
+def _sse_finish(reason):
+    final_choice = {"delta": {}, "finish_reason": reason}  # empty delta, reason only
+    return "data: " + json.dumps({"choices": [final_choice]})
+
+
+def test_stream_error_sentinel_on_non_200(monkeypatch):
+    """Opted-in callers get exactly one STREAM_ERROR for a non-200 reply."""
+    monkeypatch.setattr(
+        "requests.post", lambda *args, **kwargs: _StreamResp(503, text="busy"))
+    opts = {"base_url": "http://x", "model": "m", "error_sentinel": True}
+    events = codec_llm.stream([{"role": "user", "content": "x"}], **opts)
+    assert list(events) == [codec_llm.STREAM_ERROR]
+
+
+def test_stream_error_sentinel_on_exception(monkeypatch):
+    """An exception raised by the POST surfaces as a single STREAM_ERROR."""
+    def raise_dropped(*args, **kwargs):
+        raise ConnectionError("dropped")
+    monkeypatch.setattr("requests.post", raise_dropped)
+    opts = {"base_url": "http://x", "model": "m", "error_sentinel": True}
+    events = codec_llm.stream([{"role": "user", "content": "x"}], **opts)
+    assert list(events) == [codec_llm.STREAM_ERROR]
+
+
+def test_stream_finish_length_sentinel(monkeypatch):
+    """finish_reason "length" appends FINISH_LENGTH after the streamed text."""
+    sse_lines = [_sse("partial answer"), _sse_finish("length")]
+    monkeypatch.setattr(
+        "requests.post", lambda *args, **kwargs: _StreamResp(200, sse_lines))
+    opts = {"base_url": "http://x", "model": "m", "error_sentinel": True}
+    events = codec_llm.stream([{"role": "user", "content": "x"}], **opts)
+    assert list(events) == ["partial answer", codec_llm.FINISH_LENGTH]
+
+
+def test_stream_clean_stop_no_sentinels(monkeypatch):
+    """A "stop" finish followed by [DONE] yields the text and no sentinel."""
+    sse_lines = [_sse("full answer"), _sse_finish("stop"), "data: [DONE]"]
+    monkeypatch.setattr(
+        "requests.post", lambda *args, **kwargs: _StreamResp(200, sse_lines))
+    opts = {"base_url": "http://x", "model": "m", "error_sentinel": True}
+    events = codec_llm.stream([{"role": "user", "content": "x"}], **opts)
+    assert list(events) == ["full answer"]
+
+
+def test_stream_sentinels_off_by_default(monkeypatch):
+    """Without error_sentinel, errors and length-stops yield no sentinels."""
+    cut = [_sse("partial answer"), _sse_finish("length")]
+    for resp, want in ((_StreamResp(500, text="err"), []),
+                       (_StreamResp(200, cut), ["partial answer"])):
+        monkeypatch.setattr("requests.post", lambda *a, _r=resp, **kw: _r)
+        out = codec_llm.stream([{"role": "user", "content": "x"}],
+                               base_url="http://x", model="m")
+        assert list(out) == want