Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion codec_chat.html
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@
.empty-state img{width:140px;opacity:.25;margin-bottom:20px;filter:drop-shadow(0 0 30px rgba(232,113,26,0.4))}
.empty-state p{color:var(--text-muted);font-size:16px;line-height:1.6}
pre{background:var(--bg4);padding:10px;border-radius:8px;overflow-x:auto;font-family:var(--mono);font-size:12px;margin:8px 0}
.code-wrap{position:relative;margin:8px 0}
.code-wrap pre{margin:0;padding-right:38px}
.code-copy{position:absolute;top:6px;right:6px;background:var(--bg2,rgba(0,0,0,.35));border:1px solid var(--bg4,#444);border-radius:6px;color:var(--text-dim,#9aa);padding:3px 6px;cursor:pointer;opacity:.65;transition:opacity .15s,color .15s,border-color .15s;line-height:0}
.code-copy:hover{opacity:1;color:var(--accent);border-color:var(--accent)}
.code-copy.copied{opacity:1;color:var(--success,#4caf50);border-color:var(--success,#4caf50)}
code{font-family:var(--mono);font-size:12px;background:var(--bg4);padding:2px 6px;border-radius:4px}
input[type=file]{display:none}

Expand Down Expand Up @@ -494,6 +499,12 @@ <h1><a href="/" style="color:inherit;text-decoration:none">CODEC</a></h1>
return ok;
}catch(e){return false}
}
function copyCodeBlock(btn){
  // Click handler for the per-code-block copy button. Looks up the <code>
  // element under the button's parent and hands its rendered text (minus a
  // single trailing newline) to copyMsgText, which owns the clipboard path
  // and the button's "copied" feedback. Does nothing when no <code> is found.
  var wrapper=btn.parentNode;
  var codeEl=wrapper.querySelector('code');
  if(!codeEl)return;
  // innerText yields the rendered text, so HTML entities are already decoded.
  var snippet=codeEl.innerText.replace(/\n$/,'');
  copyMsgText(snippet,btn);
}
function copyMsgText(text,btn){
function ok(){btn.classList.add('copied');showToast('Copied');setTimeout(function(){btn.classList.remove('copied')},1500)}
function fail(){if(_copyFallback(text)){ok()}else{showToast('Copy failed')}}
Expand Down Expand Up @@ -733,7 +744,7 @@ <h1><a href="/" style="color:inherit;text-decoration:none">CODEC</a></h1>
function escHtml(s){
  // HTML-escape a string by assigning it as the text content of a detached
  // <div> and reading back the serialized markup. Falsy input is treated as
  // the empty string.
  var holder=document.createElement('div');
  holder.textContent=s||'';
  return holder.innerHTML;
}
function formatMsg(c){
var f=escHtml(c);
f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'<pre><code>$2</code></pre>');
f=f.replace(/```(\w*)\n([\s\S]*?)```/g,'<div class="code-wrap"><button class="code-copy" onclick="copyCodeBlock(this)" title="Copy code"><svg class="ico ico-sm" viewBox="0 0 24 24"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 01-2-2V4a2 2 0 012-2h9a2 2 0 012 2v1"/></svg></button><pre><code>$2</code></pre></div>');
f=f.replace(/`([^`]+)`/g,'<code>$1</code>');
f=f.replace(/\*\*(.+?)\*\*/g,'<strong>$1</strong>');
f=f.replace(/\*(.+?)\*/g,'<em>$1</em>');
Expand Down
7 changes: 7 additions & 0 deletions codec_chat_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ def __init__(self, resolve_skill_tag: Callable[[str], str]):
self.skill_buf = ""
self.buffering = False
self.visible_chars = 0
# Count of complete [SKILL:...] tags handed to the resolver — lets the
# caller's blank-bubble fallback distinguish "all output was dropped
# tool tags" from "the model produced nothing at all" (2026-07 fix).
self.tags_resolved = 0

def _count(self, text: str) -> str:
"""Account visible chars (only non-empty), return the text unchanged.
Expand Down Expand Up @@ -97,6 +101,7 @@ def feed(self, token: str) -> Iterator[str]:
# Tag complete?
if self.skill_buf.endswith("]"):
if SKILL_TAG_RE.search(self.skill_buf):
self.tags_resolved += 1
yield self._count(self._resolve(self.skill_buf))
else:
yield self._count(self.skill_buf)
Expand Down Expand Up @@ -126,6 +131,8 @@ def finish(self) -> Iterator[str]:
"""Flush any pending buffer at end-of-stream (resolve if it's a tag).
Idempotent — a no-op once the buffer has been flushed."""
if self.skill_buf:
if SKILL_TAG_RE.search(self.skill_buf):
self.tags_resolved += 1
yield self._count(self._resolve(self.skill_buf))
self.skill_buf = ""
self.buffering = False
30 changes: 28 additions & 2 deletions codec_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ class LLMError(Exception):
# (codec_session.qwen_stream) are unaffected.
KEEPALIVE = object()

# Sentinels yielded by stream(error_sentinel=True) — 2026-07 chat-visibility
# fix. stream() never raises by contract, so before these existed a mid-reply
# connection drop / non-200 / read timeout was indistinguishable from a clean
# finish: the dashboard rendered an empty or silently-truncated bubble.
# STREAM_ERROR — the stream died abnormally (connect/HTTP/read error).
# FINISH_LENGTH — the model stopped at the max_tokens cap
# (finish_reason == "length"), i.e. the reply is truncated.
# Only yielded when error_sentinel=True; all existing callers are unaffected.
STREAM_ERROR = object()
FINISH_LENGTH = object()


def strip_think(text: str) -> str:
"""Remove <think>…</think> reasoning blocks and surrounding whitespace."""
Expand Down Expand Up @@ -206,6 +217,7 @@ def stream(
enable_thinking: bool = False,
extra_kwargs: Optional[Dict[str, Any]] = None,
keepalive: bool = False,
error_sentinel: bool = False,
) -> Iterator[Any]:
"""POST with `stream=True` and yield the RAW assistant content deltas in
order. Centralizes the SSE plumbing: header/payload build (shared with
Expand All @@ -222,6 +234,13 @@ def stream(
`KEEPALIVE` sentinel every 10th empty (1st, 11th, …) so an SSE caller can
emit a transport keepalive. Content-only callers leave it off and only ever
see `str` deltas.

`error_sentinel=True` (default off): yield `STREAM_ERROR` when the stream
dies abnormally (non-200, connect/read exception) and `FINISH_LENGTH` when
the model stops at the max_tokens cap — so a UI caller can tell the user
the reply was interrupted / truncated instead of rendering an empty or
silently-cut bubble. Callers that leave it off see the old behavior
(stream just ends).
"""
import json as _json
import requests
Expand All @@ -242,6 +261,8 @@ def stream(
if r.status_code != 200:
log.warning("LLM stream %s returned %s: %s",
url, r.status_code, getattr(r, "text", "")[:200])
if error_sentinel:
yield STREAM_ERROR
return
for line in r.iter_lines():
if not line:
Expand All @@ -254,8 +275,8 @@ def stream(
if data.strip() == "[DONE]":
return
try:
delta = (_json.loads(data).get("choices", [{}])[0]
.get("delta", {}).get("content", ""))
choice = _json.loads(data).get("choices", [{}])[0]
delta = choice.get("delta", {}).get("content", "")
except Exception as e:
log.warning("LLM stream chunk parse failed: %s", e)
continue
Expand All @@ -265,8 +286,13 @@ def stream(
_empty += 1
if _empty % 10 == 1: # 1st, 11th, 21st … (matches dashboard)
yield KEEPALIVE
if error_sentinel and choice.get("finish_reason") == "length":
# Model hit the max_tokens cap — reply is truncated.
yield FINISH_LENGTH
except Exception as e:
log.warning("LLM stream call failed: %s", e)
if error_sentinel:
yield STREAM_ERROR
return


Expand Down
63 changes: 53 additions & 10 deletions routes/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,9 +753,16 @@ async def _skill_stream():
# wins (matches the old chat_template_kwargs assignment after the update).
_extra = {"top_p": 0.9, "frequency_penalty": 1.1,
**{k: v for k, v in kwargs.items() if k != "chat_template_kwargs"}}
# 2026-07 chat-visibility fix: max_tokens + timeout are operator-tunable
# via ~/.codec/config.json:chat.{max_tokens, llm_timeout_s}. Note the
# cap includes <think> tokens when thinking mode is on — deep answers
# that burn a lot of reasoning eat into the visible-reply budget.
_chat_cfg = config.get("chat", {}) if isinstance(config.get("chat"), dict) else {}
_common = dict(base_url=base_url, model=model, api_key=api_key,
max_tokens=28000, temperature=0.7, enable_thinking=thinking,
extra_kwargs=_extra, timeout=300)
max_tokens=int(_chat_cfg.get("max_tokens", 28000)),
temperature=0.7, enable_thinking=thinking,
extra_kwargs=_extra,
timeout=float(_chat_cfg.get("llm_timeout_s", 300)))

if stream_mode:
# SSE streaming — keeps Cloudflare tunnel alive, sends tokens as they arrive
Expand Down Expand Up @@ -807,24 +814,60 @@ def _resolve_skill_tag(raw_tag):
buf = SkillTagBuffer(_resolve_skill_tag)
try:
# codec_llm.stream yields raw content deltas (it owns the SSE
# POST + data:/[DONE] parsing) and the KEEPALIVE sentinel on
# empty thinking-chunks (keepalive=True) to hold the tunnel.
for item in codec_llm.stream(messages, **_common, keepalive=True):
# POST + data:/[DONE] parsing), the KEEPALIVE sentinel on
# empty thinking-chunks (keepalive=True) to hold the tunnel,
# and — 2026-07 chat-visibility fix — STREAM_ERROR /
# FINISH_LENGTH sentinels so an interrupted or truncated
# reply is SAID to the user instead of silently rendering
# as an empty / mid-sentence bubble.
stream_died = False
hit_token_cap = False
for item in codec_llm.stream(messages, **_common,
keepalive=True,
error_sentinel=True):
if item is codec_llm.KEEPALIVE:
yield ": keepalive\n\n"
continue
if item is codec_llm.STREAM_ERROR:
stream_died = True
continue
if item is codec_llm.FINISH_LENGTH:
hit_token_cap = True
continue
for s in buf.feed(item):
yield _frame(s)
# Stream ended ([DONE] or close): flush, then blank-bubble net.
for s in buf.finish():
yield _frame(s)
# Safety net: LLM emitted ONLY [SKILL:...] tags and we dropped
# them all → blank bubble; send a graceful fallback (2026-04-27).
if buf.visible_chars == 0:
if hit_token_cap:
yield _frame(
"\n\n⚠️ *Reply truncated — the model hit the "
"`chat.max_tokens` cap. Raise it in "
"`~/.codec/config.json` (chat → max_tokens) for "
"longer replies.*"
)
if stream_died:
yield _frame(
"I tried to use a tool that didn't apply here. "
"Could you rephrase, or just ask me to write it directly?"
"\n\n⚠️ *Reply interrupted — the connection to the "
"local model dropped mid-answer. Ask me to continue, "
"or retry. (If this repeats: `pm2 logs qwen3.6`.)*"
)
# Blank-bubble net. Distinguish the two empty cases
# (2026-07): dropped tool tags vs. the model producing
# nothing at all — the old single message blamed a "tool"
# even when the LLM was just down/overloaded.
if buf.visible_chars == 0 and not stream_died:
if buf.tags_resolved:
yield _frame(
"I tried to use a tool that didn't apply here. "
"Could you rephrase, or just ask me to write it directly?"
)
else:
yield _frame(
"⚠️ The model returned an empty reply — it may be "
"busy, restarting, or out of context. Please try "
"again in a moment."
)
yield "data: [DONE]\n\n"
except Exception as e:
yield f"data: {json.dumps({'error': str(e)})}\n\n"
Expand Down
24 changes: 24 additions & 0 deletions tests/test_chat_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,27 @@ def resolve(tag):
def test_skill_tag_re_matches():
m = SKILL_TAG_RE.search("prefix [SKILL:translate:hola] suffix")
assert m and m.group(1) == "translate" and m.group(2) == "hola"


# ── tags_resolved counter (2026-07 chat-visibility fix) ──────────────────────


def test_tags_resolved_counts_complete_tags():
    """A complete [SKILL:...] tag seen by feed() is counted once and resolved."""
    buf = SkillTagBuffer(lambda tag: "RESULT")
    # Drain the generator fully so every side effect of feed() has happened
    # before the counter is inspected.
    out = list(buf.feed("before [SKILL:weather:paris] after"))
    assert buf.tags_resolved == 1
    # The resolver's return value must appear in the emitted text.
    assert "RESULT" in "".join(out)


def test_tags_resolved_zero_for_plain_text():
    """Text without any skill tag leaves tags_resolved at zero."""
    buf = SkillTagBuffer(lambda tag: "RESULT")
    # feed()/finish() are generators; list() forces them to run to completion.
    list(buf.feed("no tags here at all"))
    list(buf.finish())
    assert buf.tags_resolved == 0


def test_tags_resolved_counts_finish_flush():
    """finish() after a tag already completed in feed() must not count it again.

    The tag is fully resolved during feed(); the subsequent finish() flush is
    expected to leave the counter at exactly one.
    """
    buf = SkillTagBuffer(lambda tag: "RESULT")
    list(buf.feed("[SKILL:weather:paris]"))  # complete in feed
    list(buf.finish())
    assert buf.tags_resolved == 1
58 changes: 58 additions & 0 deletions tests/test_llm_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,61 @@ def fake_post(url, json=None, headers=None, timeout=None, stream=None):
out = list(codec_llm.stream([{"role": "user", "content": "q"}],
base_url="http://x/v1", model="m", keepalive=True))
assert out == [codec_llm.KEEPALIVE, codec_llm.KEEPALIVE]


# ── error_sentinel (2026-07 chat-visibility fix) ─────────────────────────────


def _sse_finish(reason):
return "data: " + json.dumps(
{"choices": [{"delta": {}, "finish_reason": reason}]})


def test_stream_error_sentinel_on_non_200(monkeypatch):
    """A non-200 response yields exactly one STREAM_ERROR when opted in."""
    monkeypatch.setattr("requests.post",
                        lambda *a, **kw: _StreamResp(503, text="busy"))
    out = list(codec_llm.stream(
        [{"role": "user", "content": "x"}],
        base_url="http://x", model="m", error_sentinel=True))
    assert out == [codec_llm.STREAM_ERROR]


def test_stream_error_sentinel_on_exception(monkeypatch):
    """An exception raised by the POST yields STREAM_ERROR instead of raising."""
    def _boom(*a, **kw):
        raise ConnectionError("dropped")

    monkeypatch.setattr("requests.post", _boom)
    out = list(codec_llm.stream(
        [{"role": "user", "content": "x"}],
        base_url="http://x", model="m", error_sentinel=True))
    assert out == [codec_llm.STREAM_ERROR]


def test_stream_finish_length_sentinel(monkeypatch):
    """finish_reason == "length" yields FINISH_LENGTH after the content delta."""
    lines = [_sse("partial answer"), _sse_finish("length")]
    monkeypatch.setattr("requests.post",
                        lambda *a, **kw: _StreamResp(200, lines))
    out = list(codec_llm.stream(
        [{"role": "user", "content": "x"}],
        base_url="http://x", model="m", error_sentinel=True))
    assert out == ["partial answer", codec_llm.FINISH_LENGTH]


def test_stream_clean_stop_no_sentinels(monkeypatch):
    """A normal "stop" finish followed by [DONE] yields only the content."""
    lines = [_sse("full answer"), _sse_finish("stop"), "data: [DONE]"]
    monkeypatch.setattr("requests.post",
                        lambda *a, **kw: _StreamResp(200, lines))
    out = list(codec_llm.stream(
        [{"role": "user", "content": "x"}],
        base_url="http://x", model="m", error_sentinel=True))
    assert out == ["full answer"]


def test_stream_sentinels_off_by_default(monkeypatch):
    """Existing callers (no error_sentinel) keep the old contract: errors and
    length-stops just end the stream with no sentinel objects."""
    # Error path: a non-200 response ends the stream with nothing yielded.
    monkeypatch.setattr("requests.post",
                        lambda *a, **kw: _StreamResp(500, text="err"))
    out = list(codec_llm.stream(
        [{"role": "user", "content": "x"}],
        base_url="http://x", model="m"))
    assert out == []

    # Length-stop path: the content delta is delivered, but the truncation
    # sentinel is withheld because the caller did not opt in.
    lines = [_sse("partial answer"), _sse_finish("length")]
    monkeypatch.setattr("requests.post",
                        lambda *a, **kw: _StreamResp(200, lines))
    out = list(codec_llm.stream(
        [{"role": "user", "content": "x"}],
        base_url="http://x", model="m"))
    assert out == ["partial answer"]