diff --git a/apps/memos-local-plugin/core/retrieval/llm-filter.ts b/apps/memos-local-plugin/core/retrieval/llm-filter.ts index 15868fb68..2d1f77180 100644 --- a/apps/memos-local-plugin/core/retrieval/llm-filter.ts +++ b/apps/memos-local-plugin/core/retrieval/llm-filter.ts @@ -24,12 +24,32 @@ import type { LlmClient } from "../llm/index.js"; import type { Logger } from "../logger/types.js"; import { RETRIEVAL_FILTER_PROMPT } from "../llm/prompts/index.js"; import type { RankedCandidate } from "./ranker.js"; -import type { RetrievalConfig } from "./types.js"; +import type { RetrievalConfig, TraceCandidate } from "./types.js"; const DEFAULT_CANDIDATE_BODY_CHARS = 500; const MIN_FILTER_OUTPUT_TOKENS = 160; const MAX_FILTER_OUTPUT_TOKENS = 2048; +/** + * A trace whose `agentText` falls under this length, with no LLM summary + * or reflection to back it up, is treated as a near-duplicate question + * trace (issue #1913). The rescue path keeps these *behind* informative + * candidates so the answer-bearing trace surfaces first. + */ +const INFORMATIVE_AGENT_TEXT_MIN_CHARS = 20; + +/** + * Short acknowledgement / scaffold replies that the filter prompt + * rightly classes as "scaffolding chatter". When the LLM filter empties + * the kept set we still need to make a rescue call — these strings let + * us prefer informative replies over plain acks. Bounded list, exact + * matches only after trimming surrounding punctuation / whitespace. + */ +const SHORT_ACK_PATTERNS: readonly RegExp[] = [ + /^(ok|okay|sure|got it|noted|understood|alright|will do|copy|copy that|thanks|thank you|✓|✅|👍)[\s.!]*$/i, + /^(记住了|已记住|已经记住|好的|明白|收到|了解|谢谢)[\s。!]*$/, +]; + export interface FilterInput { query: string; ranked: readonly RankedCandidate[]; @@ -70,6 +90,12 @@ export interface FilterResult { | "deferred_to_final" | "llm_kept_all" | "llm_filtered" + // The LLM returned an empty selection over a non-empty ranked list + // (issue #1913 — repeated question traces crowding the hit set). + // We rescued the top-K best-scoring candidates so the agent always + // sees a packet when retrieval succeeded; `sufficient` is forced + // to `false` so downstream callers know the injection is weak. + | "llm_filtered_refilled" // The LLM was supposed to run but the call failed / parsed badly. // We applied a mechanical relevance cutoff (top-K above // `relativeThresholdFloor · topRelevance`) instead of dumping the @@ -169,15 +195,18 @@ ${list}`, ); const keepIndices = new Set(cappedIndices); if (keepIndices.size === 0) { - // Model asked us to drop everything — honoured. Surface this - // explicitly so the Logs page can show "LLM found nothing - // relevant" instead of silently injecting a partial packet. - return { - kept: [], - dropped: [...ranked], - outcome: "llm_filtered", - sufficient: sufficient ?? false, - }; + // Issue #1913: the model asked us to drop everything. Honouring + // that verbatim used to collapse `turn.start` injection to "" even + // when retrieval was healthy — the failure mode is a hit set + // dominated by near-duplicate question traces from prior + // sessions, where each candidate individually looks like + // "surface-similar wrong sub-problem" to the filter prompt. + // Instead, rescue the top-K best-scoring candidates (preferring + // informative traces over pure-question / ack-only chatter) so + // the agent always sees a packet when retrieval succeeded. + // `safeCutoff`'s sibling escape hatch (`llmFilterMaxKeep === 0`) + // is honoured so operators can still ask for hard drop. + return rescueFromEmptySelection(ranked, deps, sufficient); } const kept = cappedIndices.map((i) => ranked[i]!); const dropped: RankedCandidate[] = []; @@ -214,6 +243,95 @@ function passthrough( return { kept: [...ranked], dropped: [], outcome, sufficient: null }; } +/** + * Issue #1913 rescue path. Invoked when the LLM relevance filter + * returned `selected: []` for a *non-empty* ranked candidate list — the + * most common cause is a hit set dominated by near-duplicate question + * traces from previous sessions, where the filter prompt's "drop + * scaffolding chatter" / "drop surface-similar wrong sub-problem" + * rubric is applied to every candidate. + * + * Strategy: keep the top-K best-scoring candidates, preferring + * informative traces (skill / episode / experience / world-model, or a + * trace whose `agentText`/`summary`/`reflection` carries real content) + * over pure-question chatter. We do NOT re-query the LLM — the rescue + * is a single O(n) partition + slice. Outcome label is + * `"llm_filtered_refilled"` so the Logs viewer can show "LLM collapsed, + * safety net fired" distinct from a normal `"llm_filtered"`. + * + * Escape hatch: `llmFilterMaxKeep === 0` skips the rescue entirely and + * honours the "drop everything" request (matches existing `safeCutoff` + * semantics for the same config value). + */ +function rescueFromEmptySelection( + ranked: readonly RankedCandidate[], + deps: FilterDeps, + sufficient: boolean | null, +): FilterResult { + const keepCap = Math.max(0, deps.config.llmFilterMaxKeep); + if (keepCap === 0 || ranked.length === 0) { + return { + kept: [], + dropped: [...ranked], + outcome: "llm_filtered", + sufficient: sufficient ?? false, + }; + } + const informative: RankedCandidate[] = []; + const chatter: RankedCandidate[] = []; + for (const r of ranked) { + if (isInformativeCandidate(r)) informative.push(r); + else chatter.push(r); + } + // Preserve ranker order within each bucket; informative first so the + // answer-bearing trace surfaces even when the ranker placed it below + // surface-similar question traces. + const ordered = [...informative, ...chatter]; + const kept = ordered.slice(0, Math.min(keepCap, ordered.length)); + const keptSet = new Set(kept); + const dropped = ranked.filter((r) => !keptSet.has(r)); + deps.log.debug("llm_filter.collapsed_refill", { + ranked: ranked.length, + rescued: kept.length, + informative: informative.length, + chatter: chatter.length, + filteredAll: true, + }); + return { + kept, + dropped, + outcome: "llm_filtered_refilled", + sufficient: sufficient ?? false, + }; +} + +/** + * Returns true when a ranked candidate carries content the agent can + * actually use. Skills, episodes, experiences, and world-models always + * count. Traces count when their `summary` or `reflection` is non-empty + * or their `agentText` is longer than a short acknowledgement. + * + * Used by the rescue path (and intentionally only there) to bias the + * rescued set toward traces with informative assistant text. False + * negatives (an informative trace mistakenly labelled chatter) still + * get rescued because they sit in the second half of the ordered list. + */ +function isInformativeCandidate(r: RankedCandidate): boolean { + const c = r.candidate; + if (c.refKind !== "trace") return true; + const t = c as TraceCandidate; + if ((t.summary?.trim().length ?? 0) > 0) return true; + if ((t.reflection?.trim().length ?? 0) > 0) return true; + const agent = t.agentText?.trim() ?? ""; + if (agent.length === 0) return false; + if (isShortAck(agent)) return false; + return agent.length >= INFORMATIVE_AGENT_TEXT_MIN_CHARS; +} + +function isShortAck(text: string): boolean { + return SHORT_ACK_PATTERNS.some((re) => re.test(text)); +} + /** * Mechanical fail-closed: when the LLM is unavailable / errored, * apply a relative-relevance cutoff so we don't dump the entire ranked diff --git a/apps/memos-local-plugin/core/retrieval/types.ts b/apps/memos-local-plugin/core/retrieval/types.ts index ae1b6f944..29abb358d 100644 --- a/apps/memos-local-plugin/core/retrieval/types.ts +++ b/apps/memos-local-plugin/core/retrieval/types.ts @@ -737,6 +737,7 @@ export interface RetrievalStats { | "deferred_to_final" | "llm_kept_all" | "llm_filtered" + | "llm_filtered_refilled" | "llm_failed_safe_cutoff"; llmFilterSufficient?: boolean; llmFilterKept?: number; diff --git a/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts index 3d2fb0049..09e607c92 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts @@ -402,6 +402,45 @@ describe("retrieval/integration", () => { expect(res.stats.emptyPacket).toBe(false); }); + it("turn_start rescues injection when LLM filter empties the kept set (#1913)", async () => { + // Repro for issue #1913: when the LLM relevance filter returns + // `selected: []` for a non-empty ranked list (the case where the + // top hits are all near-duplicate question traces), the packet + // used to collapse to an empty injection. The rescue path keeps + // the top-K best-scoring candidates so the agent still gets a + // packet, and surfaces `llm_filtered_refilled` so the Logs viewer + // can show the safety net fired. + const llm: any = { + completeJson: async () => ({ + value: { selected: [], sufficient: false }, + servedBy: "fake", + }), + }; + const res = await turnStartRetrieve( + { + ...makeDeps(handle), + llm, + config: { + ...makeDeps(handle).config, + llmFilterEnabled: true, + llmFilterMinCandidates: 1, + }, + }, + { + reason: "turn_start", + agent: "openclaw", + sessionId: "s_current" as SessionId, + userText: "run docker compose", + ts: NOW as never, + }, + ); + + expect(res.packet.snippets.length).toBeGreaterThan(0); + expect(res.packet.rendered.length).toBeGreaterThan(0); + expect(res.stats.llmFilterOutcome).toBe("llm_filtered_refilled"); + expect(res.stats.emptyPacket).toBe(false); + }); + it("skill_invoke is tier1-heavy", async () => { const res = await skillInvokeRetrieve(makeDeps(handle), { reason: "skill_invoke", diff --git a/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts index ddb58c604..2355732d7 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts @@ -57,6 +57,33 @@ function trace(id: string, score: number): RankedCandidate { }; } +/** + * Build a trace candidate with no LLM-generated summary / reflection + * and only a short acknowledgement-style `agentText` — i.e. exactly the + * "near-duplicate question trace" shape from issue #1913 where the + * user re-asked a stored fact across multiple sessions and the + * assistant only acked it. The current LLM filter prompt classes these + * as "scaffolding chatter" and is allowed to drop them all. + */ +function chatterTrace( + id: string, + score: number, + overrides: { agentText?: string; userText?: string } = {}, +): RankedCandidate { + const r = trace(id, score); + const cand = r.candidate as TraceCandidate; + return { + ...r, + candidate: { + ...cand, + userText: overrides.userText ?? `What does HERMES_REAL_E2E_1910 mean? (${id})`, + agentText: overrides.agentText ?? "OK", + summary: null, + reflection: null, + }, + }; +} + describe("retrieval/llm-filter", () => { it("disabled → passthrough with null sufficient", async () => { const result = await llmFilterCandidates( @@ -135,7 +162,100 @@ describe("retrieval/llm-filter", () => { expect(result.sufficient).toBe(true); }); - it("LLM returns empty selection → drops everything and marks insufficient", async () => { + it("LLM returns empty selection over non-empty ranked → rescues top-K informative candidates (#1913)", async () => { + // Issue #1913 repro shape: 3 near-duplicate question traces from + // previous sessions plus 1 answer-bearing trace. Previous filter + // honoured `selected: []` and collapsed to empty injection. + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { selected: [], sufficient: false }, + servedBy: "fake", + }), + }; + const ranked = [ + chatterTrace("q1", 0.95), + chatterTrace("q2", 0.94), + // Answer-bearing trace: long informative agentText, even though + // the ranker placed it below the question duplicates. + (() => { + const r = trace("answer", 0.85); + const c = r.candidate as TraceCandidate; + return { + ...r, + candidate: { + ...c, + userText: + "Remember this fact: HERMES_REAL_E2E_1910 means the bridge leak fix verification.", + agentText: "Noted. HERMES_REAL_E2E_1910 → bridge leak fix verification.", + summary: + "HERMES_REAL_E2E_1910 marks the bridge-leak fix verification.", + reflection: null, + }, + }; + })(), + chatterTrace("q3", 0.8), + ]; + const result = await llmFilterCandidates( + { query: "What does HERMES_REAL_E2E_1910 mean?", ranked }, + { llm, log, config: cfg }, + ); + expect(result.outcome).toBe("llm_filtered_refilled"); + expect(result.kept.length).toBeGreaterThanOrEqual(1); + expect(result.kept[0]!.candidate.refId).toBe("answer"); + expect(result.sufficient).toBe(false); + // Strong negative assertion: the bug returned kept=[], dropped=ranked. + expect(result.dropped.length).toBeLessThan(ranked.length); + }); + + it("rescue fires even when every ranked candidate is short-ack chatter", async () => { + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { selected: [], sufficient: false }, + servedBy: "fake", + }), + }; + const ranked = [ + chatterTrace("q1", 0.9, { agentText: "OK" }), + chatterTrace("q2", 0.85, { agentText: "记住了" }), + chatterTrace("q3", 0.8, { agentText: "👍" }), + ]; + const result = await llmFilterCandidates( + { query: "What does HERMES_REAL_E2E_1910 mean?", ranked }, + { llm, log, config: cfg }, + ); + expect(result.outcome).toBe("llm_filtered_refilled"); + // No informative candidate exists — rescue still keeps top-K by + // ranker score so the agent at least sees one memory. + expect(result.kept.length).toBeGreaterThanOrEqual(1); + expect(result.kept[0]!.candidate.refId).toBe("q1"); // highest score + expect(result.sufficient).toBe(false); + }); + + it("rescue respects llmFilterMaxKeep cap", async () => { + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { selected: [], sufficient: false }, + servedBy: "fake", + }), + }; + const ranked = [ + trace("a", 0.95), + trace("b", 0.9), + trace("c", 0.85), + trace("d", 0.8), + ]; + const result = await llmFilterCandidates( + { query: "q", ranked }, + { llm, log, config: { ...cfg, llmFilterMaxKeep: 2 } }, + ); + expect(result.outcome).toBe("llm_filtered_refilled"); + expect(result.kept.length).toBe(2); + expect(result.dropped.length).toBe(2); + }); + + it("llmFilterMaxKeep=0 disables rescue: honours empty selection (drop-everything override)", async () => { + // This is the only configured way to ask the filter to truly drop + // everything; keep it explicit so operators have an escape hatch. const llm: any = { completeJson: vi.fn().mockResolvedValue({ value: { selected: [], sufficient: false }, @@ -145,7 +265,7 @@ describe("retrieval/llm-filter", () => { const ranked = [trace("a", 0.9), trace("b", 0.8)]; const result = await llmFilterCandidates( { query: "q", ranked }, - { llm, log, config: cfg }, + { llm, log, config: { ...cfg, llmFilterMaxKeep: 0 } }, ); expect(result.outcome).toBe("llm_filtered"); expect(result.kept.length).toBe(0); @@ -153,6 +273,22 @@ describe("retrieval/llm-filter", () => { expect(result.sufficient).toBe(false); }); + it("LLM returns empty selection with empty ranked list → unchanged (still llm_filtered, kept=[])", async () => { + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { selected: [], sufficient: false }, + servedBy: "fake", + }), + }; + // ranked empty but minCandidates=0 so the filter still runs + const result = await llmFilterCandidates( + { query: "q", ranked: [] }, + { llm, log, config: { ...cfg, llmFilterMinCandidates: 0 } }, + ); + expect(result.outcome).toBe("below_threshold"); + expect(result.kept.length).toBe(0); + }); + it("coerces string / number `sufficient` fields sent by lax models", async () => { const llm: any = { completeJson: vi.fn().mockResolvedValue({ diff --git a/apps/memos-local-plugin/viewer/src/views/LogsView.tsx b/apps/memos-local-plugin/viewer/src/views/LogsView.tsx index abac410ed..659bdf3cf 100644 --- a/apps/memos-local-plugin/viewer/src/views/LogsView.tsx +++ b/apps/memos-local-plugin/viewer/src/views/LogsView.tsx @@ -725,6 +725,7 @@ function RetrievalFunnel({ stats }: { stats: RetrievalStatsPayload }) { const localFilterDeferred = outcome === "deferred_to_final"; const finalLlmRan = finalFilter?.outcome === "llm_kept_all" || finalFilter?.outcome === "llm_filtered" || + finalFilter?.outcome === "llm_filtered_refilled" || finalFilter?.outcome === "llm_failed_safe_cutoff"; const fmtNum = (n: number | undefined, digits = 3) => typeof n === "number" && Number.isFinite(n) ? n.toFixed(digits) : "—";