Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 128 additions & 10 deletions apps/memos-local-plugin/core/retrieval/llm-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,32 @@ import type { LlmClient } from "../llm/index.js";
import type { Logger } from "../logger/types.js";
import { RETRIEVAL_FILTER_PROMPT } from "../llm/prompts/index.js";
import type { RankedCandidate } from "./ranker.js";
import type { RetrievalConfig } from "./types.js";
import type { RetrievalConfig, TraceCandidate } from "./types.js";

// Default character budget for one candidate's body text.
// NOTE(review): presumably the truncation limit applied when candidates are
// rendered into the filter prompt — the use site is outside this view; confirm.
const DEFAULT_CANDIDATE_BODY_CHARS = 500;
// Lower and upper bounds for the filter call's output-token budget.
// NOTE(review): presumably used to clamp a computed budget — confirm at the
// call site, which is not visible here.
const MIN_FILTER_OUTPUT_TOKENS = 160;
const MAX_FILTER_OUTPUT_TOKENS = 2048;

/**
 * Minimum trimmed `agentText` length for a trace to count as informative
 * on its own merits.
 *
 * A trace whose `agentText` falls under this length, with no LLM summary
 * or reflection to back it up, is treated as a near-duplicate question
 * trace (issue #1913). The rescue path keeps these *behind* informative
 * candidates so the answer-bearing trace surfaces first.
 *
 * The comparison uses `String.prototype.length`, i.e. UTF-16 code units
 * rather than user-perceived characters.
 */
const INFORMATIVE_AGENT_TEXT_MIN_CHARS = 20;

/**
 * Short acknowledgement / scaffold replies that the filter prompt
 * rightly classes as "scaffolding chatter". When the LLM filter empties
 * the kept set we still need to make a rescue call — these patterns let
 * us prefer informative replies over plain acks.
 *
 * Bounded list, whole-string matches only: each pattern is anchored at
 * both ends and tolerates nothing but trailing whitespace and sentence
 * punctuation after the ack word. Callers are expected to trim leading
 * whitespace before testing.
 *
 * Both patterns accept the same trailing punctuation in ASCII (`.` `!`)
 * and full-width (`。` `!`) forms, because replies routinely mix scripts
 * (e.g. `好的!` or `OK。`). The first pattern is case-insensitive; the
 * second has no letter case to fold.
 */
const SHORT_ACK_PATTERNS: readonly RegExp[] = [
  /^(ok|okay|sure|got it|noted|understood|alright|will do|copy|copy that|thanks|thank you|✓|✅|👍)[\s.!。!]*$/i,
  /^(记住了|已记住|已经记住|好的|明白|收到|了解|谢谢)[\s.!。!]*$/,
];

export interface FilterInput {
query: string;
ranked: readonly RankedCandidate[];
Expand Down Expand Up @@ -70,6 +90,12 @@ export interface FilterResult {
| "deferred_to_final"
| "llm_kept_all"
| "llm_filtered"
// The LLM returned an empty selection over a non-empty ranked list
// (issue #1913 — repeated question traces crowding the hit set).
// We rescued the top-K best-scoring candidates so the agent always
// sees a packet when retrieval succeeded; `sufficient` is forced
// to `false` so downstream callers know the injection is weak.
| "llm_filtered_refilled"
// The LLM was supposed to run but the call failed / parsed badly.
// We applied a mechanical relevance cutoff (top-K above
// `relativeThresholdFloor · topRelevance`) instead of dumping the
Expand Down Expand Up @@ -169,15 +195,18 @@ ${list}`,
);
const keepIndices = new Set(cappedIndices);
if (keepIndices.size === 0) {
// Model asked us to drop everything — honoured. Surface this
// explicitly so the Logs page can show "LLM found nothing
// relevant" instead of silently injecting a partial packet.
return {
kept: [],
dropped: [...ranked],
outcome: "llm_filtered",
sufficient: sufficient ?? false,
};
// Issue #1913: the model asked us to drop everything. Honouring
// that verbatim used to collapse `turn.start` injection to "" even
// when retrieval was healthy — the failure mode is a hit set
// dominated by near-duplicate question traces from prior
// sessions, where each candidate individually looks like
// "surface-similar wrong sub-problem" to the filter prompt.
// Instead, rescue the top-K best-scoring candidates (preferring
// informative traces over pure-question / ack-only chatter) so
// the agent always sees a packet when retrieval succeeded.
// `safeCutoff`'s sibling escape hatch (`llmFilterMaxKeep === 0`)
// is honoured so operators can still ask for hard drop.
return rescueFromEmptySelection(ranked, deps, sufficient);
}
const kept = cappedIndices.map((i) => ranked[i]!);
const dropped: RankedCandidate[] = [];
Expand Down Expand Up @@ -214,6 +243,95 @@ function passthrough(
return { kept: [...ranked], dropped: [], outcome, sufficient: null };
}

/**
 * Issue #1913 rescue path. Invoked when the LLM relevance filter
 * returned `selected: []` for a *non-empty* ranked candidate list — the
 * most common cause is a hit set dominated by near-duplicate question
 * traces from previous sessions, where the filter prompt's "drop
 * scaffolding chatter" / "drop surface-similar wrong sub-problem"
 * rubric is applied to every candidate.
 *
 * Strategy: partition `ranked` into informative candidates and chatter
 * (see `isInformativeCandidate`), keep ranker order inside each bucket,
 * put the informative bucket first, and keep the first
 * `llmFilterMaxKeep` entries. The LLM is NOT re-queried.
 *
 * Outcome labelling:
 * - `"llm_filtered_refilled"` when at least one candidate was rescued.
 *   `sufficient` is always `false` here: the model selected nothing, so
 *   the rescued packet is weak by construction, even if the model also
 *   (contradictorily) reported `sufficient: true`.
 * - `"llm_filtered"` with `kept: []` when nothing can be rescued: the
 *   ranked list is empty, or the cap normalises to zero. This is the
 *   operator escape hatch (`llmFilterMaxKeep === 0`) for a hard drop.
 *
 * @param ranked - Candidates in ranker order; never mutated.
 * @param deps - Supplies `config.llmFilterMaxKeep` and the debug logger.
 * @param sufficient - The model's own sufficiency verdict; only
 *   propagated on the hard-drop path.
 * @returns The kept / dropped partition and the outcome label.
 */
function rescueFromEmptySelection(
  ranked: readonly RankedCandidate[],
  deps: FilterDeps,
  sufficient: boolean | null,
): FilterResult {
  // Normalise the cap to a non-negative integer (or Infinity). NaN and
  // fractional values below 1 would otherwise slip past the `=== 0`
  // guard, make `slice` return nothing, and leave an empty packet
  // mislabelled as "refilled".
  const requestedCap = Math.trunc(deps.config.llmFilterMaxKeep);
  const keepCap = requestedCap > 0 ? requestedCap : 0;

  if (keepCap === 0 || ranked.length === 0) {
    return {
      kept: [],
      dropped: [...ranked],
      outcome: "llm_filtered",
      sufficient: sufficient ?? false,
    };
  }

  const informative: RankedCandidate[] = [];
  const chatter: RankedCandidate[] = [];
  for (const r of ranked) {
    if (isInformativeCandidate(r)) informative.push(r);
    else chatter.push(r);
  }

  // Preserve ranker order within each bucket; informative first so the
  // answer-bearing trace surfaces even when the ranker placed it below
  // surface-similar question traces. `slice` clamps to the array length,
  // so an oversized (or infinite) cap simply keeps everything.
  const kept = [...informative, ...chatter].slice(0, keepCap);
  const keptSet = new Set(kept);
  const dropped = ranked.filter((r) => !keptSet.has(r));

  deps.log.debug("llm_filter.collapsed_refill", {
    ranked: ranked.length,
    rescued: kept.length,
    informative: informative.length,
    chatter: chatter.length,
    filteredAll: true,
  });

  return {
    kept,
    dropped,
    outcome: "llm_filtered_refilled",
    // Forced, not `sufficient ?? false`: a refilled packet must never be
    // reported as sufficient to downstream callers.
    sufficient: false,
  };
}

/**
 * Decides whether a ranked candidate carries content the agent can use.
 *
 * - Any candidate whose `refKind` is not `"trace"` is informative.
 * - A trace is informative when its `summary` or `reflection` has
 *   non-whitespace text, or when its trimmed `agentText` is at least
 *   `INFORMATIVE_AGENT_TEXT_MIN_CHARS` long and is not a bare
 *   acknowledgement.
 *
 * Only the rescue path uses this, to order the rescued set. A trace
 * wrongly classed as chatter is not lost: it is merely queued behind
 * the informative bucket.
 */
function isInformativeCandidate(r: RankedCandidate): boolean {
  const { candidate } = r;
  if (candidate.refKind !== "trace") return true;

  const traceCandidate = candidate as TraceCandidate;
  const hasContent = (value: string | null | undefined): boolean =>
    (value?.trim().length ?? 0) > 0;
  if (hasContent(traceCandidate.summary) || hasContent(traceCandidate.reflection)) {
    return true;
  }

  // An empty reply fails the length test, so it needs no separate guard.
  const reply = traceCandidate.agentText?.trim() ?? "";
  return reply.length >= INFORMATIVE_AGENT_TEXT_MIN_CHARS && !isShortAck(reply);
}

/**
 * Reports whether `text` matches any entry of `SHORT_ACK_PATTERNS`,
 * i.e. is nothing more than a bare acknowledgement.
 */
function isShortAck(text: string): boolean {
  for (const pattern of SHORT_ACK_PATTERNS) {
    if (pattern.test(text)) return true;
  }
  return false;
}

/**
* Mechanical fail-closed: when the LLM is unavailable / errored,
* apply a relative-relevance cutoff so we don't dump the entire ranked
Expand Down
1 change: 1 addition & 0 deletions apps/memos-local-plugin/core/retrieval/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,7 @@ export interface RetrievalStats {
| "deferred_to_final"
| "llm_kept_all"
| "llm_filtered"
| "llm_filtered_refilled"
| "llm_failed_safe_cutoff";
llmFilterSufficient?: boolean;
llmFilterKept?: number;
Expand Down
39 changes: 39 additions & 0 deletions apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,45 @@ describe("retrieval/integration", () => {
expect(res.stats.emptyPacket).toBe(false);
});

it("turn_start rescues injection when LLM filter empties the kept set (#1913)", async () => {
  // Regression guard for issue #1913. The fake LLM answers
  // `selected: []` for a non-empty ranked list (the shape produced when
  // the top hits are all near-duplicate question traces). Before the
  // fix the packet collapsed to an empty injection; now the rescue path
  // must keep the top-K candidates and report `llm_filtered_refilled`
  // so the Logs viewer can show that the safety net fired.
  const emptySelectionLlm: any = {
    completeJson: async () => ({
      value: { selected: [], sufficient: false },
      servedBy: "fake",
    }),
  };
  const baseDeps = makeDeps(handle);
  const filterConfig = {
    ...makeDeps(handle).config,
    llmFilterEnabled: true,
    llmFilterMinCandidates: 1,
  };

  const { packet, stats } = await turnStartRetrieve(
    { ...baseDeps, llm: emptySelectionLlm, config: filterConfig },
    {
      reason: "turn_start",
      agent: "openclaw",
      sessionId: "s_current" as SessionId,
      userText: "run docker compose",
      ts: NOW as never,
    },
  );

  expect(packet.snippets.length).toBeGreaterThan(0);
  expect(packet.rendered.length).toBeGreaterThan(0);
  expect(stats.llmFilterOutcome).toBe("llm_filtered_refilled");
  expect(stats.emptyPacket).toBe(false);
});

it("skill_invoke is tier1-heavy", async () => {
const res = await skillInvokeRetrieve(makeDeps(handle), {
reason: "skill_invoke",
Expand Down
140 changes: 138 additions & 2 deletions apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,33 @@ function trace(id: string, score: number): RankedCandidate {
};
}

/**
 * Test fixture: a trace candidate shaped like the "near-duplicate
 * question trace" from issue #1913. It has no LLM-generated summary or
 * reflection, and its `agentText` is a short acknowledgement — the user
 * re-asked a stored fact and the assistant merely acked it. The LLM
 * filter prompt classes such traces as "scaffolding chatter" and may
 * drop every one of them.
 *
 * @param id - Reference id; also embedded in the default `userText`.
 * @param score - Score handed to the underlying `trace` fixture.
 * @param overrides - Optional replacements for `agentText` / `userText`.
 */
function chatterTrace(
  id: string,
  score: number,
  overrides: { agentText?: string; userText?: string } = {},
): RankedCandidate {
  const base = trace(id, score);
  const { userText, agentText } = overrides;
  const candidate: TraceCandidate = {
    ...(base.candidate as TraceCandidate),
    userText: userText ?? `What does HERMES_REAL_E2E_1910 mean? (${id})`,
    agentText: agentText ?? "OK",
    summary: null,
    reflection: null,
  };
  return { ...base, candidate };
}

describe("retrieval/llm-filter", () => {
it("disabled → passthrough with null sufficient", async () => {
const result = await llmFilterCandidates(
Expand Down Expand Up @@ -135,7 +162,100 @@ describe("retrieval/llm-filter", () => {
expect(result.sufficient).toBe(true);
});

it("LLM returns empty selection → drops everything and marks insufficient", async () => {
it("LLM returns empty selection over non-empty ranked → rescues top-K informative candidates (#1913)", async () => {
  // Issue #1913 repro shape: three near-duplicate question traces from
  // earlier sessions plus one answer-bearing trace. The old filter
  // honoured `selected: []` and collapsed to an empty injection.
  const llm: any = {
    completeJson: vi.fn().mockResolvedValue({
      value: { selected: [], sufficient: false },
      servedBy: "fake",
    }),
  };

  const q1 = chatterTrace("q1", 0.95);
  const q2 = chatterTrace("q2", 0.94);
  // Answer-bearing trace: informative agentText and summary, even though
  // the ranker scored it below the question duplicates.
  const answerBase = trace("answer", 0.85);
  const answer: RankedCandidate = {
    ...answerBase,
    candidate: {
      ...(answerBase.candidate as TraceCandidate),
      userText:
        "Remember this fact: HERMES_REAL_E2E_1910 means the bridge leak fix verification.",
      agentText: "Noted. HERMES_REAL_E2E_1910 → bridge leak fix verification.",
      summary: "HERMES_REAL_E2E_1910 marks the bridge-leak fix verification.",
      reflection: null,
    },
  };
  const q3 = chatterTrace("q3", 0.8);
  const ranked = [q1, q2, answer, q3];

  const { outcome, kept, dropped, sufficient } = await llmFilterCandidates(
    { query: "What does HERMES_REAL_E2E_1910 mean?", ranked },
    { llm, log, config: cfg },
  );

  expect(outcome).toBe("llm_filtered_refilled");
  expect(kept.length).toBeGreaterThanOrEqual(1);
  expect(kept[0]!.candidate.refId).toBe("answer");
  expect(sufficient).toBe(false);
  // Strong negative assertion: the bug returned kept=[], dropped=ranked.
  expect(dropped.length).toBeLessThan(ranked.length);
});

it("rescue fires even when every ranked candidate is short-ack chatter", async () => {
  const emptySelection = {
    value: { selected: [], sufficient: false },
    servedBy: "fake",
  };
  const llm: any = { completeJson: vi.fn().mockResolvedValue(emptySelection) };

  // Every candidate is a bare acknowledgement: English, Chinese, emoji.
  const ranked = [
    chatterTrace("q1", 0.9, { agentText: "OK" }),
    chatterTrace("q2", 0.85, { agentText: "记住了" }),
    chatterTrace("q3", 0.8, { agentText: "👍" }),
  ];

  const { outcome, kept, sufficient } = await llmFilterCandidates(
    { query: "What does HERMES_REAL_E2E_1910 mean?", ranked },
    { llm, log, config: cfg },
  );

  expect(outcome).toBe("llm_filtered_refilled");
  // With no informative candidate available, the rescue falls back to
  // ranker order so the agent still sees at least one memory.
  expect(kept.length).toBeGreaterThanOrEqual(1);
  expect(kept[0]!.candidate.refId).toBe("q1"); // highest score
  expect(sufficient).toBe(false);
});

it("rescue respects llmFilterMaxKeep cap", async () => {
  const emptySelection = {
    value: { selected: [], sufficient: false },
    servedBy: "fake",
  };
  const llm: any = { completeJson: vi.fn().mockResolvedValue(emptySelection) };

  // Four candidates against a cap of two: exactly half must be dropped.
  const fixtures: ReadonlyArray<readonly [string, number]> = [
    ["a", 0.95],
    ["b", 0.9],
    ["c", 0.85],
    ["d", 0.8],
  ];
  const ranked = fixtures.map(([id, score]) => trace(id, score));
  const cappedConfig = { ...cfg, llmFilterMaxKeep: 2 };

  const { outcome, kept, dropped } = await llmFilterCandidates(
    { query: "q", ranked },
    { llm, log, config: cappedConfig },
  );

  expect(outcome).toBe("llm_filtered_refilled");
  expect(kept.length).toBe(2);
  expect(dropped.length).toBe(2);
});

it("llmFilterMaxKeep=0 disables rescue: honours empty selection (drop-everything override)", async () => {
// This is the only configured way to ask the filter to truly drop
// everything; keep it explicit so operators have an escape hatch.
const llm: any = {
completeJson: vi.fn().mockResolvedValue({
value: { selected: [], sufficient: false },
Expand All @@ -145,14 +265,30 @@ describe("retrieval/llm-filter", () => {
const ranked = [trace("a", 0.9), trace("b", 0.8)];
const result = await llmFilterCandidates(
{ query: "q", ranked },
{ llm, log, config: cfg },
{ llm, log, config: { ...cfg, llmFilterMaxKeep: 0 } },
);
expect(result.outcome).toBe("llm_filtered");
expect(result.kept.length).toBe(0);
expect(result.dropped.length).toBe(2);
expect(result.sufficient).toBe(false);
});

it("empty ranked list → short-circuits as below_threshold with kept=[] (rescue path not involved)", async () => {
  // The fake LLM would answer `selected: []`, but with nothing to rank
  // the result must be reported as `below_threshold` — not as
  // `llm_filtered` or `llm_filtered_refilled` — even though
  // `llmFilterMinCandidates` is lowered to 0.
  const llm: any = {
    completeJson: vi.fn().mockResolvedValue({
      value: { selected: [], sufficient: false },
      servedBy: "fake",
    }),
  };
  const result = await llmFilterCandidates(
    { query: "q", ranked: [] },
    { llm, log, config: { ...cfg, llmFilterMinCandidates: 0 } },
  );
  expect(result.outcome).toBe("below_threshold");
  expect(result.kept.length).toBe(0);
});

it("coerces string / number `sufficient` fields sent by lax models", async () => {
const llm: any = {
completeJson: vi.fn().mockResolvedValue({
Expand Down
Loading
Loading