From 8a4c25cd991dd1293a787d348b6bf98eadb16e47 Mon Sep 17 00:00:00 2001 From: Devin Oldenburg <158351052+devinoldenburg@users.noreply.github.com> Date: Sun, 21 Jun 2026 17:12:03 +0000 Subject: [PATCH 1/2] fix: prevent exhaustion loophole and add lean gate mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two critical fixes: 1. Exhaustion loophole — a strong LLM could exhaust every approach, convince all reviewers it 'tried everything', and get PASS verdicts even though the main goal was never achieved (violates Ralph loop). Fix: `completionAllowed` now requires evidence coverage — every acceptance criterion must have at least one piece of fresh (post-last-edit) recorded verification evidence. This is a programmatic gate that no LLM can argue its way past. Configurable via `requireEvidenceCoverage` (default true). 2. Lean gate mode — the full 5+contextual reviewer set consumes significant tokens per review cycle. Add config option `leanGates` (default false) that reduces base gates to 3 (prompt-auditor, reviewer, final-auditor) and disables contextual gates, dramatically cutting token consumption. The evidence-coverage check keeps the exhaustion loophole closed even in lean mode. Also: continuation messages now detect and report uncovered acceptance criteria, so an agent stuck in an exhaustion loop gets explicit guidance. 
--- plugins/goal-guard/agents.js | 13 ++++ plugins/goal-guard/autocontinue.js | 27 ++++++- plugins/goal-guard/completion.js | 23 +++++- plugins/goal-guard/config.js | 16 +++++ plugins/goal-guard/gates.js | 50 +++++++++++-- plugins/goal-guard/summary.js | 6 +- tests/autocontinue.test.mjs | 1 + tests/config.test.mjs | 12 ++++ tests/gates.test.mjs | 112 ++++++++++++++++++++++++++++- tests/plugin.test.mjs | 5 ++ tests/programmatic-review.test.mjs | 2 +- tests/review-runner.test.mjs | 7 +- tests/summary.test.mjs | 1 + 13 files changed, 260 insertions(+), 15 deletions(-) diff --git a/plugins/goal-guard/agents.js b/plugins/goal-guard/agents.js index a3068ff..7fdcac4 100644 --- a/plugins/goal-guard/agents.js +++ b/plugins/goal-guard/agents.js @@ -93,6 +93,19 @@ export const BASE_GATES = Object.freeze([ "goal-final-auditor", ]); +/** + * Minimal base-gate set for lean/token-conscious mode. Drops the code-only + * diff-reviewer and verifier gates — each full review pass runs O(N) subagent + * subtasks (one per required gate) so reducing N by 2 saves two subtask tokens + * per cycle. The prompt auditor, correctness reviewer, and final auditor are the + * irreducible safety floor (goal alignment + correctness + finality). + */ +export const LEAN_BASE_GATES = Object.freeze([ + "goal-prompt-auditor", + "goal-reviewer", + "goal-final-auditor", +]); + /** * Gates that only make sense when the goal actually changed code. A research, * analysis, explanation, or planning goal produces a text/evidence deliverable diff --git a/plugins/goal-guard/autocontinue.js b/plugins/goal-guard/autocontinue.js index 3763445..428a378 100644 --- a/plugins/goal-guard/autocontinue.js +++ b/plugins/goal-guard/autocontinue.js @@ -16,7 +16,7 @@ * `state`, so it is fully unit-testable. 
*/ -import { completionAllowed, missingGates } from "./gates.js"; +import { completionAllowed, missingGates, evidenceCoverageMet } from "./gates.js"; /** Consecutive no-change idle ticks after which auto-continue pauses for the human. */ export const NO_PROGRESS_LIMIT = 4; @@ -54,6 +54,9 @@ export function progressSignature(state, config) { * exact reviewer gates via the task tool. */ export function continuationMessage(state, config) { const missing = missingGates(state, config); + const criteria = Array.isArray(state?.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : []; + const hasCriteria = criteria.length > 0; + const evidenceOk = evidenceCoverageMet(state); const lines = ["The goal is NOT complete — do not stop. Continue working now."]; if (!state?.contract) { lines.push("First, record the Goal Contract with the `goal_contract` tool (title, the original request, and concrete acceptance criteria) so the objective is anchored."); @@ -61,6 +64,28 @@ export function continuationMessage(state, config) { if (state?.dirty) { lines.push("There are changes that are not yet reviewed/verified after your latest edits — actually run the code/tests and record it with `goal_evidence`."); } + if (hasCriteria && !evidenceOk) { + const uncovered = criteria.filter((c) => { + const full = String(c).trim().toLowerCase(); + if (!full) return false; + const entries = Array.isArray(state?.evidence) ? state.evidence : []; + const lastEditSeq = state?.lastEditSeq || 0; + return !entries.some((entry) => { + const ecriteria = Array.isArray(entry.criteria) ? entry.criteria : []; + if (!ecriteria.some((ec) => String(ec).trim().toLowerCase() === full)) return false; + if (!entry.seq) return lastEditSeq === 0; + return entry.seq > lastEditSeq; + }); + }); + if (uncovered.length) { + lines.push( + `EVIDENCE COVERAGE MISSING — every acceptance criterion must have fresh recorded ` + + `evidence. Uncovered criteria: ${uncovered.map((c) => `"${c}"`).join(", ")}. 
` + + `Run verification and record with \`goal_evidence\`, passing each uncovered criterion. ` + + `Mere exhaustion of approaches is NOT success — the guard requires proof.`, + ); + } + } if (missing.length) { if (config?.programmaticReview) { lines.push( diff --git a/plugins/goal-guard/completion.js b/plugins/goal-guard/completion.js index 27b9a7d..4523cfe 100644 --- a/plugins/goal-guard/completion.js +++ b/plugins/goal-guard/completion.js @@ -28,7 +28,7 @@ * adversarial digit-runs cannot trigger polynomial backtracking (issue #367). */ -import { missingGates, completionAllowed } from "./gates.js"; +import { missingGates, completionAllowed, evidenceCoverageMet } from "./gates.js"; import { summarizeState } from "./summary.js"; const CYCLES_RE = /Review cycles:\s*(\d+)/gi; @@ -131,7 +131,26 @@ export function evaluateCompletionClaim(state, config, text) { reason = `claimed review cycles (${claimedCycles}) do not match recorded review cycles (${state.reviewCycles})`; } else if (!completionAllowed(state, config)) { const missing = missingGates(state, config).join(", "); - reason = `required review gates are missing or stale (${missing || "goal session not active"})`; + if (missing) { + reason = `required review gates are missing or stale (${missing})`; + } else if (!evidenceCoverageMet(state)) { + const criteria = Array.isArray(state?.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : []; + const uncovered = criteria.filter((c) => { + const full = String(c).trim().toLowerCase(); + if (!full) return false; + const entries = Array.isArray(state?.evidence) ? state.evidence : []; + const lastEditSeq = state?.lastEditSeq || 0; + return !entries.some((entry) => { + const ecriteria = Array.isArray(entry.criteria) ? 
entry.criteria : []; + if (!ecriteria.some((ec) => String(ec).trim().toLowerCase() === full)) return false; + if (!entry.seq) return lastEditSeq === 0; + return entry.seq > lastEditSeq; + }); + }); + reason = `acceptance criteria lack evidence coverage (${uncovered.map((c) => `"${c}"`).join(", ") || "all criteria"}) — goal not achieved, mere exhaustion is not success`; + } else { + reason = "goal session not active"; + } } if (!reason) return { blocked: false, claimedCycles }; diff --git a/plugins/goal-guard/config.js b/plugins/goal-guard/config.js index f8560df..ed05f4b 100644 --- a/plugins/goal-guard/config.js +++ b/plugins/goal-guard/config.js @@ -56,6 +56,18 @@ export const DEFAULT_CONFIG = Object.freeze({ * explanation goal is gated on evidence instead, never on an empty diff); * "always" forces them on; "never" turns them off. */ requireCodeReview: "auto", + /** Require fresh evidence covering every acceptance criterion before completion + * is allowed. This is the programmatic exhaustion-prevention gate: a strong LLM + * that exhausts every approach and persuades the reviewers it "tried everything" + * cannot satisfy this check for criteria it never actually achieved — the guard + * demands verified, recorded proof, not statements of effort. Default true. */ + requireEvidenceCoverage: true, + /** Lean mode — reduces the base gate set to the 3 safety-critical reviewers + * (prompt-auditor, reviewer, final-auditor) and disables contextual gates, so + * each full review round runs fewer subagent subtasks and token consumption per + * cycle drops sharply. Combined with the evidence-coverage check this keeps the + * exhaustion loophole closed. Default false. */ + leanGates: false, /** Block non-Goal agents from invoking the goal-* subagents via the task tool. */ restrictSubagents: true, /** Maximum tracked sessions before LRU eviction. 
*/ @@ -134,6 +146,8 @@ export const CONFIG_DOCS = Object.freeze({ // Gates & scope contextualGates: { group: "Gates", summary: "Require specialist reviewer gates derived from goal text / changed files." }, requireCodeReview: { group: "Gates", summary: "When to require the code-only diff/verification gates: 'auto' (only once the goal edits a file), 'always', or 'never'. Lets non-code agentic goals complete on evidence." }, + requireEvidenceCoverage: { group: "Gates", summary: "Require fresh recorded evidence for every acceptance criterion before completion (exhaustion-prevention gate — the guard demands verified proof, not statements of effort)." }, + leanGates: { group: "Gates", summary: "Lean mode: reduce the base reviewer set and disable contextual gates for much lower token consumption per review cycle." }, restrictSubagents: { group: "Gates", summary: "Lock the goal-* subagents to the Goal agent (other agents can't call them)." }, // State & lifecycle injectSystemState: { group: "State", summary: "Inject a live Goal Guard state block into the system prompt." }, @@ -218,6 +232,8 @@ function fromEnv(env) { GOAL_GUARD_PERSIST: ["persist", coerceBool], GOAL_GUARD_CONTEXTUAL_GATES: ["contextualGates", coerceBool], GOAL_GUARD_REQUIRE_CODE_REVIEW: ["requireCodeReview", coerceStr], + GOAL_GUARD_REQUIRE_EVIDENCE_COVERAGE: ["requireEvidenceCoverage", coerceBool], + GOAL_GUARD_LEAN_GATES: ["leanGates", coerceBool], GOAL_GUARD_RESTRICT_SUBAGENTS: ["restrictSubagents", coerceBool], GOAL_GUARD_MAX_SESSIONS: ["maxSessions", coerceInt], GOAL_GUARD_SESSION_TTL_MS: ["sessionTtlMs", coerceInt], diff --git a/plugins/goal-guard/gates.js b/plugins/goal-guard/gates.js index 943c6ce..6a8a1ec 100644 --- a/plugins/goal-guard/gates.js +++ b/plugins/goal-guard/gates.js @@ -13,7 +13,7 @@ * Re-running verification after a clean review no longer re-opens the gates. 
*/ -import { BASE_GATES, CODE_GATES, CONTEXTUAL_GATES } from "./agents.js"; +import { BASE_GATES, LEAN_BASE_GATES, CODE_GATES, CONTEXTUAL_GATES } from "./agents.js"; /** * Whether this goal has touched code. A goal is code-bearing once it is dirty or has @@ -79,14 +79,19 @@ export function refreshStickyGates(state) { /** The reviewers that must PASS for this state, given config. */ export function requiredGates(state, config) { + // Lean mode uses the minimal base-gate set (3 reviewers instead of 5) and + // disables contextual gates — drastically reduces token consumption per cycle. + const lean = !!(config?.leanGates); // Code-only gates (diff review, verification) are required only when the goal // actually changed code; a non-code goal is gated on its evidence instead, so it // is not blocked forever by an empty `git diff`. `requireCodeReview` overrides the // auto-detection: "always" forces them on, "never" off. const mode = config?.requireCodeReview || "auto"; const includeCodeGates = mode === "always" ? true : mode === "never" ? false : isCodeBearing(state); - const gates = BASE_GATES.filter((agent) => includeCodeGates || !CODE_GATES.includes(agent)); - if (!config || config.contextualGates) { + const baseSet = lean ? LEAN_BASE_GATES : BASE_GATES; + const gates = baseSet.filter((agent) => includeCodeGates || !CODE_GATES.includes(agent)); + // Contextual gates are disabled in lean mode (token economy). + if (!lean && (!config || config.contextualGates)) { const contextual = new Set([...(state.stickyGates || []), ...contextualGatesFor(state)]); for (const agent of contextual) { if (!gates.includes(agent)) gates.push(agent); @@ -106,6 +111,43 @@ export function missingGates(state, config) { return requiredGates(state, config).filter((agent) => !gatePassedFresh(state, agent)); } +/** + * Whether the evidence ledger covers every recorded acceptance criterion with + * at least one piece of fresh (post-last-edit) evidence. 
A goal with no criteria + * trivially passes. + * + * This is a programmatic exhaustion-prevention gate: a strong LLM agent that + * exhausts every approach and convinces the reviewers it "tried everything" + * cannot fake evidence for criteria it never achieved. The guard requires + * VERIFIED, recorded proof — not statements of effort. + */ +export function evidenceCoverageMet(state) { + const criteria = Array.isArray(state?.contract?.acceptanceCriteria) + ? state.contract.acceptanceCriteria + : []; + if (!criteria.length) return true; // no criteria = nothing to cover + const entries = Array.isArray(state?.evidence) ? state.evidence : []; + const lastEditSeq = state?.lastEditSeq || 0; + for (const criterion of criteria) { + const full = String(criterion).trim().toLowerCase(); + if (!full) continue; + const covered = entries.some((entry) => { + const ecriteria = Array.isArray(entry.criteria) ? entry.criteria : []; + if (!ecriteria.some((c) => String(c).trim().toLowerCase() === full)) return false; + // Evidence must be fresh (recorded after the last edit). + if (!entry.seq) return lastEditSeq === 0; + return entry.seq > lastEditSeq; + }); + if (!covered) return false; + } + return true; +} + export function completionAllowed(state, config) { - return Boolean(state.active) && missingGates(state, config).length === 0; + if (!state.active) return false; + if (missingGates(state, config).length !== 0) return false; + // Evidence-coverage check: every acceptance criterion must have fresh recorded + // evidence — a programmatic gate the reviewers cannot be persuaded past. + if (config?.requireEvidenceCoverage !== false && !evidenceCoverageMet(state)) return false; + return true; } diff --git a/plugins/goal-guard/summary.js b/plugins/goal-guard/summary.js index 828f546..cf95243 100644 --- a/plugins/goal-guard/summary.js +++ b/plugins/goal-guard/summary.js @@ -3,7 +3,7 @@ * messages, and the `goal_status` tool. Kept pure and dependency-light. 
*/ -import { requiredGates, missingGates, gatePassedFresh } from "./gates.js"; +import { requiredGates, missingGates, gatePassedFresh, completionAllowed } from "./gates.js"; import { prettyAgentName } from "./agents.js"; /** @@ -99,7 +99,7 @@ export function sidebarView(state, config) { const cycles = Number(state.reviewCycles) || 0; const gates = `${passing}/${required.length} gates`; const todos = sidebarTodos(state, required, missing); - const done = required.length > 0 && missing.length === 0 && !state.dirty; + const done = completionAllowed(state, config) && !state.dirty; if (done) { return { state: "done", @@ -190,7 +190,7 @@ export function statusReport(state, config) { reviewerMemory: reviewerMemoryReport(state), changedFiles: state.changedFiles.slice(-50), contract: state.contract, - completionAllowed: Boolean(state.active) && missing.length === 0, + completionAllowed: completionAllowed(state, config), }; } diff --git a/tests/autocontinue.test.mjs b/tests/autocontinue.test.mjs index f303bef..4a45d47 100644 --- a/tests/autocontinue.test.mjs +++ b/tests/autocontinue.test.mjs @@ -56,6 +56,7 @@ test("a COMPLETE goal does not auto-continue and resets the counters", () => { reviewCycles: 1, autoContinueCount: 7, autoContinueNoProgress: 3, + evidence: [{ command: "test", result: "pass", criteria: ["done"], seq: 2 }], }); const d = evaluateAutoContinue(st, DEFAULT_CONFIG); assert.equal(d.continue, false); diff --git a/tests/config.test.mjs b/tests/config.test.mjs index 8aa70de..ab47b2b 100644 --- a/tests/config.test.mjs +++ b/tests/config.test.mjs @@ -128,3 +128,15 @@ test("REGRESSION: a degenerate maxSessions (0/negative) falls back to the defaul assert.equal(resolveConfig({ maxSessions: 25 }).maxSessions, 25); assert.equal(resolveConfig({}, { GOAL_GUARD_MAX_SESSIONS: "0" }).maxSessions, DEFAULT_CONFIG.maxSessions); }); + +test("requireEvidenceCoverage defaults to true and is configurable", () => { + assert.equal(DEFAULT_CONFIG.requireEvidenceCoverage, true); + 
assert.equal(resolveConfig({ requireEvidenceCoverage: false }).requireEvidenceCoverage, false); + assert.equal(resolveConfig(undefined, { GOAL_GUARD_REQUIRE_EVIDENCE_COVERAGE: "off" }).requireEvidenceCoverage, false); +}); + +test("leanGates defaults to false and is configurable", () => { + assert.equal(DEFAULT_CONFIG.leanGates, false); + assert.equal(resolveConfig({ leanGates: true }).leanGates, true); + assert.equal(resolveConfig(undefined, { GOAL_GUARD_LEAN_GATES: "1" }).leanGates, true); +}); diff --git a/tests/gates.test.mjs b/tests/gates.test.mjs index 23d4272..b396e22 100644 --- a/tests/gates.test.mjs +++ b/tests/gates.test.mjs @@ -1,8 +1,8 @@ import test from "node:test"; import assert from "node:assert/strict"; import { createState } from "../plugins/goal-guard/state.js"; -import { requiredGates, gatePassedFresh, missingGates, completionAllowed, refreshStickyGates, isCodeBearing } from "../plugins/goal-guard/gates.js"; -import { BASE_GATES, CODE_GATES, CONTEXTUAL_GATES } from "../plugins/goal-guard/agents.js"; +import { requiredGates, gatePassedFresh, missingGates, completionAllowed, refreshStickyGates, isCodeBearing, evidenceCoverageMet } from "../plugins/goal-guard/gates.js"; +import { BASE_GATES, LEAN_BASE_GATES, CODE_GATES, CONTEXTUAL_GATES } from "../plugins/goal-guard/agents.js"; const cfg = { contextualGates: true }; @@ -154,3 +154,111 @@ test("refreshStickyGates persists a gate even after the keyword disappears", () st.goalText = "rename a variable"; assert.ok(requiredGates(st, cfg).includes("goal-data-reviewer"), "sticky gate survives keyword loss"); }); + +// ── Evidence coverage (exhaustion-prevention gate) ────────────────── + +test("evidenceCoverageMet: true when no acceptance criteria exist", () => { + const st = createState(); + st.contract = { acceptanceCriteria: [] }; + assert.equal(evidenceCoverageMet(st), true); + st.contract = null; + assert.equal(evidenceCoverageMet(st), true); +}); + +test("evidenceCoverageMet: false when criteria 
have no evidence", () => { + const st = createState(); + st.contract = { acceptanceCriteria: ["the thing works", "logs are clean"] }; + assert.equal(evidenceCoverageMet(st), false); +}); + +test("evidenceCoverageMet: true when all criteria have fresh evidence", () => { + const st = createState(); + st.lastEditSeq = 3; + st.contract = { acceptanceCriteria: ["the thing works", "logs are clean"] }; + st.evidence = [ + { command: "test a", result: "pass", criteria: ["the thing works"], seq: 4 }, + { command: "test b", result: "pass", criteria: ["logs are clean"], seq: 5 }, + ]; + assert.equal(evidenceCoverageMet(st), true); +}); + +test("evidenceCoverageMet: false when evidence is stale (recorded before last edit)", () => { + const st = createState(); + st.lastEditSeq = 10; + st.contract = { acceptanceCriteria: ["the thing works"] }; + st.evidence = [ + { command: "old test", result: "pass", criteria: ["the thing works"], seq: 5 }, + ]; + assert.equal(evidenceCoverageMet(st), false); +}); + +test("evidenceCoverageMet: case-insensitive criterion matching", () => { + const st = createState(); + st.contract = { acceptanceCriteria: ["THE THING WORKS"] }; + st.evidence = [ + { command: "test", result: "pass", criteria: ["the thing works"], seq: 1 }, + ]; + assert.equal(evidenceCoverageMet(st), true); +}); + +test("completionAllowed: blocked when evidence coverage is missing (requireEvidenceCoverage default true)", () => { + const st = createState(); + st.active = true; + st.lastEditSeq = 1; + st.contract = { acceptanceCriteria: ["unmet criterion"] }; + for (const g of BASE_GATES) withVerdict(st, g, "PASS", 2); + // All gates pass, but no evidence → completion should NOT be allowed. 
+ assert.equal(completionAllowed(st, cfg), false); +}); + +test("completionAllowed: evidence coverage check can be disabled via config", () => { + const st = createState(); + st.active = true; + st.lastEditSeq = 1; + st.contract = { acceptanceCriteria: ["unmet criterion"] }; + for (const g of BASE_GATES) withVerdict(st, g, "PASS", 2); + // Disable evidence coverage requirement. + assert.equal(completionAllowed(st, { ...cfg, requireEvidenceCoverage: false }), true); +}); + +test("completionAllowed: evidence coverage is satisfied with evidence (default config)", () => { + const st = createState(); + st.active = true; + st.lastEditSeq = 1; + st.contract = { acceptanceCriteria: ["met criterion"] }; + st.evidence = [{ command: "test", result: "pass", criteria: ["met criterion"], seq: 2 }]; + for (const g of BASE_GATES) withVerdict(st, g, "PASS", 2); + assert.equal(completionAllowed(st, cfg), true); +}); + +// ── Lean gates (reduced checker set) ──────────────────────────────── + +test("leanGates uses LEAN_BASE_GATES instead of BASE_GATES", () => { + const st = createState(); + st.dirty = true; // code-bearing so code gates would normally be included + const leanCfg = { ...cfg, leanGates: true }; + const gates = requiredGates(st, leanCfg); + for (const g of LEAN_BASE_GATES) assert.ok(gates.includes(g), `lean must include ${g}`); + // The code-only gates (diff-reviewer, verifier) must NOT be in the lean set. 
+ assert.equal(gates.includes("goal-diff-reviewer"), false, "lean mode drops diff-reviewer"); + assert.equal(gates.includes("goal-verifier"), false, "lean mode drops verifier"); +}); + +test("leanGates disables contextual gates", () => { + const st = createState(); + st.goalText = "add an auth endpoint and a database migration"; + const leanCfg = { ...cfg, leanGates: true }; + const gates = requiredGates(st, leanCfg); + assert.equal(gates.includes("goal-security-reviewer"), false, "lean mode drops contextual security"); + assert.equal(gates.includes("goal-api-reviewer"), false, "lean mode drops contextual api"); + assert.equal(gates.includes("goal-data-reviewer"), false, "lean mode drops contextual data"); +}); + +test("leanGates only includes LEAN_BASE_GATES for a code-bearing goal", () => { + const st = createState(); + st.dirty = true; + const leanCfg = { ...cfg, leanGates: true }; + const gates = requiredGates(st, leanCfg); + assert.deepEqual(gates.length, LEAN_BASE_GATES.length, "lean gates should equal the lean base set"); + assert.deepEqual(gates.sort(), [...LEAN_BASE_GATES].sort()); +}); diff --git a/tests/plugin.test.mjs b/tests/plugin.test.mjs index f6200a4..fb0e7e0 100644 --- a/tests/plugin.test.mjs +++ b/tests/plugin.test.mjs @@ -744,6 +744,11 @@ test("a NEW goal in the same session does not inherit the previous goal's gates/ { title: "Polish the welcome banner", original: "make the welcome banner look nicer", acceptanceCriteria: ["the banner greeting reads well"] }, { sessionID: "switch" }, ); + // Record evidence so completionAllowed (which now checks evidence coverage) allows the goal to complete. 
+ await tools.goal_evidence.execute( + { command: "check banner", result: "pass", criteria: ["the banner greeting reads well"] }, + { sessionID: "switch" }, + ); await passAllBaseGates(guard.hooks, guard.store, "switch"); let view = sidebarView(guard.store.stateFor("switch"), guard.config); assert.equal(view.goal, "Polish the welcome banner"); diff --git a/tests/programmatic-review.test.mjs b/tests/programmatic-review.test.mjs index 1c36cfc..1af7a47 100644 --- a/tests/programmatic-review.test.mjs +++ b/tests/programmatic-review.test.mjs @@ -139,7 +139,7 @@ test("[live-parity] the guard runs programmatic reviews when only goal_evidence await guard.hooks["chat.params"]({ sessionID: "g", agent: "goal", model: MODEL }, {}); await guard.hooks["chat.message"]({ sessionID: "g", agent: "goal", model: MODEL }, { parts: [{ type: "text", text: "verify remote server setup" }] }); await tools.goal_contract.execute({ title: "Remote verify", original: "verify remote server setup", acceptanceCriteria: ["server reachable"] }, { sessionID: "g" }); - await tools.goal_evidence.execute({ command: "curl -fsS https://example.com", result: "PASS" }, { sessionID: "g" }); + await tools.goal_evidence.execute({ command: "curl -fsS https://example.com", result: "PASS", criteria: ["server reachable"] }, { sessionID: "g" }); await guard.hooks.event({ event: { type: "session.idle", properties: { sessionID: "g" } } }); // No file edits → a non-code goal, gated on its evidence by the always-on // reviewers (prompt-auditor, reviewer, final-auditor); review still runs. 
diff --git a/tests/review-runner.test.mjs b/tests/review-runner.test.mjs index b921c31..dd53b88 100644 --- a/tests/review-runner.test.mjs +++ b/tests/review-runner.test.mjs @@ -1,7 +1,7 @@ import test from "node:test"; import assert from "node:assert/strict"; import { createStore } from "../plugins/goal-guard/state.js"; -import { markEdit } from "../plugins/goal-guard/events.js"; +import { markEdit, recordEvidence } from "../plugins/goal-guard/events.js"; import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js"; import { completionAllowed } from "../plugins/goal-guard/gates.js"; import { runReviewCycle, clientCanReview, ensureReviewClient, reviewerPrompt } from "../plugins/goal-guard/review-runner.js"; @@ -49,6 +49,8 @@ function goalState(store, id) { state.active = true; state.contract = { title: "Do x", original: "do x", acceptanceCriteria: ["x works"] }; markEdit(store, state, "edit"); // there is work to review + // Record evidence so the exhaustion-prevention gate (evidenceCoverageMet) is satisfied. 
+ recordEvidence(store, state, "test command", "PASS", ["x works"]); return state; } @@ -117,7 +119,8 @@ test("the cycle: FAIL → fix(edit) → re-review PASS counts 2 review cycles", assert.equal(r1.completionAllowed, false); assert.equal(state.reviewCycles, 1); markEdit(store, state, "fix after review"); - assert.equal(completionAllowed(state, DEFAULT_CONFIG), false); + recordEvidence(store, state, "test command", "PASS", ["x works"]); // re-verify after the edit + assert.equal(completionAllowed(state, DEFAULT_CONFIG), false); // evidence fresh but gates still stale from edit const r2 = await runReviewCycle(mockReviewClient("PASS", sid), store, state, DEFAULT_CONFIG, fastOpts(sid)); assert.equal(r2.completionAllowed, true); assert.equal(state.reviewCycles, 2); diff --git a/tests/summary.test.mjs b/tests/summary.test.mjs index e592ab9..6897e15 100644 --- a/tests/summary.test.mjs +++ b/tests/summary.test.mjs @@ -165,6 +165,7 @@ test("#291 sidebarView: a fully-passed, clean goal renders as done", () => { const st = createState(); st.active = true; st.contract = { title: "Done goal", acceptanceCriteria: ["it works"] }; + st.evidence = [{ command: "npm test", result: "pass", criteria: ["it works"], seq: 1 }]; for (const gate of BASE_GATES) recordVerdict(store, st, gate, "PASS"); st.dirty = false; const view = sidebarView(st, CFG); From e0127ed80690a822b98170b30ed0d48201a422d3 Mon Sep 17 00:00:00 2001 From: Devin Oldenburg <158351052+devinoldenburg@users.noreply.github.com> Date: Sun, 21 Jun 2026 17:31:38 +0000 Subject: [PATCH 2/2] fix(test): update deep-bughunt allowedState helper to record evidence for acceptance criteria --- tests/deep-bughunt.test.mjs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/deep-bughunt.test.mjs b/tests/deep-bughunt.test.mjs index ce0d931..d82ded4 100644 --- a/tests/deep-bughunt.test.mjs +++ b/tests/deep-bughunt.test.mjs @@ -11,7 +11,7 @@ import { evaluateCompletionClaim } from "../plugins/goal-guard/completion.js"; 
import { analyzeCommand } from "../plugins/goal-guard/shell.js"; import { goalSimilarity, SAME_GOAL_THRESHOLD, createGoalTools } from "../plugins/goal-guard/tools.js"; import { createStore, createState } from "../plugins/goal-guard/state.js"; -import { markEdit } from "../plugins/goal-guard/events.js"; +import { markEdit, recordEvidence } from "../plugins/goal-guard/events.js"; import { runReviewCycle } from "../plugins/goal-guard/review-runner.js"; import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js"; import { completionAllowed } from "../plugins/goal-guard/gates.js"; @@ -59,6 +59,7 @@ async function allowedState(id) { state.active = true; state.contract = { title: "Do x", original: "do x", acceptanceCriteria: ["x works"] }; markEdit(store, state, "edit"); + recordEvidence(store, state, "test", "PASS", ["x works"]); await runReviewCycle(mockReviewClient("PASS", id), store, state, DEFAULT_CONFIG, { sessionID: id, sleep: async () => {}, pollMs: 1, timeoutMs: 500 }); return state; }