From c5a57efb74751e648aad1cf03caa23e028d52305 Mon Sep 17 00:00:00 2001
From: antnewman
Date: Thu, 7 May 2026 15:31:35 +0100
Subject: [PATCH] test(canaries): add tier-1 semantic canaries on compiled
 prompt output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #47.

Pins 13 executive phrasings the compiler is supposed to produce. If a
refactor accidentally drops or alters any of these structures while
keeping unit tests green, the corresponding canary fires loudly.

Each canary is authored against current compiler output — codifies
present behaviour as the contract going forward, not aspirational.

Refinements per #47 review:
- 'must be valid JSON' is load-bearing in the contract narrative; pinned
- Strategy preamble matches on the strategy name appearing somewhere,
  not on a specific sentence (lower brittleness, same
  regression-catching power)
- Retry-context section header and previousFailureReason content are
  asserted independently — both halves can break independently
- Output-schema fenced block (json) is asserted as a serialisation
  contract distinct from the surrounding 'must be valid JSON' prose

Coverage: 13 canaries
- 3 on output-schema rendering (header, must-be-valid-JSON, fenced json)
- 3 on quality-gate rendering (header, before-responding-verify, checkbox)
- 2 on confidence config (minimum threshold, escalate_below)
- 2 on retry context (header, previousFailureReason content)
- 1 on reasoning strategy preamble
- 1 on branch context
- 1 on step identity (Current Step header)

Verified loudly-failing on a deliberate mutation: changed 'must be valid
JSON' to 'must be JSON-shaped' in compiler.ts; the load-bearing canary
failed cleanly with a clear assertion message. Restored before commit.

Tier 2 (snapshot tests for conformance fixtures) deliberately deferred
per Rain's #47 comment: 'Get tier 1 in, run it for a few weeks, see what
slips through.'
---
 packages/core/canaries.test.ts | 267 +++++++++++++++++++++++++++++++++
 1 file changed, 267 insertions(+)
 create mode 100644 packages/core/canaries.test.ts

diff --git a/packages/core/canaries.test.ts b/packages/core/canaries.test.ts
new file mode 100644
index 0000000..2fa94c5
--- /dev/null
+++ b/packages/core/canaries.test.ts
@@ -0,0 +1,267 @@
+// =============================================================================
+// Canary tests — semantic invariants on compiled prompt output
+// =============================================================================
+// Pins the executive phrasings the compiler is supposed to produce. If a
+// refactor accidentally drops any of these structures while keeping unit
+// tests green, the corresponding canary fires loudly.
+//
+// Each canary documents what it pins. Refinements per #47 review:
+//   - "must be valid JSON" is load-bearing in the contract narrative; pinned.
+//   - Strategy preamble matches on the strategy NAME appearing, not on a
+//     specific sentence (lower brittleness, same regression-catching power).
+//   - Retry-context section header and previousFailureReason content are
+//     asserted independently — both halves can break independently.
+//   - Output-schema fenced block (```json ... ```) is asserted as a serialisation
+//     contract distinct from the surrounding prose.
+//
+// These canaries are AUTHORED AGAINST CURRENT COMPILER OUTPUT — they codify
+// present behaviour as the contract going forward, not aspirational behaviour.
+// =============================================================================
+
+import { describe, expect, it } from "vitest";
+import { compileStep } from "./compiler.js";
+import type { ExecutionContext, LogicSpec, Step } from "./types.js";
+
+// -----------------------------------------------------------------------------
+// Helpers (match style of compiler.test.ts)
+// -----------------------------------------------------------------------------
+// makeSpec: minimal spec around `steps`; `overrides` is spread last, so it replaces defaults.
+function makeSpec(steps: Record<string, Step>, overrides: Partial<LogicSpec> = {}): LogicSpec {
+  return {
+    spec_version: "1.0",
+    name: "canary-spec",
+    steps,
+    ...overrides,
+  };
+}
+// makeCtx: attempt-1 context with null input/branch/failure reason; `overrides` is spread last.
+function makeCtx(overrides: Partial<ExecutionContext> = {}): ExecutionContext {
+  return {
+    currentStep: "test",
+    previousOutputs: {},
+    input: null,
+    attemptNumber: 1,
+    branchReason: null,
+    previousFailureReason: null,
+    ...overrides,
+  };
+}
+
+// -----------------------------------------------------------------------------
+// Canaries
+// -----------------------------------------------------------------------------
+
+describe("canaries: compiler output semantic invariants", () => {
+  // -- Output contract ------------------------------------------------------
+
+  it("step with output_schema produces a Required Output Format section", () => {
+    const spec = makeSpec({
+      step_a: {
+        output_schema: {
+          type: "object",
+          required: ["result"],
+          properties: { result: { type: "string" } },
+        },
+      },
+    });
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toContain("## Required Output Format");
+  });
+
+  it("output schema produces the load-bearing 'must be valid JSON' phrasing", () => {
+    // Pinned per #47 review: this phrasing carries the contract narrative.
+    // A refactor that softens it (e.g. to "should be JSON-shaped") would
+    // silently weaken the project's central pitch.
+    const spec = makeSpec({
+      step_a: {
+        output_schema: { type: "object" },
+      },
+    });
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toContain("must be valid JSON");
+  });
+
+  it("output schema is rendered inside a fenced ```json block, not inlined as prose", () => {
+    // Pinned per #47 review as a serialisation-contract regression test
+    // distinct from the surrounding "must be valid JSON" prose. If the
+    // schema serialisation is silently switched to plain text or to a
+    // non-JSON fence, this fires.
+    const spec = makeSpec({
+      step_a: {
+        output_schema: {
+          type: "object",
+          properties: { x: { type: "number" } },
+        },
+      },
+    });
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toMatch(/```json\s*\n[\s\S]+?\n```/);
+  });
+
+  // -- Quality gates --------------------------------------------------------
+
+  it("step with quality gates produces a Pre-Response Checklist section", () => {
+    const spec = makeSpec(
+      {
+        step_a: {},
+      },
+      {
+        quality_gates: {
+          pre_output: [{ name: "groundedness", check: "{{ output.cited == true }}" }],
+        },
+      },
+    );
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toContain("## Pre-Response Checklist");
+  });
+
+  it("quality gates produce 'Before responding, verify:' framing", () => {
+    const spec = makeSpec(
+      {
+        step_a: {},
+      },
+      {
+        quality_gates: {
+          pre_output: [{ name: "groundedness", check: "{{ output.cited == true }}" }],
+        },
+      },
+    );
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toContain("Before responding, verify:");
+  });
+
+  it("quality gate items render as Markdown checkboxes (- [ ])", () => {
+    const spec = makeSpec(
+      {
+        step_a: {},
+      },
+      {
+        quality_gates: {
+          pre_output: [
+            { name: "groundedness", check: "{{ output.cited == true }}", message: "Cite sources" },
+          ],
+        },
+      },
+    );
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toMatch(/^- \[ \]/m);
+  });
+
+  // -- Confidence -----------------------------------------------------------
+
+  it("step with confidence config produces explicit threshold language", () => {
+    const spec = makeSpec({
+      step_a: {
+        confidence: { minimum: 0.7, target: 0.85 },
+      },
+    });
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toMatch(/minimum confidence of [0-9.]+/);
+  });
+
+  it("step with escalate_below produces escalation language", () => {
+    const spec = makeSpec({
+      step_a: {
+        confidence: { minimum: 0.7, escalate_below: 0.4 },
+      },
+    });
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toMatch(/falls below [0-9.]+/);
+    expect(compiled.systemPromptSegment).toMatch(/escalate/i);
+  });
+
+  // -- Retry context (split into two independent canaries per #47 review) ---
+
+  it("retry context (attempt > 1) produces a Retry Context section header", () => {
+    // Header presence — independent of whether a previousFailureReason is set.
+    const spec = makeSpec({
+      step_a: {},
+    });
+
+    const compiled = compileStep(spec, "step_a", makeCtx({ attemptNumber: 2 }));
+
+    expect(compiled.systemPromptSegment).toContain("## Retry Context");
+  });
+
+  it("retry context surfaces the previousFailureReason content", () => {
+    // Content presence — independent of the section header. Both halves
+    // can regress independently per #47 review.
+    const reason = "low confidence on step_a output";
+    const spec = makeSpec({
+      step_a: {},
+    });
+
+    const compiled = compileStep(
+      spec,
+      "step_a",
+      makeCtx({ attemptNumber: 2, previousFailureReason: reason }),
+    );
+
+    expect(compiled.systemPromptSegment).toContain(reason);
+  });
+
+  // -- Strategy preamble (per #47 review: match name, not sentence) ---------
+
+  it("reasoning strategy is named in the strategy preamble", () => {
+    // Per #47 review: match on the strategy NAME appearing somewhere in the
+    // segment, not on a specific sentence. Lower brittleness, same
+    // regression-catching power.
+    const spec = makeSpec(
+      {
+        step_a: {},
+      },
+      {
+        reasoning: { strategy: "react" },
+      },
+    );
+
+    const compiled = compileStep(spec, "step_a", makeCtx());
+
+    expect(compiled.systemPromptSegment).toContain("react");
+  });
+
+  // -- Branch routing -------------------------------------------------------
+
+  it("branch context surfaces the routing reason when present", () => {
+    const spec = makeSpec({
+      step_a: {},
+    });
+
+    const compiled = compileStep(
+      spec,
+      "step_a",
+      makeCtx({ branchReason: "previous step output exceeded threshold" }),
+    );
+
+    expect(compiled.systemPromptSegment).toContain("## Branch Context");
+    expect(compiled.systemPromptSegment).toContain("previous step output exceeded threshold");
+  });
+
+  // -- Step identity --------------------------------------------------------
+
+  it("compiled step segment names the current step in a Current Step header", () => {
+    const spec = makeSpec({
+      my_step: {},
+    });
+
+    const compiled = compileStep(spec, "my_step", makeCtx());
+
+    expect(compiled.systemPromptSegment).toContain("## Current Step: my_step");
+  });
+});