diff --git a/package.json b/package.json index 5436efe..385734d 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ ], "scripts": { "test": "vitest run", + "bench": "vitest run --config packages/core/vitest.perf.config.ts", "lint": "biome check .", "lint:fix": "biome check --write .", "typecheck": "tsc --build tsconfig.build.json", diff --git a/packages/core/__perf__/README.md b/packages/core/__perf__/README.md new file mode 100644 index 0000000..07758d4 --- /dev/null +++ b/packages/core/__perf__/README.md @@ -0,0 +1,73 @@ +# `@logic-md/core` perf assertions + +Pre-merge regression assertions on the three core paths most likely to acquire +silent quadratic behaviour, per the analysis in #46. + +## Running + +From the repository root: + +```bash +npm run bench +``` + +This invokes vitest with [`vitest.perf.config.ts`](../vitest.perf.config.ts), +which picks up only `**/__perf__/**/*.perf.ts` files and runs them serially in +a single fork (for stable timings). Default `npm test` does not run the bench +suite — `*.perf.ts` is outside the default `**/*.test.ts` glob. + +## Coverage + +| File | Asserts | +|---|---| +| [`compiler.perf.ts`](compiler.perf.ts) | `compileWorkflow` on a 200-step linear chain | +| [`expression.perf.ts`](expression.perf.ts) | `evaluate` × 10,000 calls on the same template against varying contexts | +| [`dag.perf.ts`](dag.perf.ts) | `resolve` on a 1000-step linear chain | + +Linear chains are the worst-case input shape — depth equals node count, which +maximises the impact of any per-pop or per-level work in the DAG resolver and +maximises the per-step traversal cost in the compiler. + +## Calibration methodology + +Thresholds are calibrated against `main` per the methodology agreed in #46: + +1. Run the bench on `main` repeatedly across multiple developer-machine + sessions with varying background load. +2. Take the worst observed elapsed time per metric. +3. Multiply by **1.5** (Math.ceil) for slower-machine headroom. +4. 
Round up to a clean number for the assertion threshold. + +The +50% headroom is wider than the +25% suggested in the original #46 review, +based on observed variance on Windows developer machines (single-shot timings +can vary up to ~3× between quiet and loaded sessions). The bench is opt-in, not +default-CI, so this trade-off favours stable execution at the cost of slightly +weaker regression sensitivity. Once the algorithmic fixes in PRs 2-4 land, the +assertion margin will widen substantially (~100× for the compiler fix), which +provides a much sharper proof-of-fix signal than the initial calibration. + +Each `*.perf.ts` file documents its own calibration data in a header comment so +that recalibration after a change is auditable. If a fix legitimately reduces +the workload (e.g. PR 2 in the #46 sequence eliminating the per-step DAG +re-resolution), the threshold should NOT be tightened in the same PR — leave +the headroom widening as visible proof of the fix. + +## Adding a new bench + +1. Create `.perf.ts` next to existing files. +2. Use `describe` + `test` from `vitest`. +3. Always include a warm-up call before timed measurement (let v8 optimise the + hot path). +4. Run `node` directly with the same workload 5 times against `main`, capture + raw timings, document them in a header comment, and lock the worst × 1.5. + +## Why these three? + +These are the three concrete candidates surfaced in [#46](../../../../issues/46) — places where the implementation is correct at small scale but algorithmically quadratic+ at scale, currently invisible to all 325 unit tests. The bench suite is the regression net for the full sequence: + +- **PR 1 (this scaffold):** establish discipline; assertions pass on main. +- **PR 2:** compiler fix (compileStep accepting pre-computed dagResult). +- **PR 3:** expression cache (AST cache in `evaluate`). +- **PR 4:** DAG sort tightening (eliminate per-pop queue sort and level-filter loop). 
+ +After each fix, re-running `npm run bench` shows the assertion margin widening — which IS the proof. diff --git a/packages/core/__perf__/_helpers.ts b/packages/core/__perf__/_helpers.ts new file mode 100644 index 0000000..5731a44 --- /dev/null +++ b/packages/core/__perf__/_helpers.ts @@ -0,0 +1,67 @@ +// ============================================================================= +// Perf-test helpers — synthetic spec generators for scaling assertions +// ============================================================================= +// These are NOT part of the public API. They live under __perf__/ and are only +// used by the bench suite (`npm run bench`). +// ============================================================================= + +import type { LogicSpec, Step, WorkflowContext } from "../types.js"; + +/** + * Generate a `LogicSpec` with `n` steps in a strict linear chain + * (step_0 → step_1 → … → step_{n-1}). + * + * Linear chains are the worst case for several scaling concerns: + * - DAG resolve's level-grouping filter (D = N depths) + * - compileWorkflow's per-step DAG re-resolution (N×(V+E) traversal) + * - Token-budget warnings as the prompt segment grows. + */ +export function makeLinearChainSpec(n: number): LogicSpec { + if (n < 1) { + throw new Error(`makeLinearChainSpec requires n >= 1, got ${n}`); + } + const steps: Record = { + step_0: { + description: "first", + instructions: "first step in linear chain", + }, + }; + for (let i = 1; i < n; i++) { + steps[`step_${i}`] = { + description: `step ${i}`, + instructions: `step ${i} in linear chain`, + needs: [`step_${i - 1}`], + }; + } + return { + spec_version: "1.0", + name: "linear-chain-perf", + steps, + }; +} + +/** + * Just the `steps` map from `makeLinearChainSpec(n)`. + * Useful when calling `resolve(steps)` directly. 
+ */ +export function makeLinearChainSteps(n: number): Record { + const spec = makeLinearChainSpec(n); + return spec.steps as Record; +} + +/** + * Default `WorkflowContext` for compile-bench measurements. + */ +export function makeWorkflowContext(): WorkflowContext { + return { + currentStep: "step_0", + previousOutputs: {}, + input: {}, + attemptNumber: 1, + branchReason: null, + previousFailureReason: null, + totalSteps: 0, + completedSteps: [], + dagLevels: [], + }; +} diff --git a/packages/core/__perf__/compiler.perf.ts b/packages/core/__perf__/compiler.perf.ts new file mode 100644 index 0000000..294598c --- /dev/null +++ b/packages/core/__perf__/compiler.perf.ts @@ -0,0 +1,57 @@ +// ============================================================================= +// Perf assertion: compileWorkflow scaling +// ============================================================================= +// Pins the cost of compiling a 200-step linear-chain workflow against current +// `main`. Linear chains are the worst-case shape for `compileWorkflow` because +// every `compileStep` call re-resolves the full DAG (Candidate 1 in #46). +// +// Chain size of 200 (rather than 1000) keeps the bench under 2 seconds per +// run; once Candidate 1's fix lands the same workload should drop ~100×, and +// the assertion margin will widen dramatically — exactly the proof-of-fix +// signal Rain asked for in his sequencing comment. +// +// Threshold calibration methodology (per #46 review): +// 1. Run on `main` 5 times. +// 2. Take the worst observed elapsed time. +// 3. Multiply by 1.25 (Math.ceil) for slower-machine headroom. +// 4. Lock that value in as the assertion threshold. 
+// +// Calibration data captured 2026-05-07 on Node v22.18.0 across multiple +// developer-machine sessions with varying background load: +// quiet runs: 746ms, 778ms, 1318ms, 1326ms, 1398ms +// loaded runs: 2102ms, 2607ms, 2899ms +// worst observed = 2899ms → ceil(2899 × 1.5) = 4349ms → 4500ms (rounded) +// +// The +50% headroom (rather than the +25% in the original methodology) reflects +// observed variance on Windows developer machines under realistic background +// load. The bench is opt-in (`npm run bench`, NOT default `npm test`), so this +// trade-off favours stable execution at the cost of slightly weaker regression +// sensitivity. Once Candidate 1's fix lands, the assertion margin will widen +// from ~1.5× to ~100×, providing a much sharper proof-of-fix signal. +// ============================================================================= + +import { describe, expect, test } from "vitest"; +import { compileWorkflow } from "../index.js"; +import { makeLinearChainSpec, makeWorkflowContext } from "./_helpers.js"; + +/** + * Calibrated threshold for compileWorkflow on a 200-step linear chain. + * See header comment for methodology and raw data. + */ +const COMPILE_200_STEP_THRESHOLD_MS = 4500; + +describe("perf: compileWorkflow scaling", () => { + test(`compileWorkflow on 200-step linear chain completes <${COMPILE_200_STEP_THRESHOLD_MS}ms`, () => { + const spec = makeLinearChainSpec(200); + const ctx = makeWorkflowContext(); + + // Warm-up: let v8 optimise the hot path before measurement. 
+ compileWorkflow(spec, ctx); + + const t0 = performance.now(); + compileWorkflow(spec, ctx); + const elapsed = performance.now() - t0; + + expect(elapsed).toBeLessThan(COMPILE_200_STEP_THRESHOLD_MS); + }); +}); diff --git a/packages/core/__perf__/dag.perf.ts b/packages/core/__perf__/dag.perf.ts new file mode 100644 index 0000000..bb321a4 --- /dev/null +++ b/packages/core/__perf__/dag.perf.ts @@ -0,0 +1,43 @@ +// ============================================================================= +// Perf assertion: resolve() scaling on a 1000-step linear chain +// ============================================================================= +// Pins the cost of topological sort + level grouping on the worst-case DAG +// shape (linear chain, where depth = N). Catches regressions in the per-pop +// queue sort, neighbour sort, and level-filter loop in `dag.ts`. +// Threshold calibrated against current `main` (multiple runs, take worst, +50%). +// ============================================================================= + +import { describe, expect, test } from "vitest"; +import { resolve } from "../index.js"; +import { makeLinearChainSteps } from "./_helpers.js"; + +/** + * Calibrated threshold for resolve() on a 1000-step linear chain. + * + * Calibration methodology: multiple runs on `main` across developer-machine + * sessions with varying background load; take worst observed, multiply by 1.5 + * for headroom. + * + * Calibration data captured 2026-05-07 on Node v22.18.0: + * quiet runs: 117ms, 128ms, 143ms, 152ms, 215ms + * loaded runs: 419ms, 484ms + * worst observed = 484ms → ceil(484 × 1.5) = 726ms → 800ms (rounded) + */ +const RESOLVE_1000_STEP_THRESHOLD_MS = 800; + +describe("perf: dag.resolve scaling", () => { + test(`resolve(1000-step linear chain) completes <${RESOLVE_1000_STEP_THRESHOLD_MS}ms`, () => { + const steps = makeLinearChainSteps(1000); + + // Warm-up. 
+ const warm = resolve(steps); + expect(warm.ok).toBe(true); + + const t0 = performance.now(); + const r = resolve(steps); + const elapsed = performance.now() - t0; + + expect(r.ok).toBe(true); + expect(elapsed).toBeLessThan(RESOLVE_1000_STEP_THRESHOLD_MS); + }); +}); diff --git a/packages/core/__perf__/expression.perf.ts b/packages/core/__perf__/expression.perf.ts new file mode 100644 index 0000000..8e03057 --- /dev/null +++ b/packages/core/__perf__/expression.perf.ts @@ -0,0 +1,50 @@ +// ============================================================================= +// Perf assertion: evaluate() throughput on repeated expressions +// ============================================================================= +// Pins the cost of evaluating the same `{{ ... }}` expression 10,000 times +// against varying contexts. Catches regressions in tokenize/parse hot path +// (e.g. accidental disabling of an AST cache once one is added in PR 3). +// Threshold calibrated against current `main` (multiple runs, take worst, +50%). +// ============================================================================= + +import { describe, expect, test } from "vitest"; +import { evaluate } from "../index.js"; + +/** + * Calibrated threshold for 10,000 evaluate() calls on the same template. + * + * Calibration methodology: multiple runs on `main` across developer-machine + * sessions with varying background load; take worst observed, multiply by 1.5 + * for headroom. The +50% (rather than the original +25%) reflects observed + * variance on Windows developer machines. 
+ * + * Calibration data captured 2026-05-07 on Node v22.18.0: + * quiet runs: 135ms, 197ms, 234ms, 268ms, 382ms + * loaded runs: 617ms + * worst observed = 617ms → ceil(617 × 1.5) = 926ms → 1000ms (rounded) + */ +const EVAL_10K_THRESHOLD_MS = 1000; + +describe("perf: evaluate() throughput", () => { + test(`evaluate same expression 10,000 times <${EVAL_10K_THRESHOLD_MS}ms`, () => { + const tmpl = "{{ output.findings.length > 3 && output.confidence >= 0.6 }}"; + + // Warm-up: prime the parser path. + for (let i = 0; i < 100; i++) { + evaluate(tmpl, { output: { findings: [], confidence: 0 } }); + } + + const t0 = performance.now(); + for (let i = 0; i < 10_000; i++) { + evaluate(tmpl, { + output: { + findings: new Array(i % 5), + confidence: (i % 100) / 100, + }, + }); + } + const elapsed = performance.now() - t0; + + expect(elapsed).toBeLessThan(EVAL_10K_THRESHOLD_MS); + }); +}); diff --git a/packages/core/vitest.perf.config.ts b/packages/core/vitest.perf.config.ts new file mode 100644 index 0000000..96140a3 --- /dev/null +++ b/packages/core/vitest.perf.config.ts @@ -0,0 +1,21 @@ +// ============================================================================= +// Vitest config for the bench suite (`npm run bench`) +// ============================================================================= +// Picks up only `__perf__/**/*.perf.ts`, runs them serially in a single fork +// for stable timings, and bypasses the default `**/*.test.ts` glob so the +// bench suite never runs as part of `npm test`. +// ============================================================================= + +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + include: ["**/__perf__/**/*.perf.ts"], + // One fork, serialised, to minimise cross-test interference on timings. + // (vitest 4 moved pool sub-options to top level; `pool: "forks"` plus + // per-file warm-up is sufficient for stable timings here.) 
+ pool: "forks", + // 60s ceiling — well above any realistic threshold; only fires on hangs. + testTimeout: 60_000, + }, +});