diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..a264d3c
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,41 @@
+name: test
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: server
+    strategy:
+      matrix:
+        node-version: [20, 22]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Node ${{ matrix.node-version }}
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: npm
+          cache-dependency-path: server/package-lock.json
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Type check
+        run: npm run typecheck
+
+      - name: Build
+        run: npm run build
+
+      - name: Unit tests
+        run: npm run test:unit
+
+      - name: Flow tests
+        run: npm run test:flow
diff --git a/CITATION.cff b/CITATION.cff
index ff3db7e..0adcd7b 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -33,5 +33,5 @@ references:
     repository-code: 'https://github.com/joelparkerhenderson/decision-record/'
     abstract: >-
       The canonical concept, template, and teamwork model for decision
-      records — preserved in this fork at docs/upstream-canon.md and
-      templates/canonical.md.
+      records — preserved in this fork at docs/explanation/why-decision-records.md
+      and templates/canonical.md.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 31a3c63..0853e98 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -24,7 +24,7 @@ This repo is the planning system itself. We deliberately stop at the handoff —
 
 ## Attribution
 
-The conceptual core derives from Joel Parker Henderson's [canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). Preserve attribution to upstream in any rework of `docs/upstream-canon.md` or `templates/canonical.md`.
+The conceptual core derives from Joel Parker Henderson's [canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). Preserve attribution to upstream in any rework of `docs/explanation/why-decision-records.md` or `templates/canonical.md`.
 
 ## License
 
diff --git a/LICENSE b/LICENSE
index 30603ec..04d47e0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -22,8 +22,8 @@ SOFTWARE.
 
 ---
 
-The preserved canonical material in `docs/upstream-canon.md` and the
-canonical decision record template at `templates/canonical.md` derive from
+The preserved canonical material in `docs/explanation/why-decision-records.md`
+and the canonical decision record template at `templates/canonical.md` derive from
 the upstream work of Joel Parker Henderson:
 <https://github.com/joelparkerhenderson/decision-record>. That material
 should be attributed to its original author; see CITATION.cff.
diff --git a/README.md b/README.md
index 8a8a886..4a14326 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 This repository is a Claude Code plugin + bundled MCP server. It runs inside a fresh or template repo, partners with a human and an AI agent, and produces an executable MVP plan: a scoped manifest, a set of accepted decision records, and a dependency-aware task graph. Output goes to Linear (primary) or stays as filesystem artifacts (fallback).
 
-This project is a derivative of [Joel Parker Henderson's canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). The canonical explanation of what a DR is and why it matters is preserved at [`docs/upstream-canon.md`](docs/upstream-canon.md). What this fork adds is **enforcement**: workflows, tools, and a state machine that make DRs a non-skippable part of planning with an agentic system.
+This project is a derivative of [Joel Parker Henderson's canonical decision-record repo](https://github.com/joelparkerhenderson/decision-record). The canonical explanation of what a DR is and why it matters is preserved at [`docs/explanation/why-decision-records.md`](docs/explanation/why-decision-records.md). What this fork adds is **enforcement**: workflows, tools, and a state machine that make DRs a non-skippable part of planning with an agentic system.
 
 ## What you get
 
@@ -17,7 +17,16 @@ This project is a derivative of [Joel Parker Henderson's canonical decision-reco
 
 ## Status
 
-Active development — first usable cut is in. The pipeline is functional end-to-end (intake → scope → decisions → tasks → handoff to filesystem or Linear). See [`docs/quickstart.md`](docs/quickstart.md) for the five-minute walkthrough, [`docs/usage.md`](docs/usage.md) for the full interaction model, and [`docs/architecture.md`](docs/architecture.md) for the data model.
+Active development — first usable cut is in. The pipeline is functional end-to-end (intake → scope → decisions → tasks → handoff to filesystem or Linear). A standalone CLI (`decision-record`) ships alongside the Claude Code plugin and MCP server.
+
+## Documentation
+
+Docs follow the [Diátaxis](https://diataxis.fr) framework — start at [`docs/README.md`](docs/README.md) to orient.
+
+- **Brand new?** → [`docs/tutorials/your-first-plan.md`](docs/tutorials/your-first-plan.md) is a 15-minute end-to-end walkthrough.
+- **How do I do X?** → [`docs/how-to/`](docs/how-to/) (install, run the CLI, configure providers, hand off to Linear, calibrate gates).
+- **What's the exact spec?** → [`docs/reference/`](docs/reference/) (CLI flags, MCP tools, data model, gates).
+- **Why is it built this way?** → [`docs/explanation/`](docs/explanation/) (design rationale, the five phases, why decision records).
 
 ## How it's structured
 
@@ -58,18 +67,26 @@ npm install
 npm run build
 ```
 
-Then either link as a Claude Code plugin (symlink the repo into `~/.claude/plugins/decision-record/`) or run the MCP server standalone via `node /path/to/decision-record/server/dist/index.js`. Full instructions: [`docs/quickstart.md`](docs/quickstart.md).
+Then either:
+- Use the **standalone CLI**: `export OPENAI_API_KEY=… && node dist/cli.js --idea "your idea here"`
+- Use the **Claude Code plugin**: symlink the repo into `~/.claude/plugins/decision-record/` and run `/plan` inside Claude Code.
+
+Full install instructions: [`docs/how-to/install.md`](docs/how-to/install.md). First-run walkthrough: [`docs/tutorials/your-first-plan.md`](docs/tutorials/your-first-plan.md).
 
 (A published marketplace release is on the roadmap.)
 
+## Benchmarks
+
+We use a canonical prompt — an AI-driven roguelike POC — to spot regressions as the system evolves. See [`benchmarks/`](benchmarks/) for the prompt, expected output shape, and a `run.sh` to re-run it.
+
 ## Contributing
 
 See [CONTRIBUTING.md](CONTRIBUTING.md). Issues and pull requests welcome.
 
 ## Acknowledgments
 
-The conceptual core — what a decision record is, the canonical template structure, the teamwork model around DRs — is the work of [Joel Parker Henderson](https://joelparkerhenderson.com). See [`docs/upstream-canon.md`](docs/upstream-canon.md) for the preserved canonical material, and [CITATION.cff](CITATION.cff) for citation metadata.
+The conceptual core — what a decision record is, the canonical template structure, the teamwork model around DRs — is the work of [Joel Parker Henderson](https://joelparkerhenderson.com). See [`docs/explanation/why-decision-records.md`](docs/explanation/why-decision-records.md) for the preserved canonical material, and [CITATION.cff](CITATION.cff) for citation metadata.
 
 ## License
 
-[MIT](LICENSE) — for the code, schemas, and tooling in this repository. The preserved canonical content in `docs/upstream-canon.md` and the canonical template at `templates/canonical.md` derive from upstream and should be attributed to Joel Parker Henderson per CITATION.cff.
+[MIT](LICENSE) — for the code, schemas, and tooling in this repository. The preserved canonical content in `docs/explanation/why-decision-records.md` and the canonical template at `templates/canonical.md` derive from upstream and should be attributed to Joel Parker Henderson per CITATION.cff.
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..a5416ca
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,32 @@
+# Benchmarks
+
+Canonical prompts we run against the decision-record planning pipeline to catch regressions as the system evolves.
+
+| Benchmark | Prompt | Effort | Purpose |
+|---|---|---|---|
+| [roguelike-ai-poc](roguelike-ai-poc/) | AI-driven roguelike where the agent plays the game | `poc` | Exercises all five pipeline phases on a small, well-bounded problem. The original dogfood case. |
+
+## How to run a benchmark
+
+```bash
+cd benchmarks/<name>
+./run.sh
+```
+
+Each benchmark has:
+
+- `prompt.md` — the exact idea, effort level, and what "good output" looks like
+- `reference/` — a baseline artifact snapshot from a canonical run
+- `run.sh` — one-shot runner that fires the CLI against a fresh tmp dir
+
+## What we look for when comparing runs
+
+Each benchmark's `prompt.md` defines its own success criteria. Generally:
+
+- Pipeline reaches `handed-off`
+- Decision count and shape match expectations for the effort tier
+- Tasks are vertical slices, every leaf has a decision ref, graph validates
+- Render artifacts are emitted (Markdown + HTML)
+- Event log is coherent
+
+These benchmarks are **not unit tests** — they're regression observability. Different runs will produce slightly different plans and that's by design. Treat the reference as "shape we expect," not "bytes we require."
diff --git a/benchmarks/roguelike-ai-poc/prompt.md b/benchmarks/roguelike-ai-poc/prompt.md
new file mode 100644
index 0000000..745bdb9
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/prompt.md
@@ -0,0 +1,63 @@
+# Benchmark: roguelike-ai-poc
+
+This is the canonical benchmark for the decision-record planning pipeline. We re-run it as the system evolves to spot regressions in plan quality, gate behavior, agent prompts, and rendering.
+
+## The prompt
+
+**Idea (free-form):**
+
+> A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.
+
+**Effort level:** `poc`
+
+## Invocation
+
+```bash
+decision-record \
+  --title "AI-driven roguelike POC" \
+  --description "$(cat <<'EOF'
+A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.
+EOF
+)" \
+  --effort poc \
+  --cwd ./tmp-roguelike-bench \
+  --yes
+```
+
+Or use the one-shot wrapper, `./run.sh`, which creates a fresh tmp dir, runs the CLI, and prints where the artifacts landed.
+
+## What "good output" looks like
+
+A run is healthy if all of the following hold:
+
+- **Pipeline reaches `handed-off`** — every gate passes, sign-offs recorded, project finalized.
+- **3-5 significant decisions** are proposed and accepted — language, world representation, agent action contract, tick-loop control. (Not 1; not 12.)
+- **5-8 vertical-slice tasks** — bootstrap → world → renderer → agent client → action handlers → game loop → CLI entry. Every leaf ≤ 16h (poc cap). Every task references at least one accepted DR.
+- **The seed library is consulted** for at least the language decision (`dr_seed_search` + `dr_seed_load` on `language-choice`).
+- **Graph validates clean** — no cycles, no orphan deps, no missing decision refs.
+- **Artifacts emitted** — `dr/project.json`, `dr/decisions/*.json`, `dr/tasks/*.json`, rendered `.md` siblings, `dr/index.html`. `.dr/events.jsonl` contains a coherent audit trail.
+
+## Reference snapshot
+
+`./reference/` holds the artifacts from the canonical run produced by hand-driving the MCP tools (2026-05-16, the dogfood test that originally produced this benchmark). Treat it as a "this is what good looks like" baseline, not a strict equality target — different agent runs will pick slightly different positions, phrasing, and task decomposition, and that's fine.
+
+When comparing a new run against `./reference/`:
+
+- **Same final phase, gate decisions, event mix** → no regression.
+- **More/fewer decisions or tasks** → check whether the change in density is justified by the plan or whether the agent over- or under-decomposed.
+- **Different selected positions** → fine if defensible; concerning if the argument is weaker.
+- **Missing seed usage** → bug or prompt drift; the agent should reach for `language-choice` here.
+- **Tasks without decision refs** → regression. Every task must link to a DR.
+- **Validation failures** → regression. The graph must validate.
+
+## What this benchmark exercises
+
+| Surface | Coverage |
+|---|---|
+| Phase machine | All five transitions: intake → scoping → deciding → decomposing → handing-off → handed-off |
+| Seed library | At least one `dr_seed_load` (language-choice) |
+| Decision lifecycle | propose → update with position + argument → accept (no review under poc preset) |
+| Task graph | Multi-node dependency chain with decision_refs |
+| Gates | `min_tasks=3`, `max_task_estimate_hours=16`, `require_human_signoff_phases=['handing-off']` |
+| Render | Markdown per record + static HTML index |
+| Handoff | Filesystem path (the Linear path is exercised by a separate live test) |
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json
new file mode 100644
index 0000000..f07d744
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.json
@@ -0,0 +1,115 @@
+{
+  "id": "0001-choose-the-implementation-language",
+  "number": 1,
+  "slug": "choose-the-implementation-language",
+  "title": "Choose the implementation language",
+  "status": "accepted",
+  "template_variant": "architecture",
+  "created_at": "2026-05-17T04:13:38.681Z",
+  "updated_at": "2026-05-17T04:13:38.685Z",
+  "summary": "Decide the primary implementation language for the project.",
+  "issue": "Every other foundational decision (runtime, package manager, framework choices, testing tools) flows from the language choice. Picking this early and explicitly avoids drift.",
+  "assumptions": [
+    "Team has existing language strengths to lean on.",
+    "Project lifespan is long enough that hiring and onboarding matter.",
+    "Ecosystem maturity matters for the project's domain."
+  ],
+  "constraints": [
+    "Team's current expertise.",
+    "Target runtime environments (browser, server, native, embedded).",
+    "Performance and memory budgets.",
+    "Licensing or compliance restrictions on language ecosystems."
+  ],
+  "positions": [
+    {
+      "title": "TypeScript",
+      "description": "Strongly typed JavaScript. Best for full-stack web work, ubiquitous tooling.",
+      "pros": [
+        "Ubiquitous in web",
+        "Strong types catch errors early",
+        "Massive ecosystem",
+        "Frontend/backend code sharing"
+      ],
+      "cons": [
+        "Build step overhead",
+        "Type system can be over-engineered",
+        "Slower than native languages for hot paths"
+      ],
+      "links": []
+    },
+    {
+      "title": "Python",
+      "description": "Dynamic, batteries-included. Best for data work, scripting, ML, fast prototypes.",
+      "pros": [
+        "Excellent ML/data ecosystem",
+        "Fast to write",
+        "Readable",
+        "Huge stdlib"
+      ],
+      "cons": [
+        "Slow runtime without C extensions",
+        "GIL limits concurrency",
+        "Dynamic typing → runtime errors"
+      ],
+      "links": []
+    },
+    {
+      "title": "Go",
+      "description": "Statically typed, compiled, built for concurrent services.",
+      "pros": [
+        "Simple language",
+        "Single binary deployment",
+        "Strong concurrency primitives",
+        "Fast compile times"
+      ],
+      "cons": [
+        "Generics still maturing",
+        "Verbose error handling",
+        "Less rich third-party ecosystem than JS/Python"
+      ],
+      "links": []
+    },
+    {
+      "title": "Rust",
+      "description": "Memory-safe systems language. Best for performance-critical or systems work.",
+      "pros": [
+        "No GC, predictable performance",
+        "Memory safety",
+        "Excellent tooling (cargo)",
+        "Strong types"
+      ],
+      "cons": [
+        "Steep learning curve",
+        "Slower to ship initial features",
+        "Compile times can be long"
+      ],
+      "links": []
+    }
+  ],
+  "opinions": [],
+  "argument": "Python is fastest to write for a single-script game-loop POC. The OpenAI SDK + a tiny terminal renderer fit naturally; no build step or transpile loop slows iteration. Team is comfortable with Python and the project never needs to leave a single repo.",
+  "selected_position": "Python",
+  "implications": [
+    "Use the official openai Python SDK for agent calls.",
+    "Single-file or small-module layout; no package manager beyond pip/uv.",
+    "Pin to Python 3.11+ for ergonomic match-statement parsing of agent actions."
+  ],
+  "depends_on": [],
+  "related_decisions": [],
+  "related_artifacts": [],
+  "review": [],
+  "sign_off": {
+    "by": "human",
+    "actor": "kj",
+    "at": "2026-05-17T04:13:38.685Z",
+    "notes": "poc preset, no review required"
+  },
+  "seed_origin": "language-choice",
+  "tags": [
+    "foundation",
+    "poc",
+    "foundation",
+    "architecture",
+    "stack"
+  ]
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md
new file mode 100644
index 0000000..8a3a4b3
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0001-choose-the-implementation-language.md
@@ -0,0 +1,120 @@
+# 0001-choose-the-implementation-language — Choose the implementation language
+
+| Field | Value |
+| --- | --- |
+| Status | `accepted` |
+| Template | `architecture` |
+| Updated | 2026-05-17T04:13:38.685Z |
+| Selected | **Python** |
+| Depends on | _(none)_ |
+
+## Summary
+
+Decide the primary implementation language for the project.
+
+## Issue
+
+Every other foundational decision (runtime, package manager, framework choices, testing tools) flows from the language choice. Picking this early and explicitly avoids drift.
+
+## Assumptions
+
+- Team has existing language strengths to lean on.
+- Project lifespan is long enough that hiring and onboarding matter.
+- Ecosystem maturity matters for the project's domain.
+
+## Constraints
+
+- Team's current expertise.
+- Target runtime environments (browser, server, native, embedded).
+- Performance and memory budgets.
+- Licensing or compliance restrictions on language ecosystems.
+
+## Positions
+
+### TypeScript
+
+Strongly typed JavaScript. Best for full-stack web work, ubiquitous tooling.
+
+**Pros**
+
+- Ubiquitous in web
+- Strong types catch errors early
+- Massive ecosystem
+- Frontend/backend code sharing
+
+**Cons**
+
+- Build step overhead
+- Type system can be over-engineered
+- Slower than native languages for hot paths
+
+### Python ✅
+
+Dynamic, batteries-included. Best for data work, scripting, ML, fast prototypes.
+
+**Pros**
+
+- Excellent ML/data ecosystem
+- Fast to write
+- Readable
+- Huge stdlib
+
+**Cons**
+
+- Slow runtime without C extensions
+- GIL limits concurrency
+- Dynamic typing → runtime errors
+
+### Go
+
+Statically typed, compiled, built for concurrent services.
+
+**Pros**
+
+- Simple language
+- Single binary deployment
+- Strong concurrency primitives
+- Fast compile times
+
+**Cons**
+
+- Generics still maturing
+- Verbose error handling
+- Less rich third-party ecosystem than JS/Python
+
+### Rust
+
+Memory-safe systems language. Best for performance-critical or systems work.
+
+**Pros**
+
+- No GC, predictable performance
+- Memory safety
+- Excellent tooling (cargo)
+- Strong types
+
+**Cons**
+
+- Steep learning curve
+- Slower to ship initial features
+- Compile times can be long
+
+## Argument
+
+Python is fastest to write for a single-script game-loop POC. The OpenAI SDK + a tiny terminal renderer fit naturally; no build step or transpile loop slows iteration. Team is comfortable with Python and the project never needs to leave a single repo.
+
+## Implications
+
+- Use the official openai Python SDK for agent calls.
+- Single-file or small-module layout; no package manager beyond pip/uv.
+- Pin to Python 3.11+ for ergonomic match-statement parsing of agent actions.
+
+## Sign-off
+
+- **By:** kj (human)
+- **At:** 2026-05-17T04:13:38.685Z
+- **Notes:** poc preset, no review required
+
+---
+
+_Instantiated from seed: `language-choice`_
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json
new file mode 100644
index 0000000..7afe41a
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.json
@@ -0,0 +1,85 @@
+{
+  "id": "0002-define-the-world-representation-and-renderer",
+  "number": 2,
+  "slug": "define-the-world-representation-and-renderer",
+  "title": "Define the world representation and renderer",
+  "status": "accepted",
+  "template_variant": "data-model",
+  "created_at": "2026-05-17T04:13:38.686Z",
+  "updated_at": "2026-05-17T04:13:38.688Z",
+  "summary": "How the room is stored in memory and rendered to the terminal each tick.",
+  "issue": "The world is small (one 10×10 room) but the representation must support: easy frame rendering, fast collision/hazard checks, and a stable serialization that the agent can read on each tick. Pick a model now so the action handlers and renderer can converge.",
+  "assumptions": [
+    "10×10 fixed grid",
+    "Single player entity",
+    "Static tiles set at startup",
+    "Frame fits in a single terminal redraw"
+  ],
+  "constraints": [
+    "Frame must be readable both by humans and the LLM",
+    "No external graphics libraries"
+  ],
+  "positions": [
+    {
+      "title": "Nested list of chars",
+      "description": "world: list[list[str]] indexed by [y][x]. Player position stored separately.",
+      "pros": [
+        "Simplest possible",
+        "Trivial to mutate",
+        "Renders by row-join"
+      ],
+      "cons": [
+        "No type safety on tile semantics",
+        "Have to scan grid for entity positions"
+      ],
+      "links": []
+    },
+    {
+      "title": "Tile-grid + entity dict",
+      "description": "static_tiles: list[list[str]] for walls/floor/hazard/exit; entities: dict[id, {pos, hp, glyph}] overlaid at render time.",
+      "pros": [
+        "Separates static map from dynamic state",
+        "Easy to add entities later if needed",
+        "Clean serialization to JSON"
+      ],
+      "cons": [
+        "Two structures to keep consistent",
+        "Slightly more code"
+      ],
+      "links": []
+    },
+    {
+      "title": "Single 2D numpy array + glyph table",
+      "description": "Each cell is an int; render by mapping ints to glyphs.",
+      "pros": [
+        "Compact",
+        "Fast",
+        "Numpy is familiar"
+      ],
+      "cons": [
+        "Numpy is overkill for 10×10",
+        "Adds a dep we do not otherwise need",
+        "Less Pythonic for tiny data"
+      ],
+      "links": []
+    }
+  ],
+  "opinions": [],
+  "argument": "Static map + entity overlay is the simplest model that survives the day-2 question can we add a second entity? without a rewrite. It serializes naturally to JSON for the LLM payload and keeps render code in one row-join.",
+  "selected_position": "Tile-grid + entity dict",
+  "implications": [
+    "Tile glyphs: # wall, . floor, X hazard, > exit; entities overlay (@ for player).",
+    "Each tick the renderer composes static_tiles + entity glyphs at their positions.",
+    "JSON state sent to the agent: { frame: [<row strings>], hp, tick, exit_pos, player_pos }."
+  ],
+  "depends_on": [],
+  "related_decisions": [],
+  "related_artifacts": [],
+  "review": [],
+  "sign_off": {
+    "by": "human",
+    "actor": "kj",
+    "at": "2026-05-17T04:13:38.688Z"
+  },
+  "tags": []
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md
new file mode 100644
index 0000000..dfbf675
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0002-define-the-world-representation-and-renderer.md
@@ -0,0 +1,92 @@
+# 0002-define-the-world-representation-and-renderer — Define the world representation and renderer
+
+| Field | Value |
+| --- | --- |
+| Status | `accepted` |
+| Template | `data-model` |
+| Updated | 2026-05-17T04:13:38.688Z |
+| Selected | **Tile-grid + entity dict** |
+| Depends on | _(none)_ |
+
+## Summary
+
+How the room is stored in memory and rendered to the terminal each tick.
+
+## Issue
+
+The world is small (one 10×10 room) but the representation must support: easy frame rendering, fast collision/hazard checks, and a stable serialization that the agent can read on each tick. Pick a model now so the action handlers and renderer can converge.
+
+## Assumptions
+
+- 10×10 fixed grid
+- Single player entity
+- Static tiles set at startup
+- Frame fits in a single terminal redraw
+
+## Constraints
+
+- Frame must be readable both by humans and the LLM
+- No external graphics libraries
+
+## Positions
+
+### Nested list of chars
+
+world: list[list[str]] indexed by [y][x]. Player position stored separately.
+
+**Pros**
+
+- Simplest possible
+- Trivial to mutate
+- Renders by row-join
+
+**Cons**
+
+- No type safety on tile semantics
+- Have to scan grid for entity positions
+
+### Tile-grid + entity dict ✅
+
+static_tiles: list[list[str]] for walls/floor/hazard/exit; entities: dict[id, {pos, hp, glyph}] overlaid at render time.
+
+**Pros**
+
+- Separates static map from dynamic state
+- Easy to add entities later if needed
+- Clean serialization to JSON
+
+**Cons**
+
+- Two structures to keep consistent
+- Slightly more code
+
+### Single 2D numpy array + glyph table
+
+Each cell is an int; render by mapping ints to glyphs.
+
+**Pros**
+
+- Compact
+- Fast
+- Numpy is familiar
+
+**Cons**
+
+- Numpy is overkill for 10×10
+- Adds a dep we do not otherwise need
+- Less Pythonic for tiny data
+
+## Argument
+
+Static map + entity overlay is the simplest model that survives the day-2 question can we add a second entity? without a rewrite. It serializes naturally to JSON for the LLM payload and keeps render code in one row-join.
+
+## Implications
+
+- Tile glyphs: # wall, . floor, X hazard, > exit; entities overlay (@ for player).
+- Each tick the renderer composes static_tiles + entity glyphs at their positions.
+- JSON state sent to the agent: { frame: [<row strings>], hp, tick, exit_pos, player_pos }.
+
+## Sign-off
+
+- **By:** kj (human)
+- **At:** 2026-05-17T04:13:38.688Z
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json
new file mode 100644
index 0000000..0e98040
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.json
@@ -0,0 +1,83 @@
+{
+  "id": "0003-define-the-agent-action-contract",
+  "number": 3,
+  "slug": "define-the-agent-action-contract",
+  "title": "Define the agent action contract",
+  "status": "accepted",
+  "template_variant": "architecture",
+  "created_at": "2026-05-17T04:13:38.689Z",
+  "updated_at": "2026-05-17T04:13:38.690Z",
+  "summary": "How the LLM receives the world state per tick and how it returns the chosen action.",
+  "issue": "The agent must produce a structured, validated action every tick. We need the protocol pinned so the game loop never has to guess what the agent meant.",
+  "assumptions": [
+    "OpenAI-compatible API is the LLM transport",
+    "Strategy prompt is supplied once at startup",
+    "Per-tick latency budget ~2-5s is acceptable"
+  ],
+  "constraints": [
+    "Action set is small (move N/S/E/W + noop)",
+    "Agent must not stall the game with malformed output",
+    "Must be debuggable from logs"
+  ],
+  "positions": [
+    {
+      "title": "Plain-text response parsing",
+      "description": "Agent returns N/S/E/W/noop as plain text; we parse first token.",
+      "pros": [
+        "Lowest token cost",
+        "Works with any model"
+      ],
+      "cons": [
+        "Brittle to extra punctuation/prose",
+        "No reasoning surface",
+        "Hard to audit why"
+      ],
+      "links": []
+    },
+    {
+      "title": "Tool-call (function calling) with one tool: do_action(direction)",
+      "description": "Define a single OpenAI tool; agent invokes it once per tick with a strict enum direction.",
+      "pros": [
+        "Schema-validated",
+        "Free reasoning text alongside the call",
+        "Easy to extend with new actions later"
+      ],
+      "cons": [
+        "Slightly more tokens per call",
+        "Requires a model that supports function calling"
+      ],
+      "links": []
+    },
+    {
+      "title": "JSON-only response with output_config",
+      "description": "Force agent to emit {\"action\":\"N\",\"reason\":\"…\"} via structured outputs.",
+      "pros": [
+        "Schema-validated",
+        "Reasoning captured in same payload"
+      ],
+      "cons": [
+        "Some providers do not honor strict mode",
+        "Slightly more setup than tool-call"
+      ],
+      "links": []
+    }
+  ],
+  "opinions": [],
+  "argument": "Tool-calling is the cleanest contract: the model gets free-form reasoning in `content` AND a strict-enum action in `tool_calls`. We can log both, and extending to new actions later is just adding enum values. Plain-text parsing trades 100 tokens of savings for a constant brittleness tax.",
+  "selected_position": "Tool-call (function calling) with one tool: do_action(direction)",
+  "implications": [
+    "Define tool `do_action` with input_schema requiring `direction` in {N,S,E,W,noop}.",
+    "Use tool_choice=\"required\" each tick to force a call.",
+    "Log the assistant message text (the reasoning) alongside the chosen direction for replay/debug."
+  ],
+  "depends_on": [],
+  "related_decisions": [],
+  "related_artifacts": [],
+  "review": [],
+  "sign_off": {
+    "by": "human",
+    "actor": "kj",
+    "at": "2026-05-17T04:13:38.690Z"
+  },
+  "tags": []
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md
new file mode 100644
index 0000000..1bd6e3a
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0003-define-the-agent-action-contract.md
@@ -0,0 +1,90 @@
+# 0003-define-the-agent-action-contract — Define the agent action contract
+
+| Field | Value |
+| --- | --- |
+| Status | `accepted` |
+| Template | `architecture` |
+| Updated | 2026-05-17T04:13:38.690Z |
+| Selected | **Tool-call (function calling) with one tool: do_action(direction)** |
+| Depends on | _(none)_ |
+
+## Summary
+
+How the LLM receives the world state per tick and how it returns the chosen action.
+
+## Issue
+
+The agent must produce a structured, validated action every tick. We need the protocol pinned so the game loop never has to guess what the agent meant.
+
+## Assumptions
+
+- OpenAI-compatible API is the LLM transport
+- Strategy prompt is supplied once at startup
+- Per-tick latency budget ~2-5s is acceptable
+
+## Constraints
+
+- Action set is small (move N/S/E/W + noop)
+- Agent must not stall the game with malformed output
+- Must be debuggable from logs
+
+## Positions
+
+### Plain-text response parsing
+
+Agent returns N/S/E/W/noop as plain text; we parse first token.
+
+**Pros**
+
+- Lowest token cost
+- Works with any model
+
+**Cons**
+
+- Brittle to extra punctuation/prose
+- No reasoning surface
+- Hard to audit why an action was chosen
+
+### Tool-call (function calling) with one tool: do_action(direction) ✅
+
+Define a single OpenAI tool; agent invokes it once per tick with a strict enum direction.
+
+**Pros**
+
+- Schema-validated
+- Free reasoning text alongside the call
+- Easy to extend with new actions later
+
+**Cons**
+
+- Slightly more tokens per call
+- Requires a model that supports function calling
+
+### JSON-only response with output_config
+
+Force agent to emit {"action":"N","reason":"…"} via structured outputs.
+
+**Pros**
+
+- Schema-validated
+- Reasoning captured in same payload
+
+**Cons**
+
+- Some providers do not honor strict mode
+- Slightly more setup than tool-call
+
+## Argument
+
+Tool-calling is the cleanest contract: the model gets free-form reasoning in `content` AND a strict-enum action in `tool_calls`. We can log both, and extending to new actions later is just adding enum values. Plain-text parsing saves 100 tokens at the cost of a constant brittleness tax.
+
+## Implications
+
+- Define tool `do_action` with input_schema requiring `direction` in {N,S,E,W,noop}.
+- Use tool_choice="required" each tick to force a call.
+- Log the assistant message text (the reasoning) alongside the chosen direction for replay/debug.
+
+## Sign-off
+
+- **By:** kj (human)
+- **At:** 2026-05-17T04:13:38.690Z
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json
new file mode 100644
index 0000000..4f6becd
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.json
@@ -0,0 +1,68 @@
+{
+  "id": "0004-define-the-tick-loop-and-termination-conditions",
+  "number": 4,
+  "slug": "define-the-tick-loop-and-termination-conditions",
+  "title": "Define the tick loop and termination conditions",
+  "status": "accepted",
+  "template_variant": "architecture",
+  "created_at": "2026-05-17T04:13:38.691Z",
+  "updated_at": "2026-05-17T04:13:38.692Z",
+  "summary": "How the game advances tick by tick, when it stops, and how the user observes it.",
+  "issue": "With an LLM in the loop, each tick is slow (~2-5s). We need a predictable loop with hard stops so the POC always terminates and is always watchable.",
+  "assumptions": [
+    "One-player synchronous game",
+    "User runs the script in a terminal and watches frames",
+    "LLM calls happen on the same thread"
+  ],
+  "constraints": [
+    "Must terminate on win, death, or step limit",
+    "Frame must visibly update each tick",
+    "Must not deadlock on a stuck agent"
+  ],
+  "positions": [
+    {
+      "title": "Synchronous loop with step cap",
+      "description": "while not terminal: render → ask agent → apply → check win/death. Hard cap at N steps (e.g., 50).",
+      "pros": [
+        "Simplest mental model",
+        "Easy to log",
+        "Predictable termination"
+      ],
+      "cons": [
+        "UI freezes during LLM call (acceptable for POC)"
+      ],
+      "links": []
+    },
+    {
+      "title": "Async loop with timeout per tick",
+      "description": "Wrap each agent call in a 10s timeout; on timeout, treat as noop.",
+      "pros": [
+        "Robust to slow API",
+        "Game keeps moving"
+      ],
+      "cons": [
+        "More complex",
+        "Asyncio inside a CLI script is heavier than warranted"
+      ],
+      "links": []
+    }
+  ],
+  "opinions": [],
+  "argument": "For a single-window terminal demo, synchronous is fine. Adding asyncio doubles the code size for no demo-visible benefit. The step cap protects against an agent that wanders forever and ensures every run terminates.",
+  "selected_position": "Synchronous loop with step cap",
+  "implications": [
+    "Step cap = 50; on cap, exit with status \"timeout\" and final HP.",
+    "Use time.sleep(0.05) after each render so the user can see the frames advance.",
+    "Loop logs each tick to stdout: frame, action, reasoning, hp, tick#."
+  ],
+  "depends_on": [],
+  "related_decisions": [],
+  "related_artifacts": [],
+  "review": [],
+  "sign_off": {
+    "by": "human",
+    "actor": "kj",
+    "at": "2026-05-17T04:13:38.692Z"
+  },
+  "tags": []
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md
new file mode 100644
index 0000000..0d83a25
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/decisions/0004-define-the-tick-loop-and-termination-conditions.md
@@ -0,0 +1,74 @@
+# 0004-define-the-tick-loop-and-termination-conditions — Define the tick loop and termination conditions
+
+| Field | Value |
+| --- | --- |
+| Status | `accepted` |
+| Template | `architecture` |
+| Updated | 2026-05-17T04:13:38.692Z |
+| Selected | **Synchronous loop with step cap** |
+| Depends on | _(none)_ |
+
+## Summary
+
+How the game advances tick by tick, when it stops, and how the user observes it.
+
+## Issue
+
+With an LLM in the loop, each tick is slow (~2-5s). We need a predictable loop with hard stops so the POC always terminates and is always watchable.
+
+## Assumptions
+
+- One-player synchronous game
+- User runs the script in a terminal and watches frames
+- LLM calls happen on the same thread
+
+## Constraints
+
+- Must terminate on win, death, or step limit
+- Frame must visibly update each tick
+- Must not deadlock on a stuck agent
+
+## Positions
+
+### Synchronous loop with step cap ✅
+
+while not terminal: render → ask agent → apply → check win/death. Hard cap at N steps (e.g., 50).
+
+**Pros**
+
+- Simplest mental model
+- Easy to log
+- Predictable termination
+
+**Cons**
+
+- UI freezes during LLM call (acceptable for POC)
+
+### Async loop with timeout per tick
+
+Wrap each agent call in a 10s timeout; on timeout, treat as noop.
+
+**Pros**
+
+- Robust to slow API
+- Game keeps moving
+
+**Cons**
+
+- More complex
+- Asyncio inside a CLI script is heavier than warranted
+
+## Argument
+
+For a single-window terminal demo, synchronous is fine. Adding asyncio doubles the code size for no demo-visible benefit. The step cap protects against an agent that wanders forever and ensures every run terminates.
+
+## Implications
+
+- Step cap = 50; on cap, exit with status "timeout" and final HP.
+- Use time.sleep(0.05) after each render so the user can see the frames advance.
+- Loop logs each tick to stdout: frame, action, reasoning, hp, tick#.
+
+## Sign-off
+
+- **By:** kj (human)
+- **At:** 2026-05-17T04:13:38.692Z
diff --git a/benchmarks/roguelike-ai-poc/reference/events.jsonl b/benchmarks/roguelike-ai-poc/reference/events.jsonl
new file mode 100644
index 0000000..42ab62f
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/events.jsonl
@@ -0,0 +1,33 @@
+{"at":"2026-05-17T04:12:02.030Z","actor":"agent","kind":"project_initialized","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"effort_level":"poc"}}
+{"at":"2026-05-17T04:12:40.988Z","actor":"agent","kind":"phase_advanced","entity_kind":"phase","entity_id":"scoping","payload":{"from":"intake","to":"scoping"}}
+{"at":"2026-05-17T04:12:40.991Z","actor":"agent","kind":"scope_updated","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"scope":{"in_scope":["A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X)","Tick-based game loop: each tick prints the frame, then queries the agent for one action","A small action vocabulary: move N/S/E/W and noop","Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death","Strategy prompt provided once at startup, fed to the agent as system prompt for every tick","LLM agent receives current frame + HP + tick number, returns a single action"],"out_of_scope":["Multiple rooms, dungeon generation, procedural levels","Combat with enemies, NPCs, monsters","Inventory, items, equipment","Save/load, persistence","Visual UI beyond ASCII to terminal","Multiplayer, networking","Self-improving agent loops or RL training"],"success_criteria":["A user can run a single command, supply a strategy prompt, and watch the agent play until win or death","Win and death paths both observed in manual playtests","Different strategy prompts produce visibly different agent behavior","End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call"],"nice_to_have":["Configurable room layout from a text file","Replay log written to disk for post-hoc inspection","A few preset strategy prompts to demo (cautious, greedy, exploratory)"]}}}
+{"at":"2026-05-17T04:12:40.991Z","actor":"agent","kind":"phase_advanced","entity_kind":"phase","entity_id":"deciding","payload":{"from":"scoping","to":"deciding"}}
+{"at":"2026-05-17T04:13:38.681Z","actor":"agent","kind":"seed_loaded","entity_kind":"decision","entity_id":"0001-choose-the-implementation-language","payload":{"seed_name":"language-choice"}}
+{"at":"2026-05-17T04:13:38.684Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0001-choose-the-implementation-language","payload":{"changed":["argument","selected_position","implications"]}}
+{"at":"2026-05-17T04:13:38.685Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0001-choose-the-implementation-language"}
+{"at":"2026-05-17T04:13:38.686Z","actor":"agent","kind":"decision_proposed","entity_kind":"decision","entity_id":"0002-define-the-world-representation-and-renderer","payload":{"template_variant":"data-model"}}
+{"at":"2026-05-17T04:13:38.687Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0002-define-the-world-representation-and-renderer","payload":{"changed":["argument","selected_position","implications"]}}
+{"at":"2026-05-17T04:13:38.688Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0002-define-the-world-representation-and-renderer"}
+{"at":"2026-05-17T04:13:38.689Z","actor":"agent","kind":"decision_proposed","entity_kind":"decision","entity_id":"0003-define-the-agent-action-contract","payload":{"template_variant":"architecture"}}
+{"at":"2026-05-17T04:13:38.689Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0003-define-the-agent-action-contract","payload":{"changed":["argument","selected_position","implications"]}}
+{"at":"2026-05-17T04:13:38.690Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0003-define-the-agent-action-contract"}
+{"at":"2026-05-17T04:13:38.691Z","actor":"agent","kind":"decision_proposed","entity_kind":"decision","entity_id":"0004-define-the-tick-loop-and-termination-conditions","payload":{"template_variant":"architecture"}}
+{"at":"2026-05-17T04:13:38.692Z","actor":"agent","kind":"decision_updated","entity_kind":"decision","entity_id":"0004-define-the-tick-loop-and-termination-conditions","payload":{"changed":["argument","selected_position","implications"]}}
+{"at":"2026-05-17T04:13:38.692Z","actor":"human","actor_name":"kj","kind":"decision_accepted","entity_kind":"decision","entity_id":"0004-define-the-tick-loop-and-termination-conditions"}
+{"at":"2026-05-17T04:13:38.694Z","actor":"agent","kind":"phase_advanced","entity_kind":"phase","entity_id":"decomposing","payload":{"from":"deciding","to":"decomposing"}}
+{"at":"2026-05-17T04:14:22.524Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0001-bootstrap-repository","payload":{"decision_refs":["0001-choose-the-implementation-language"],"depends_on":[]}}
+{"at":"2026-05-17T04:14:22.526Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0002-implement-world-module-tile-grid-entity-dict","payload":{"decision_refs":["0002-define-the-world-representation-and-renderer"],"depends_on":["T0001-bootstrap-repository"]}}
+{"at":"2026-05-17T04:14:22.527Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0003-implement-frame-renderer","payload":{"decision_refs":["0002-define-the-world-representation-and-renderer"],"depends_on":["T0002-implement-world-module-tile-grid-entity-dict"]}}
+{"at":"2026-05-17T04:14:22.528Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0004-implement-openai-agent-client","payload":{"decision_refs":["0003-define-the-agent-action-contract"],"depends_on":["T0001-bootstrap-repository"]}}
+{"at":"2026-05-17T04:14:22.529Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0005-implement-action-handlers-and-termination-checks","payload":{"decision_refs":["0002-define-the-world-representation-and-renderer"],"depends_on":["T0002-implement-world-module-tile-grid-entity-dict"]}}
+{"at":"2026-05-17T04:14:22.530Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0006-implement-the-tick-based-game-loop","payload":{"decision_refs":["0004-define-the-tick-loop-and-termination-conditions","0002-define-the-world-representation-and-renderer"],"depends_on":["T0003-implement-frame-renderer","T0004-implement-openai-agent-client","T0005-implement-action-handlers-and-termination-checks"]}}
+{"at":"2026-05-17T04:14:22.532Z","actor":"agent","kind":"task_proposed","entity_kind":"task","entity_id":"T0007-implement-cli-entry-script","payload":{"decision_refs":["0001-choose-the-implementation-language","0004-define-the-tick-loop-and-termination-conditions"],"depends_on":["T0006-implement-the-tick-based-game-loop"]}}
+{"at":"2026-05-17T04:14:22.534Z","actor":"agent","kind":"graph_validated","payload":{"valid":true,"task_count":7,"error_count":0,"warning_count":0}}
+{"at":"2026-05-17T04:14:30.972Z","actor":"agent","kind":"graph_validated","payload":{"valid":true,"task_count":7,"error_count":0,"warning_count":0}}
+{"at":"2026-05-17T04:14:37.477Z","actor":"agent","kind":"graph_validated","payload":{"valid":true,"task_count":7,"error_count":0,"warning_count":0}}
+{"at":"2026-05-17T04:14:44.523Z","actor":"human","actor_name":"kj","kind":"phase_advanced","entity_kind":"phase","entity_id":"handing-off","payload":{"from":"decomposing","to":"handing-off","notes":"All decisions accepted, graph validates clean."}}
+{"at":"2026-05-17T04:14:44.523Z","actor":"human","actor_name":"kj","kind":"sign_off_recorded","entity_kind":"phase","entity_id":"handing-off"}
+{"at":"2026-05-17T04:14:44.538Z","actor":"agent","kind":"render_run","payload":{"decisions":4,"tasks":7}}
+{"at":"2026-05-17T04:14:44.540Z","actor":"human","actor_name":"kj","kind":"export_started","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"target":"filesystem"}}
+{"at":"2026-05-17T04:14:44.540Z","actor":"human","actor_name":"kj","kind":"export_completed","entity_kind":"project","entity_id":"ai-driven-roguelike-poc","payload":{"target":"filesystem","issue_count":7,"document_count":4}}
+{"at":"2026-05-17T04:14:44.544Z","actor":"agent","kind":"render_run","payload":{"decisions":4,"tasks":7}}
diff --git a/benchmarks/roguelike-ai-poc/reference/index.html b/benchmarks/roguelike-ai-poc/reference/index.html
new file mode 100644
index 0000000..75276fc
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/index.html
@@ -0,0 +1,231 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<title>AI-driven roguelike POC — Decision Record</title>
+<style>:root {
+  --bg: #fafafa;
+  --fg: #1a1a1a;
+  --muted: #6b7280;
+  --border: #e5e7eb;
+  --accent: #4f46e5;
+  --status-rfc: #fbbf24;
+  --status-proposed: #60a5fa;
+  --status-accepted: #34d399;
+  --status-rejected: #f87171;
+  --status-deprecated: #9ca3af;
+  --status-superseded: #c084fc;
+  --task-open: #9ca3af;
+  --task-ready: #60a5fa;
+  --task-in_progress: #fbbf24;
+  --task-done: #34d399;
+  --task-blocked: #f87171;
+  --task-deferred: #c084fc;
+}
+* { box-sizing: border-box; }
+body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; line-height: 1.5; margin: 0; padding: 2rem; background: var(--bg); color: var(--fg); }
+h1, h2, h3 { margin-top: 1.5rem; }
+.container { max-width: 1100px; margin: 0 auto; }
+.header { border-bottom: 1px solid var(--border); padding-bottom: 1rem; margin-bottom: 1.5rem; }
+.meta { display: flex; flex-wrap: wrap; gap: 0.5rem 1rem; color: var(--muted); font-size: 0.9rem; }
+.meta b { color: var(--fg); }
+.pill { display: inline-block; padding: 0.15rem 0.6rem; border-radius: 999px; font-size: 0.75rem; font-weight: 600; color: white; }
+.pill-rfc { background: var(--status-rfc); }
+.pill-proposed { background: var(--status-proposed); }
+.pill-accepted { background: var(--status-accepted); }
+.pill-rejected { background: var(--status-rejected); }
+.pill-deprecated { background: var(--status-deprecated); }
+.pill-superseded { background: var(--status-superseded); }
+.pill-task-open { background: var(--task-open); }
+.pill-task-ready { background: var(--task-ready); }
+.pill-task-in_progress { background: var(--task-in_progress); }
+.pill-task-done { background: var(--task-done); }
+.pill-task-blocked { background: var(--task-blocked); }
+.pill-task-deferred { background: var(--task-deferred); }
+table { width: 100%; border-collapse: collapse; margin-top: 1rem; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.05); }
+th, td { text-align: left; padding: 0.5rem 0.75rem; border-bottom: 1px solid var(--border); font-size: 0.9rem; vertical-align: top; }
+th { background: #f3f4f6; font-weight: 600; font-size: 0.8rem; text-transform: uppercase; letter-spacing: 0.04em; color: var(--muted); }
+tr:last-child td { border-bottom: none; }
+.scope { background: white; border: 1px solid var(--border); border-radius: 8px; padding: 1rem; margin-top: 1rem; }
+.scope-list { display: grid; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); gap: 1rem; }
+.scope-list section { background: #f9fafb; padding: 0.75rem; border-radius: 6px; }
+.scope-list h4 { margin: 0 0 0.5rem; font-size: 0.85rem; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
+.scope-list ul { margin: 0; padding-left: 1.25rem; }
+.scope-list li { margin: 0.15rem 0; font-size: 0.9rem; }
+.empty { color: var(--muted); font-style: italic; }
+a { color: var(--accent); text-decoration: none; }
+a:hover { text-decoration: underline; }
+.dep-list { color: var(--muted); font-size: 0.8rem; }
+.code { font-family: ui-monospace, "SF Mono", monospace; font-size: 0.85em; background: #f3f4f6; padding: 0.1rem 0.4rem; border-radius: 4px; }
+.handoff { background: #eef2ff; border: 1px solid #c7d2fe; border-radius: 8px; padding: 1rem; margin-top: 1rem; }
+.handoff h3 { margin-top: 0; color: var(--accent); }
+.footer { margin-top: 3rem; padding-top: 1rem; border-top: 1px solid var(--border); color: var(--muted); font-size: 0.8rem; }</style>
+</head>
+<body>
+<div class="container">
+
+  <header class="header">
+    <div class="meta"><span class="code">ai-driven-roguelike-poc</span></div>
+    <h1>AI-driven roguelike POC</h1>
+    <div class="meta">
+      <span><b>Phase:</b> <span class="code">handed-off</span></span>
+      <span><b>Effort:</b> <span class="code">poc</span></span>
+      <span><b>Updated:</b> 2026-05-17T04:14:44.540Z</span>
+      <span><b>Decisions:</b> 4 (4 accepted)</span>
+      <span><b>Tasks:</b> 7 (0 done)</span>
+    </div>
+  </header>
+
+  <p>A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.</p>
+
+  <div class="scope">
+    <h3>Scope</h3>
+    <div class="scope-list">
+      <section>
+        <h4>In scope</h4>
+        <ul><li>A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (&gt;), and a hazard tile (X)</li><li>Tick-based game loop: each tick prints the frame, then queries the agent for one action</li><li>A small action vocabulary: move N/S/E/W and noop</li><li>Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death</li><li>Strategy prompt provided once at startup, fed to the agent as system prompt for every tick</li><li>LLM agent receives current frame + HP + tick number, returns a single action</li></ul>
+      </section><section>
+        <h4>Success criteria</h4>
+        <ul><li>A user can run a single command, supply a strategy prompt, and watch the agent play until win or death</li><li>Win and death paths both observed in manual playtests</li><li>Different strategy prompts produce visibly different agent behavior</li><li>End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call</li></ul>
+      </section><section>
+        <h4>Out of scope</h4>
+        <ul><li>Multiple rooms, dungeon generation, procedural levels</li><li>Combat with enemies, NPCs, monsters</li><li>Inventory, items, equipment</li><li>Save/load, persistence</li><li>Visual UI beyond ASCII to terminal</li><li>Multiplayer, networking</li><li>Self-improving agent loops or RL training</li></ul>
+      </section><section>
+        <h4>Nice to have</h4>
+        <ul><li>Configurable room layout from a text file</li><li>Replay log written to disk for post-hoc inspection</li><li>A few preset strategy prompts to demo (cautious, greedy, exploratory)</li></ul>
+      </section>
+    </div>
+  </div>
+  <div class="handoff">
+    <h3>Handed off</h3>
+    <div class="meta">
+      <span><b>Target:</b> <span class="code">filesystem</span></span>
+      <span><b>At:</b> 2026-05-17T04:14:44.540Z</span>
+      
+      
+    </div>
+  </div>
+
+  <h2>Decisions</h2>
+  <table>
+    <thead>
+      <tr>
+        <th>ID</th>
+        <th>Title</th>
+        <th>Status</th>
+        <th>Selected</th>
+        <th>Depends on</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td><a href="decisions/0001-choose-the-implementation-language.md"><span class="code">0001-choose-the-implementation-language</span></a></td>
+        <td>Choose the implementation language <span class="dep-list">[architecture]</span></td>
+        <td><span class="pill pill-accepted">accepted</span></td>
+        <td>Python</td>
+        <td><span class="empty">—</span></td>
+      </tr><tr>
+        <td><a href="decisions/0002-define-the-world-representation-and-renderer.md"><span class="code">0002-define-the-world-representation-and-renderer</span></a></td>
+        <td>Define the world representation and renderer <span class="dep-list">[data-model]</span></td>
+        <td><span class="pill pill-accepted">accepted</span></td>
+        <td>Tile-grid + entity dict</td>
+        <td><span class="empty">—</span></td>
+      </tr><tr>
+        <td><a href="decisions/0003-define-the-agent-action-contract.md"><span class="code">0003-define-the-agent-action-contract</span></a></td>
+        <td>Define the agent action contract <span class="dep-list">[architecture]</span></td>
+        <td><span class="pill pill-accepted">accepted</span></td>
+        <td>Tool-call (function calling) with one tool: do_action(direction)</td>
+        <td><span class="empty">—</span></td>
+      </tr><tr>
+        <td><a href="decisions/0004-define-the-tick-loop-and-termination-conditions.md"><span class="code">0004-define-the-tick-loop-and-termination-conditions</span></a></td>
+        <td>Define the tick loop and termination conditions <span class="dep-list">[architecture]</span></td>
+        <td><span class="pill pill-accepted">accepted</span></td>
+        <td>Synchronous loop with step cap</td>
+        <td><span class="empty">—</span></td>
+      </tr>
+    </tbody>
+  </table>
+
+  <h2>Task graph</h2>
+  <table>
+    <thead>
+      <tr>
+        <th>ID</th>
+        <th>Title</th>
+        <th>Status</th>
+        <th>Pri</th>
+        <th>Estimate</th>
+        <th>Depends on</th>
+        <th>Decision refs</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td><a href="tasks/T0001-bootstrap-repository.md"><span class="code">T0001-bootstrap-repository</span></a></td>
+        <td>Bootstrap repository</td>
+        <td><span class="pill pill-task-ready">ready</span></td>
+        <td><span class="code">p0</span></td>
+        <td>1h</td>
+        <td><span class="empty">—</span></td>
+        <td><a href="decisions/0001-choose-the-implementation-language.md" title="Choose the implementation language"><span class="code">0001-choose-the-implementation-language</span></a></td>
+      </tr><tr>
+        <td><a href="tasks/T0002-implement-world-module-tile-grid-entity-dict.md"><span class="code">T0002-implement-world-module-tile-grid-entity-dict</span></a></td>
+        <td>Implement world module (tile grid + entity dict)</td>
+        <td><span class="pill pill-task-open">open</span></td>
+        <td><span class="code">p0</span></td>
+        <td>2h</td>
+        <td><span class="code">T0001-bootstrap-repository</span></td>
+        <td><a href="decisions/0002-define-the-world-representation-and-renderer.md" title="Define the world representation and renderer"><span class="code">0002-define-the-world-representation-and-renderer</span></a></td>
+      </tr><tr>
+        <td><a href="tasks/T0003-implement-frame-renderer.md"><span class="code">T0003-implement-frame-renderer</span></a></td>
+        <td>Implement frame renderer</td>
+        <td><span class="pill pill-task-open">open</span></td>
+        <td><span class="code">p0</span></td>
+        <td>1h</td>
+        <td><span class="code">T0002-implement-world-module-tile-grid-entity-dict</span></td>
+        <td><a href="decisions/0002-define-the-world-representation-and-renderer.md" title="Define the world representation and renderer"><span class="code">0002-define-the-world-representation-and-renderer</span></a></td>
+      </tr><tr>
+        <td><a href="tasks/T0004-implement-openai-agent-client.md"><span class="code">T0004-implement-openai-agent-client</span></a></td>
+        <td>Implement OpenAI agent client</td>
+        <td><span class="pill pill-task-open">open</span></td>
+        <td><span class="code">p0</span></td>
+        <td>2h</td>
+        <td><span class="code">T0001-bootstrap-repository</span></td>
+        <td><a href="decisions/0003-define-the-agent-action-contract.md" title="Define the agent action contract"><span class="code">0003-define-the-agent-action-contract</span></a></td>
+      </tr><tr>
+        <td><a href="tasks/T0005-implement-action-handlers-and-termination-checks.md"><span class="code">T0005-implement-action-handlers-and-termination-checks</span></a></td>
+        <td>Implement action handlers and termination checks</td>
+        <td><span class="pill pill-task-open">open</span></td>
+        <td><span class="code">p0</span></td>
+        <td>1h</td>
+        <td><span class="code">T0002-implement-world-module-tile-grid-entity-dict</span></td>
+        <td><a href="decisions/0002-define-the-world-representation-and-renderer.md" title="Define the world representation and renderer"><span class="code">0002-define-the-world-representation-and-renderer</span></a></td>
+      </tr><tr>
+        <td><a href="tasks/T0006-implement-the-tick-based-game-loop.md"><span class="code">T0006-implement-the-tick-based-game-loop</span></a></td>
+        <td>Implement the tick-based game loop</td>
+        <td><span class="pill pill-task-open">open</span></td>
+        <td><span class="code">p0</span></td>
+        <td>2h</td>
+        <td><span class="code">T0003-implement-frame-renderer</span> <span class="code">T0004-implement-openai-agent-client</span> <span class="code">T0005-implement-action-handlers-and-termination-checks</span></td>
+        <td><a href="decisions/0004-define-the-tick-loop-and-termination-conditions.md" title="Define the tick loop and termination conditions"><span class="code">0004-define-the-tick-loop-and-termination-conditions</span></a> <a href="decisions/0002-define-the-world-representation-and-renderer.md" title="Define the world representation and renderer"><span class="code">0002-define-the-world-representation-and-renderer</span></a></td>
+      </tr><tr>
+        <td><a href="tasks/T0007-implement-cli-entry-script.md"><span class="code">T0007-implement-cli-entry-script</span></a></td>
+        <td>Implement CLI entry script</td>
+        <td><span class="pill pill-task-open">open</span></td>
+        <td><span class="code">p0</span></td>
+        <td>1h</td>
+        <td><span class="code">T0006-implement-the-tick-based-game-loop</span></td>
+        <td><a href="decisions/0001-choose-the-implementation-language.md" title="Choose the implementation language"><span class="code">0001-choose-the-implementation-language</span></a> <a href="decisions/0004-define-the-tick-loop-and-termination-conditions.md" title="Define the tick loop and termination conditions"><span class="code">0004-define-the-tick-loop-and-termination-conditions</span></a></td>
+      </tr>
+    </tbody>
+  </table>
+
+  <footer class="footer">
+    Generated by <a href="https://github.com/protoLabsAI/decision-record">decision-record</a> ·
+    Last render: 2026-05-17T04:14:44.544Z
+  </footer>
+
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/benchmarks/roguelike-ai-poc/reference/project.json b/benchmarks/roguelike-ai-poc/reference/project.json
new file mode 100644
index 0000000..3b4c9fb
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/project.json
@@ -0,0 +1,64 @@
+{
+  "id": "ai-driven-roguelike-poc",
+  "title": "AI-driven roguelike POC",
+  "description": "A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.",
+  "created_at": "2026-05-17T04:12:02.030Z",
+  "updated_at": "2026-05-17T04:14:44.540Z",
+  "effort_level": "poc",
+  "status": "handed-off",
+  "scope": {
+    "in_scope": [
+      "A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X)",
+      "Tick-based game loop: each tick prints the frame, then queries the agent for one action",
+      "A small action vocabulary: move N/S/E/W and noop",
+      "Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death",
+      "Strategy prompt provided once at startup, fed to the agent as system prompt for every tick",
+      "LLM agent receives current frame + HP + tick number, returns a single action"
+    ],
+    "out_of_scope": [
+      "Multiple rooms, dungeon generation, procedural levels",
+      "Combat with enemies, NPCs, monsters",
+      "Inventory, items, equipment",
+      "Save/load, persistence",
+      "Visual UI beyond ASCII to terminal",
+      "Multiplayer, networking",
+      "Self-improving agent loops or RL training"
+    ],
+    "success_criteria": [
+      "A user can run a single command, supply a strategy prompt, and watch the agent play until win or death",
+      "Win and death paths both observed in manual playtests",
+      "Different strategy prompts produce visibly different agent behavior",
+      "End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call"
+    ],
+    "nice_to_have": [
+      "Configurable room layout from a text file",
+      "Replay log written to disk for post-hoc inspection",
+      "A few preset strategy prompts to demo (cautious, greedy, exploratory)"
+    ]
+  },
+  "sign_offs": [
+    {
+      "phase": "handing-off",
+      "by": "human",
+      "actor": "kj",
+      "at": "2026-05-17T04:14:44.523Z",
+      "notes": "All decisions accepted, graph validates clean."
+    },
+    {
+      "phase": "handing-off",
+      "by": "human",
+      "actor": "kj",
+      "at": "2026-05-17T04:14:44.540Z"
+    }
+  ],
+  "handoff": {
+    "target": "filesystem",
+    "exported_at": "2026-05-17T04:14:44.540Z",
+    "issue_count": 7,
+    "document_count": 4
+  },
+  "gate_config": {
+    "preset": "poc"
+  },
+  "tags": []
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/project.md b/benchmarks/roguelike-ai-poc/reference/project.md
new file mode 100644
index 0000000..538b476
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/project.md
@@ -0,0 +1,64 @@
+# AI-driven roguelike POC
+
+| Field | Value |
+| --- | --- |
+| ID | `ai-driven-roguelike-poc` |
+| Status | `handed-off` |
+| Effort level | `poc` |
+| Created | 2026-05-17T04:12:02.030Z |
+| Updated | 2026-05-17T04:14:44.540Z |
+| Decisions | 4 |
+| Tasks | 7 |
+
+## Description
+
+A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area.
+
+## Scope
+
+**In scope**
+
+- A 10×10 ASCII-rendered single room with walls (#), floor (.), player (@), exit (>), and a hazard tile (X)
+- Tick-based game loop: each tick prints the frame, then queries the agent for one action
+- A small action vocabulary: move N/S/E/W and noop
+- Player has HP; stepping on hazard removes HP; reaching exit = win, HP=0 = death
+- Strategy prompt provided once at startup, fed to the agent as system prompt for every tick
+- LLM agent receives current frame + HP + tick number, returns a single action
+
+**Success criteria**
+
+- A user can run a single command, supply a strategy prompt, and watch the agent play until win or death
+- Win and death paths both observed in manual playtests
+- Different strategy prompts produce visibly different agent behavior
+- End-to-end run completes in under 60 seconds wall time on a typical OpenAI API call
+
+**Out of scope**
+
+- Multiple rooms, dungeon generation, procedural levels
+- Combat with enemies, NPCs, monsters
+- Inventory, items, equipment
+- Save/load, persistence
+- Visual UI beyond ASCII to terminal
+- Multiplayer, networking
+- Self-improving agent loops or RL training
+
+**Nice to have**
+
+- Configurable room layout from a text file
+- Replay log written to disk for post-hoc inspection
+- A few preset strategy prompts to demo (cautious, greedy, exploratory)
+
+## Sign-offs
+
+- **handing-off** by kj (human) at 2026-05-17T04:14:44.523Z — All decisions accepted, graph validates clean.
+
+- **handing-off** by kj (human) at 2026-05-17T04:14:44.540Z
+
+## Handoff
+
+| Field | Value |
+| --- | --- |
+| Target | `filesystem` |
+| Exported at | 2026-05-17T04:14:44.540Z |
+| Target ID | — |
+| Target URL | — |
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json
new file mode 100644
index 0000000..c433a10
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.json
@@ -0,0 +1,30 @@
+{
+  "id": "T0001-bootstrap-repository",
+  "number": 1,
+  "slug": "bootstrap-repository",
+  "title": "Bootstrap repository",
+  "description": "Initialize the Python project layout: pyproject.toml or requirements.txt with openai pin, a src/ module path, a README stub, and a .gitignore. Verify a `python -c \"import openai\"` succeeds in a fresh venv.",
+  "status": "ready",
+  "estimate": {
+    "unit": "hours",
+    "value": 1,
+    "confidence": "high"
+  },
+  "acceptance_criteria": [
+    "pyproject.toml or requirements.txt committed",
+    "openai SDK installable in a venv",
+    "README explains 30-second quickstart",
+    "python -c \"from src import __init__\" runs"
+  ],
+  "depends_on": [],
+  "decision_refs": [
+    "0001-choose-the-implementation-language"
+  ],
+  "priority": "p0",
+  "labels": [
+    "foundation"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.524Z",
+  "updated_at": "2026-05-17T04:14:22.524Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md
new file mode 100644
index 0000000..09effaa
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0001-bootstrap-repository.md
@@ -0,0 +1,23 @@
+# T0001-bootstrap-repository — Bootstrap repository
+
+| Field | Value |
+| --- | --- |
+| Status | `ready` |
+| Priority | `p0` |
+| Estimate | 1 hours (high confidence) |
+| Depends on | _(none)_ |
+| Decision refs | `0001-choose-the-implementation-language` — Choose the implementation language |
+| Assignee hint | agent |
+| Labels | `foundation` |
+| Updated | 2026-05-17T04:14:22.524Z |
+
+## Description
+
+Initialize the Python project layout: pyproject.toml or requirements.txt with openai pin, a src/ module path, a README stub, and a .gitignore. Verify a `python -c "import openai"` succeeds in a fresh venv.
+
+## Acceptance criteria
+
+- [ ] pyproject.toml or requirements.txt committed
+- [ ] openai SDK installable in a venv
+- [ ] README explains 30-second quickstart
+- [ ] python -c "from src import __init__" runs
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json
new file mode 100644
index 0000000..c7a6c75
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.json
@@ -0,0 +1,32 @@
+{
+  "id": "T0002-implement-world-module-tile-grid-entity-dict",
+  "number": 2,
+  "slug": "implement-world-module-tile-grid-entity-dict",
+  "title": "Implement world module (tile grid + entity dict)",
+  "description": "Build src/world.py: World dataclass with static_tiles: list[list[str]] and entities: dict[str, dict]. Provide constructors for a default 10×10 room (walls border, one hazard, one exit). Pure data and helpers; no rendering, no game logic.",
+  "status": "open",
+  "estimate": {
+    "unit": "hours",
+    "value": 2,
+    "confidence": "med"
+  },
+  "acceptance_criteria": [
+    "World.default_room() returns a valid 10x10 with #, ., X, > tiles",
+    "entities dict contains a player at a known spawn",
+    "is_walkable(x,y) returns False for walls, True for floor and hazard",
+    "unit test: default room is fully walkable from spawn to exit"
+  ],
+  "depends_on": [
+    "T0001-bootstrap-repository"
+  ],
+  "decision_refs": [
+    "0002-define-the-world-representation-and-renderer"
+  ],
+  "priority": "p0",
+  "labels": [
+    "core"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.526Z",
+  "updated_at": "2026-05-17T04:14:22.526Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md
new file mode 100644
index 0000000..ff06ca3
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0002-implement-world-module-tile-grid-entity-dict.md
@@ -0,0 +1,23 @@
+# T0002-implement-world-module-tile-grid-entity-dict — Implement world module (tile grid + entity dict)
+
+| Field | Value |
+| --- | --- |
+| Status | `open` |
+| Priority | `p0` |
+| Estimate | 2 hours (med confidence) |
+| Depends on | `T0001-bootstrap-repository` |
+| Decision refs | `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer |
+| Assignee hint | agent |
+| Labels | `core` |
+| Updated | 2026-05-17T04:14:22.526Z |
+
+## Description
+
+Build src/world.py: World dataclass with static_tiles: list[list[str]] and entities: dict[str, dict]. Provide constructors for a default 10×10 room (walls border, one hazard, one exit). Pure data and helpers; no rendering, no game logic.
+
+## Acceptance criteria
+
+- [ ] World.default_room() returns a valid 10x10 with #, ., X, > tiles
+- [ ] entities dict contains a player at a known spawn
+- [ ] is_walkable(x,y) returns False for walls, True for floor and hazard
+- [ ] unit test: default room is fully walkable from spawn to exit
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json
new file mode 100644
index 0000000..0caf6b1
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.json
@@ -0,0 +1,32 @@
+{
+  "id": "T0003-implement-frame-renderer",
+  "number": 3,
+  "slug": "implement-frame-renderer",
+  "title": "Implement frame renderer",
+  "description": "Build src/render.py: render_frame(world) -> list[str]. Compose static_tiles + entity glyphs (entity overrides tile). Provide a small HUD line below the frame showing tick number, HP, and last action. Return as list of strings so the game loop can join + print or send to LLM.",
+  "status": "open",
+  "estimate": {
+    "unit": "hours",
+    "value": 1,
+    "confidence": "high"
+  },
+  "acceptance_criteria": [
+    "render_frame returns 10 strings of length 10",
+    "player @ is visible at its current position",
+    "HUD line includes tick, hp, last_action",
+    "manual visual check: frame looks like a roguelike room"
+  ],
+  "depends_on": [
+    "T0002-implement-world-module-tile-grid-entity-dict"
+  ],
+  "decision_refs": [
+    "0002-define-the-world-representation-and-renderer"
+  ],
+  "priority": "p0",
+  "labels": [
+    "core"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.527Z",
+  "updated_at": "2026-05-17T04:14:22.527Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md
new file mode 100644
index 0000000..8bfc535
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0003-implement-frame-renderer.md
@@ -0,0 +1,23 @@
+# T0003-implement-frame-renderer — Implement frame renderer
+
+| Field | Value |
+| --- | --- |
+| Status | `open` |
+| Priority | `p0` |
+| Estimate | 1 hours (high confidence) |
+| Depends on | `T0002-implement-world-module-tile-grid-entity-dict` |
+| Decision refs | `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer |
+| Assignee hint | agent |
+| Labels | `core` |
+| Updated | 2026-05-17T04:14:22.527Z |
+
+## Description
+
+Build src/render.py: render_frame(world) -> list[str]. Compose static_tiles + entity glyphs (entity overrides tile). Provide a small HUD line below the frame showing tick number, HP, and last action. Return as list of strings so the game loop can join + print or send to LLM.
+
+## Acceptance criteria
+
+- [ ] render_frame returns 10 strings of length 10
+- [ ] player @ is visible at its current position
+- [ ] HUD line includes tick, hp, last_action
+- [ ] manual visual check: frame looks like a roguelike room
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json
new file mode 100644
index 0000000..cdc8821
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.json
@@ -0,0 +1,34 @@
+{
+  "id": "T0004-implement-openai-agent-client",
+  "number": 4,
+  "slug": "implement-openai-agent-client",
+  "title": "Implement OpenAI agent client",
+  "description": "Build src/agent.py: AgentClient class with constructor(strategy_prompt, model, api_key). Single method choose_action(world_state_json, tick, hp) → (direction, reasoning). Uses tool-calling with one tool do_action(direction in {N,S,E,W,noop}); tool_choice=\"required\". Returns the chosen direction and the assistant message content as reasoning.",
+  "status": "open",
+  "estimate": {
+    "unit": "hours",
+    "value": 2,
+    "confidence": "med"
+  },
+  "acceptance_criteria": [
+    "AgentClient instantiates without making a call",
+    "choose_action returns a valid direction enum",
+    "reasoning is captured as a string (may be empty)",
+    "malformed responses raise a clear error (does not silently noop)",
+    "strategy_prompt is in the system role on every call"
+  ],
+  "depends_on": [
+    "T0001-bootstrap-repository"
+  ],
+  "decision_refs": [
+    "0003-define-the-agent-action-contract"
+  ],
+  "priority": "p0",
+  "labels": [
+    "llm",
+    "core"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.528Z",
+  "updated_at": "2026-05-17T04:14:22.528Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md
new file mode 100644
index 0000000..0244119
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0004-implement-openai-agent-client.md
@@ -0,0 +1,24 @@
+# T0004-implement-openai-agent-client — Implement OpenAI agent client
+
+| Field | Value |
+| --- | --- |
+| Status | `open` |
+| Priority | `p0` |
+| Estimate | 2 hours (med confidence) |
+| Depends on | `T0001-bootstrap-repository` |
+| Decision refs | `0003-define-the-agent-action-contract` — Define the agent action contract |
+| Assignee hint | agent |
+| Labels | `llm`, `core` |
+| Updated | 2026-05-17T04:14:22.528Z |
+
+## Description
+
+Build src/agent.py: AgentClient class with constructor(strategy_prompt, model, api_key). Single method choose_action(world_state_json, tick, hp) → (direction, reasoning). Uses tool-calling with one tool do_action(direction in {N,S,E,W,noop}); tool_choice="required". Returns the chosen direction and the assistant message content as reasoning.
+
+## Acceptance criteria
+
+- [ ] AgentClient instantiates without making a call
+- [ ] choose_action returns a valid direction enum
+- [ ] reasoning is captured as a string (may be empty)
+- [ ] malformed responses raise a clear error (does not silently noop)
+- [ ] strategy_prompt is in the system role on every call
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json
new file mode 100644
index 0000000..20ad30f
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.json
@@ -0,0 +1,33 @@
+{
+  "id": "T0005-implement-action-handlers-and-termination-checks",
+  "number": 5,
+  "slug": "implement-action-handlers-and-termination-checks",
+  "title": "Implement action handlers and termination checks",
+  "description": "Build src/actions.py: apply_action(world, direction) -> ActionResult. Moves the player one cell if walkable; otherwise noop. Compute side effects: HP-1 when stepping onto hazard, win flag when player_pos == exit_pos, dead flag when HP <= 0. Return ActionResult dataclass with new_world, hp_delta, terminal, terminal_reason.",
+  "status": "open",
+  "estimate": {
+    "unit": "hours",
+    "value": 1,
+    "confidence": "high"
+  },
+  "acceptance_criteria": [
+    "Moving into a wall is a noop with no HP change",
+    "Moving onto hazard triggers hp_delta = -1",
+    "Moving onto exit triggers terminal=\"win\"",
+    "HP reaching 0 triggers terminal=\"death\"",
+    "Unit tests for each transition"
+  ],
+  "depends_on": [
+    "T0002-implement-world-module-tile-grid-entity-dict"
+  ],
+  "decision_refs": [
+    "0002-define-the-world-representation-and-renderer"
+  ],
+  "priority": "p0",
+  "labels": [
+    "core"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.529Z",
+  "updated_at": "2026-05-17T04:14:22.529Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md
new file mode 100644
index 0000000..5ad2496
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0005-implement-action-handlers-and-termination-checks.md
@@ -0,0 +1,24 @@
+# T0005-implement-action-handlers-and-termination-checks — Implement action handlers and termination checks
+
+| Field | Value |
+| --- | --- |
+| Status | `open` |
+| Priority | `p0` |
+| Estimate | 1 hours (high confidence) |
+| Depends on | `T0002-implement-world-module-tile-grid-entity-dict` |
+| Decision refs | `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer |
+| Assignee hint | agent |
+| Labels | `core` |
+| Updated | 2026-05-17T04:14:22.529Z |
+
+## Description
+
+Build src/actions.py: apply_action(world, direction) -> ActionResult. Moves the player one cell if walkable; otherwise noop. Compute side effects: HP-1 when stepping onto hazard, win flag when player_pos == exit_pos, dead flag when HP <= 0. Return ActionResult dataclass with new_world, hp_delta, terminal, terminal_reason.
+
+## Acceptance criteria
+
+- [ ] Moving into a wall is a noop with no HP change
+- [ ] Moving onto hazard triggers hp_delta = -1
+- [ ] Moving onto exit triggers terminal="win"
+- [ ] HP reaching 0 triggers terminal="death"
+- [ ] Unit tests for each transition
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json
new file mode 100644
index 0000000..129cd6b
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.json
@@ -0,0 +1,35 @@
+{
+  "id": "T0006-implement-the-tick-based-game-loop",
+  "number": 6,
+  "slug": "implement-the-tick-based-game-loop",
+  "title": "Implement the tick-based game loop",
+  "description": "Build src/loop.py: run_game(world, agent_client, max_steps=50). Each iteration: render frame, call agent_client.choose_action, apply action, check terminal, sleep 0.05s, repeat. Logs each tick: tick#, frame, action, reasoning excerpt, hp. Exits on terminal or step cap; returns final state + reason.",
+  "status": "open",
+  "estimate": {
+    "unit": "hours",
+    "value": 2,
+    "confidence": "med"
+  },
+  "acceptance_criteria": [
+    "Loop terminates on win, death, or step cap (≤50)",
+    "Each tick prints the frame and HUD to stdout",
+    "Final summary line shows reason and step count",
+    "No exceptions leak from agent timeouts/errors (logged and treated as noop)"
+  ],
+  "depends_on": [
+    "T0003-implement-frame-renderer",
+    "T0004-implement-openai-agent-client",
+    "T0005-implement-action-handlers-and-termination-checks"
+  ],
+  "decision_refs": [
+    "0004-define-the-tick-loop-and-termination-conditions",
+    "0002-define-the-world-representation-and-renderer"
+  ],
+  "priority": "p0",
+  "labels": [
+    "core"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.530Z",
+  "updated_at": "2026-05-17T04:14:22.530Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md
new file mode 100644
index 0000000..3338646
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0006-implement-the-tick-based-game-loop.md
@@ -0,0 +1,23 @@
+# T0006-implement-the-tick-based-game-loop — Implement the tick-based game loop
+
+| Field | Value |
+| --- | --- |
+| Status | `open` |
+| Priority | `p0` |
+| Estimate | 2 hours (med confidence) |
+| Depends on | `T0003-implement-frame-renderer`, `T0004-implement-openai-agent-client`, `T0005-implement-action-handlers-and-termination-checks` |
+| Decision refs | `0004-define-the-tick-loop-and-termination-conditions` — Define the tick loop and termination conditions; `0002-define-the-world-representation-and-renderer` — Define the world representation and renderer |
+| Assignee hint | agent |
+| Labels | `core` |
+| Updated | 2026-05-17T04:14:22.530Z |
+
+## Description
+
+Build src/loop.py: run_game(world, agent_client, max_steps=50). Each iteration: render frame, call agent_client.choose_action, apply action, check terminal, sleep 0.05s, repeat. Logs each tick: tick#, frame, action, reasoning excerpt, hp. Exits on terminal or step cap; returns final state + reason.
+
+## Acceptance criteria
+
+- [ ] Loop terminates on win, death, or step cap (≤50)
+- [ ] Each tick prints the frame and HUD to stdout
+- [ ] Final summary line shows reason and step count
+- [ ] No exceptions leak from agent timeouts/errors (logged and treated as noop)
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json
new file mode 100644
index 0000000..030f430
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.json
@@ -0,0 +1,33 @@
+{
+  "id": "T0007-implement-cli-entry-script",
+  "number": 7,
+  "slug": "implement-cli-entry-script",
+  "title": "Implement CLI entry script",
+  "description": "Build src/__main__.py: argparse for --strategy (or read from stdin), --model (default gpt-4o), --max-steps (default 50). Construct AgentClient, build default room, call run_game. Print the final outcome. Document the env vars (OPENAI_API_KEY) and a sample invocation in README.",
+  "status": "open",
+  "estimate": {
+    "unit": "hours",
+    "value": 1,
+    "confidence": "high"
+  },
+  "acceptance_criteria": [
+    "python -m src --strategy \"cautious explorer\" runs end-to-end",
+    "README has a complete example invocation",
+    "--help prints usage",
+    "Exit code 0 on win/timeout, 1 on death (so scripts can chain)"
+  ],
+  "depends_on": [
+    "T0006-implement-the-tick-based-game-loop"
+  ],
+  "decision_refs": [
+    "0001-choose-the-implementation-language",
+    "0004-define-the-tick-loop-and-termination-conditions"
+  ],
+  "priority": "p0",
+  "labels": [
+    "cli"
+  ],
+  "assignee_hint": "agent",
+  "created_at": "2026-05-17T04:14:22.532Z",
+  "updated_at": "2026-05-17T04:14:22.532Z"
+}
diff --git a/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md
new file mode 100644
index 0000000..ba9f268
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/reference/tasks/T0007-implement-cli-entry-script.md
@@ -0,0 +1,23 @@
+# T0007-implement-cli-entry-script — Implement CLI entry script
+
+| Field | Value |
+| --- | --- |
+| Status | `open` |
+| Priority | `p0` |
+| Estimate | 1 hours (high confidence) |
+| Depends on | `T0006-implement-the-tick-based-game-loop` |
+| Decision refs | `0001-choose-the-implementation-language` — Choose the implementation language; `0004-define-the-tick-loop-and-termination-conditions` — Define the tick loop and termination conditions |
+| Assignee hint | agent |
+| Labels | `cli` |
+| Updated | 2026-05-17T04:14:22.532Z |
+
+## Description
+
+Build src/__main__.py: argparse for --strategy (or read from stdin), --model (default gpt-4o), --max-steps (default 50). Construct AgentClient, build default room, call run_game. Print the final outcome. Document the env vars (OPENAI_API_KEY) and a sample invocation in README.
+
+## Acceptance criteria
+
+- [ ] python -m src --strategy "cautious explorer" runs end-to-end
+- [ ] README has a complete example invocation
+- [ ] --help prints usage
+- [ ] Exit code 0 on win/timeout, 1 on death (so scripts can chain)
diff --git a/benchmarks/roguelike-ai-poc/run.sh b/benchmarks/roguelike-ai-poc/run.sh
new file mode 100755
index 0000000..67915d1
--- /dev/null
+++ b/benchmarks/roguelike-ai-poc/run.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Run the roguelike-ai-poc benchmark prompt against a fresh tmp dir.
+# Requires OPENAI_API_KEY in the environment.
+# Usage:
+#   ./run.sh                            # run with defaults
+#   OUT=./my-output ./run.sh            # specify output dir (relative to your cwd)
+#   MODEL=gpt-4o-mini ./run.sh          # override model
+# Exits with status 2 when OPENAI_API_KEY is missing.
+
+set -euo pipefail
+
+if [[ -z "${OPENAI_API_KEY:-}" ]]; then
+  echo "OPENAI_API_KEY not set — refusing to run." >&2
+  exit 2
+fi
+
+# Locate this script's directory and the repo root two levels above it.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$HERE/../.." && pwd)"
+OUT="${OUT:-$(mktemp -d -t dr-bench-roguelike-XXXX)}"
+# Make OUT absolute before the cd below; otherwise a relative OUT such as
+# ./my-output would be resolved against server/ instead of the caller's cwd.
+mkdir -p "$OUT"
+OUT="$(cd "$OUT" && pwd)"
+
+DESCRIPTION="A minimal roguelike where the player primes an AI agent with a strategy, then the agent autonomously navigates a single ASCII-rendered room over a tick system until it wins the objective or dies. Goal: prove the agent-as-player concept with the smallest viable surface area."
+
+# Build the CLI only when no compiled entry point exists; build output goes to stderr.
+cd "$REPO_ROOT/server"
+[[ -f dist/cli.js ]] || npm run build >&2
+
+# --model is passed only when MODEL is set and non-empty.
+node dist/cli.js \
+  --title "AI-driven roguelike POC" \
+  --description "$DESCRIPTION" \
+  --effort poc \
+  --cwd "$OUT" \
+  --yes \
+  ${MODEL:+--model "$MODEL"}
+
+echo ""
+echo "── Benchmark artifacts at: $OUT"
+echo "Compare with: $HERE/reference/"
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..2063fb4
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,50 @@
+# Documentation
+
+The decision-record docs follow the [Diátaxis](https://diataxis.fr) framework — four kinds of documentation, each serving a different need.
+
+| You want to… | Read |
+|---|---|
+| **Learn** by following a guided first run | [Tutorials](tutorials/) |
+| **Accomplish** a specific task | [How-to guides](how-to/) |
+| **Look up** facts about a flag, tool, or schema | [Reference](reference/) |
+| **Understand** the design — why things are the way they are | [Explanation](explanation/) |
+
+## Start here
+
+**Brand new?** → [Your first plan](tutorials/your-first-plan.md) (15 minutes, end-to-end).
+
+**Already installed and want to accomplish a specific task?** → [How-to guides](how-to/).
+
+**Need the exact spec?** → [Reference](reference/).
+
+**Want the rationale?** → [Explanation](explanation/) — especially [why decision records](explanation/why-decision-records.md) and [design rationale](explanation/design-rationale.md).
+
+## Index
+
+### Tutorials
+- [Your first plan](tutorials/your-first-plan.md) — run the roguelike benchmark prompt end-to-end
+
+### How-to guides
+- [Install the plugin or CLI](how-to/install.md)
+- [Run the CLI](how-to/run-the-cli.md) — idea, PRD, resume
+- [Configure LLM providers](how-to/configure-providers.md) — OpenAI, OpenRouter, Ollama, vLLM, LiteLLM
+- [Hand off to Linear](how-to/handoff-to-linear.md)
+- [Calibrate gates](how-to/calibrate-gates.md) — `poc` / `mvp` / `full` + overrides
+
+### Reference
+- [CLI](reference/cli.md) — every flag, env var, exit code
+- [MCP tools](reference/mcp-tools.md) — full tool surface
+- [Data model](reference/data-model.md) — entities, fields, types
+- [Gates](reference/gates.md) — per-phase gate matrix
+
+### Explanation
+- [Why decision records?](explanation/why-decision-records.md) — Joel Parker Henderson's canonical material
+- [Design rationale](explanation/design-rationale.md) — why the filesystem, why hard gates, why a lens-rotating skeptic
+- [The five phases](explanation/the-five-phases.md) — what each phase does and why this shape
+
+## Outside the docs tree
+
+- [Repo README](../README.md) — overview, status, install summary
+- [CONTRIBUTING](../CONTRIBUTING.md) — how to contribute seeds, templates, and code
+- [Benchmarks](../benchmarks/) — canonical prompts we use to spot regressions
+- [Schemas](../schemas/) — JSON Schema source of truth for every entity
diff --git a/docs/architecture.md b/docs/architecture.md
deleted file mode 100644
index 60ed315..0000000
--- a/docs/architecture.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Architecture
-
-The decision-record plugin is two pieces:
-
-1. **An MCP server** (`server/`) — TypeScript, speaks the Model Context Protocol over stdio. Stateless aside from in-flight handling; durable state lives on disk in the target repo.
-2. **A Claude Code plugin** (`.claude-plugin/`, `commands/`, `agents/`) — declares the slash command and sub-agents that drive the pipeline through MCP tool calls.
-
-This document covers the data model, the gate machine, and the rationale for each design choice.
-
-## Data model
-
-JSON Schema source of truth lives in [`schemas/`](../schemas/). Zod mirrors live in [`server/src/schemas/index.ts`](../server/src/schemas/index.ts).
-
-### Entity overview
-
-| Entity | Cardinality | File | Source of truth |
-| --- | --- | --- | --- |
-| Project | 1 per repo | `dr/project.json` | This file |
-| PipelineState | 1 per repo | `.dr/state.json` | This file |
-| Event | many, append-only | `.dr/events.jsonl` | This file (one entry per line) |
-| Decision | 0..N | `dr/decisions/<id>.json` | This file |
-| Task | 0..N | `dr/tasks/<id>.json` | This file |
-
-Markdown renderings (`*.md`, `index.html`) are **derived** — regenerated by `dr_render` from the JSON. Never edit them directly; they'll be overwritten.
-
-### Project
-
-The MVP manifest. Captures intent, scope, status, effort calibration, and post-handoff metadata.
-
-Key fields:
-- `id` — stable kebab-case slug
-- `status` — current phase (intake/scoping/deciding/decomposing/handing-off/handed-off)
-- `effort_level` — `poc | mvp | full`; calibrates gate strictness
-- `scope` — `{ in_scope, out_of_scope, success_criteria, nice_to_have }`
-- `sign_offs` — array of phase-level sign-offs (`{ phase, by, actor, at, notes }`)
-- `handoff` — populated at `handing-off → handed-off` with target + identifiers
-- `gate_config` — `{ preset, overrides }`; overrides take precedence per-knob
-
-### Decision (DR)
-
-A single significant choice. Mirrors Joel Parker Henderson's canonical template structure (issue, assumptions, constraints, positions, opinions, argument, implications, related) with a few additions for the pipeline:
-
-- `template_variant` — `canonical | lightweight | scoping | vendor | architecture | data-model`. Affects rendering and (eventually) which sections are required.
-- `status` — `rfc | proposed | accepted | rejected | deprecated | superseded`. Only `accepted` satisfies the deciding gate.
-- `selected_position` — title of the winning position (must exist in `positions`).
-- `review[]` — antagonistic-review passes (`reviewer`, `lens`, `verdict`, `score`, `concerns`).
-- `sign_off` — final acceptance record (`by`, `actor`, `at`, `notes`).
-- `depends_on[]` — IDs of decisions that must be accepted first.
-- `seed_origin` — name of the seed template this DR was instantiated from, if any.
-
-### Task
-
-A beads-style work unit. Pre-handoff only — post-handoff lifecycle lives in Linear (or wherever else).
-
-- `status` — `open | ready | in_progress | done | blocked | deferred`
-- `priority` — `p0..p3`
-- `estimate` — `{ unit: 'hours'|'days', value, confidence }`
-- `acceptance_criteria` — concrete done-when statements
-- `depends_on[]` — task IDs that must complete first
-- `decision_refs[]` — DR IDs this task implements (traceability)
-- `external_ref` — set at handoff to the target system's identifier
-
-### PipelineState
-
-Internal state. Never edited by hand.
-
-- `phase` — same as `Project.status` but read-only from the pipeline's perspective
-- `effective_gate_config` — materialized gate (preset merged with overrides) for fast lookup
-- `next_decision_seq` / `next_task_seq` — monotonically-increasing counters
-- `pending_questions[]` — open questions the agent has surfaced
-- `gate_failures[]` — history of failed `dr_advance` attempts (useful for the agent to remember what to fix)
-
-### Event
-
-One JSONL line per state change. The events log is append-only and is the audit trail.
-
-Event kinds include: `project_initialized`, `phase_advanced`, `phase_advance_blocked`, `scope_updated`, `decision_proposed`, `decision_updated`, `decision_reviewed`, `decision_accepted`, `decision_rejected`, `task_proposed`, `task_updated`, `task_status_changed`, `graph_validated`, `gate_check_passed`, `gate_check_failed`, `question_asked`, `question_answered`, `seed_loaded`, `render_run`, `export_started`, `export_completed`, `export_failed`, `sign_off_recorded`.
-
-A future UI can replay this stream to reconstruct any historical state.
-
-## Gate machine
-
-The pipeline is a state machine with hard gates. Phases:
-
-```
-intake → scoping → deciding → decomposing → handing-off → handed-off
-```
-
-`dr_advance` is the only way to transition. The server evaluates the gate for the *next* phase against the current state. If all gate checks pass and any required sign-off is provided, the phase changes and an event is emitted. Otherwise, gate-failure reasons come back unchanged.
-
-### Per-phase checks
-
-| Phase advancing to | Checks |
-| --- | --- |
-| `scoping` | Project has title and description |
-| `deciding` | `scope.in_scope` non-empty; `scope.success_criteria` non-empty; if review_required_phases includes 'scoping', a scoping-variant DR has a passing review |
-| `decomposing` | ≥ min_decisions; if `decisions_required_status === 'accepted'`, no decisions in `proposed`/`rfc`; if `review_required_per_decision`, every accepted decision has a passing review; if `review_required_phases` includes 'deciding', at least one decision has a passing review; no dangling decision dependencies |
-| `handing-off` | ≥ min_tasks; no dangling task dependencies; no cycles; every task has an estimate ≤ max_task_estimate_hours; every task's `decision_refs` resolve |
-| `handed-off` | `project.handoff` exists (run dr_export_filesystem or dr_export_linear first) |
-
-### Sign-off requirement
-
-Each phase transition can require human sign-off via `require_human_signoff_phases`. When set, `dr_advance` only proceeds if you pass `sign_off_by: 'human'`. The agent cannot self-approve a human-required gate.
-
-## Gate configuration
-
-Three preset tiers calibrate strictness:
-
-```
-poc:
-  decisions_required_status: accepted
-  review_required_phases: []
-  review_required_per_decision: false
-  max_task_estimate_hours: 16
-  require_human_signoff_phases: [handing-off]
-  min_decisions: 0
-  min_tasks: 3
-
-mvp:
-  decisions_required_status: accepted
-  review_required_phases: [scoping, decomposing]
-  review_required_per_decision: false
-  max_task_estimate_hours: 8
-  require_human_signoff_phases: [scoping, decomposing, handing-off]
-  min_decisions: 3
-  min_tasks: 8
-
-full:
-  decisions_required_status: accepted
-  review_required_phases: [scoping, deciding, decomposing]
-  review_required_per_decision: true
-  max_task_estimate_hours: 4
-  require_human_signoff_phases: [scoping, deciding, decomposing, handing-off]
-  min_decisions: 6
-  min_tasks: 15
-```
-
-`gate_overrides` on the project let you tune individual knobs without changing preset:
-
-```json
-{
-  "preset": "mvp",
-  "overrides": {
-    "min_tasks": 5,
-    "review_required_phases": ["scoping"]
-  }
-}
-```
-
-The materialized result lives at `state.effective_gate_config` for fast lookup.
-
-## MCP tool surface
-
-| Group | Tools |
-| --- | --- |
-| Pipeline | `dr_init`, `dr_status`, `dr_advance`, `dr_update_project`, `dr_update_scope` |
-| Decisions | `dr_propose_decision`, `dr_update_decision`, `dr_review_decision`, `dr_accept_decision`, `dr_reject_decision`, `dr_list_decisions`, `dr_get_decision`, `dr_ready_decisions` |
-| Tasks | `dr_propose_task`, `dr_update_task`, `dr_set_task_status`, `dr_list_tasks`, `dr_get_task`, `dr_ready_tasks`, `dr_validate_graph` |
-| Seeds | `dr_seed_search`, `dr_seed_list`, `dr_seed_get`, `dr_seed_load` |
-| Render | `dr_render` |
-| Handoff | `dr_export_filesystem`, `dr_export_linear` |
-
-All tools accept `cwd` (target repo) and default to `process.cwd()` when omitted.
-
-## Why this shape
-
-### Why filesystem instead of SQLite
-
-[beads_rust](https://github.com/Dicklesworthstone/beads_rust) uses SQLite + JSONL. We picked filesystem-only because the user prefers data-driven artifacts that are git-diffable and human-readable, and because the working set is small (tens of decisions, dozens of tasks). The JSONL event log gives us the audit trail without the SQLite dependency.
-
-### Why TypeScript
-
-Best fit for a Claude Code plugin. Easy to iterate on prompts and templates. Smaller install footprint than a Python/Rust toolchain. We can revisit if performance ever matters (it won't at this scale).
-
-### Why hard gates instead of soft suggestions
-
-Soft gates degrade. People learn to skip them. By making the wizard refuse to emit a "ship-ready plan" until criteria are met, the artifact becomes trustworthy: if it exists, it's complete.
-
-### Why per-project calibration
-
-Not every project deserves a SWOT analysis. The POC preset removes ceremony for hack-day work; the Full preset keeps it for regulated or production-grade work. The user picks at init time.
-
-### Why state-driven over form-driven
-
-A rigid form would force the wizard to ask the same questions in the same order regardless of project shape. State-driven means: the agent reads what's in the state, identifies what's missing for the gate, and picks the next question. This is the pattern Automaker's resume-check uses ([reference here](https://github.com/protoLabsAI/automaker)).
-
-### Why antagonistic review
-
-Decisions made fast without pushback ossify. The `dr-skeptic` agent forces a structured "what could go wrong here?" pass before accepting. Inspired by Automaker's two-reviewer pattern (Ava operational + Jon strategic).
-
-### Why Linear as the primary handoff target
-
-The user works in Linear. Linear's official MCP server is mature; Linear's data model (Project + Issue + Project-Update + Initiative + Milestone) maps cleanly to our manifest + tasks. Other target adapters (Plane, GitHub Projects, Jira) can be added by following the `handoff/linear.ts` pattern.
-
-## Versioning
-
-`PipelineState.schema_version` is the durable contract. We bump it on breaking layout changes. The server refuses to mutate older versions until migrated. There's no migration tooling yet — when we cross 1.0, we'll add it.
diff --git a/docs/explanation/design-rationale.md b/docs/explanation/design-rationale.md
new file mode 100644
index 0000000..7b759a7
--- /dev/null
+++ b/docs/explanation/design-rationale.md
@@ -0,0 +1,104 @@
+# Design rationale
+
+The decisions behind how this system is built. Use these when you want to understand "why this way and not the obvious other way."
+
+## Hard gates instead of soft suggestions
+
+Soft gates degrade. People learn to skip them, the optional becomes invisible, and within a few iterations the artifact stops being trustworthy. We made every phase transition refusal-by-default: if a gate fails, the wizard returns reasons, does not advance, and there is no `--force`. The artifact's value is the assurance that everything it claims is real.
+
+Consequence: when a gate is too strict, you change the gate, not bypass it. The `gate_config.overrides` mechanism is the official escape hatch — explicit and recorded.
+
+## Five phases, exactly
+
+Intake → Scoping → Deciding → Decomposing → Handoff is the smallest sequence that gives each artifact a clean home and makes ordering load-bearing:
+
+- **Intake** captures the seed.
+- **Scoping** sets the perimeter before decisions are made (so decisions can be evaluated against scope).
+- **Deciding** resolves significant choices before tasks are written (so tasks can reference decisions for traceability).
+- **Decomposing** turns decisions into work (so the work shape follows from the choices).
+- **Handoff** finalizes (so the artifact has a clear "done" state).
+
+We tried collapsing decisions and decomposition. The decomposer ended up making decisions in passing — implicit, unreviewed, untraceable. Splitting the phases forced decisions to be first-class.
+
+## File-system, not a database
+
+[beads_rust](https://github.com/Dicklesworthstone/beads_rust) uses SQLite + JSONL. We went filesystem-only:
+
+- The working set is small (tens of decisions, dozens of tasks).
+- JSON files diff well in git; engineers can read them without tooling.
+- A future UI can read the same files; no schema migration tax.
+- The JSONL event log gives us the audit trail without the DB dependency.
+
+The trade-off: queries are O(N) directory scans. Acceptable at our scale. If we ever need cross-project indexing or multi-user concurrency, we revisit.
+
+## TypeScript everywhere
+
+Single language across the MCP server, CLI, and tests. Best fit for the Claude Code plugin ecosystem. The `openai` SDK is mature in TypeScript. Iterating on prompts and templates is fast. We considered Rust to match beads_rust's philosophy — rejected because we iterate on prompts more than perf, and a 100KB CLI bundle is fine.
+
+## OpenAI-compatible, single provider
+
+We initially planned dual backends (Anthropic SDK + openai SDK). Cut to OpenAI-compat only because:
+
+- A single SDK is half the surface area to maintain.
+- `OPENAI_BASE_URL` already covers Anthropic-via-OpenRouter, local Ollama/vLLM, LiteLLM proxies, and most enterprise gateways.
+- The agents do straightforward tool calling; nothing requires a vendor-specific SDK feature.
+
+If we ever need Anthropic-native features (cache_control, adaptive thinking), we add a thin adapter — but we don't anticipate it.
+
+## Antagonistic review with lens rotation
+
+We use a `dr-skeptic` sub-agent that reviews decisions through one specific lens (operational, strategic, security, cost, user-impact) per invocation. For the `full` preset, every decision runs through all five lenses.
+
+Inspired by Automaker's two-reviewer pattern (Ava operational + Jon strategic), but generalized: the lens menu is open-ended, and each lens is its own scoped prompt instead of a single reviewer trying to hold all perspectives at once. A focused agent finds more concrete concerns than a broad one.
+
+The skeptic doesn't have to win. A human can override `block` verdicts with explicit sign-off. But the lens output is recorded on the DR forever — visible to anyone who reads it later.
+
+## State-driven, not form-driven
+
+The wizard's job is to read the current state, identify what's missing for the next gate, and pick the next action. It is not a fixed Q&A sequence. This matches Automaker's resume-check pattern — drop in mid-pipeline, the wizard recovers gracefully.
+
+Practical consequence: every wizard invocation starts with `dr_status`. There's no implicit conversation state in the agent loop; everything is on disk.
+
+## Pre-MVP only, deliberately
+
+The pipeline stops at `handed-off`. We don't track post-handoff execution. That belongs in whatever execution system the team uses — Linear, Plane, GitHub Projects, etc.
+
+Why: planning tools that grow into execution tools accumulate scope until they're nothing in particular. By stopping at handoff, the boundary is clear: the plan is the artifact; execution is somebody else's tool.
+
+## Per-project gate calibration
+
+A weekend hack does not need the same gates as a regulated production rollout. Three presets (`poc`, `mvp`, `full`) calibrate strictness; per-knob overrides handle the edge cases. Picked at init.
+
+This was the user feedback that shaped the gate machine: the same hard-gate philosophy can apply to wildly different project shapes, as long as the strictness scales.
+
+## Seed library
+
+A small set (currently nine) of canned decisions for territory the agent will repeatedly see: language, runtime, data store, auth, deployment, CI/CD, testing, observability, scope-statement. Each is a starter — the agent loads it and customizes for the project.
+
+Why ship these: avoids the agent rediscovering the same trade-offs each project. The seed encodes prior pattern-matching as a starting point, not a final answer. The user can fork the seed library and add their team's defaults.
+
+## Linear as the primary handoff target
+
+The user's primary use case is Linear; the data model maps cleanly. We use Linear's GraphQL API directly with an API key, not their MCP server, because:
+
+- We need precise control over the project/issue/relation creation sequence.
+- The GraphQL API is mature and well-documented.
+- Adding MCP-server-as-downstream adds an extra dependency layer for a one-shot operation.
+
+Other handoff targets follow the `server/src/handoff/linear.ts` pattern: `buildExportPlan` (pure, testable) + per-target API calls.
+
+## What we explicitly didn't build
+
+- **A web UI** — the data model is UI-ready (JSON-everywhere, JSONL event log) but we ship Markdown + static HTML for now. UI work would dwarf the pipeline work.
+- **Real-time multi-user collaboration** — single-user, single-machine. The artifact is git-tracked; that's how teams share.
+- **A built-in LLM** — we depend on OpenAI-compat endpoints. No model bundling.
+- **Reconciliation for partial Linear exports** — a known follow-up. For now, a failed export means deleting the partial Linear project and re-running.
+- **A CI integration** — nothing beyond this repo's own test suite. The plugin produces artifacts; what teams do with them in CI is up to the team.
+
+## Open questions
+
+- Does the lens-rotating skeptic produce meaningfully better decisions than a single skeptic? Needs benchmark data over time.
+- Is the nine-seed library the right size? Probably grows.
+- Should `handed-off` have a "re-open for amendment" path? Currently it's a terminal state.
+
+We track these by re-running benchmarks as the system changes.
diff --git a/docs/explanation/the-five-phases.md b/docs/explanation/the-five-phases.md
new file mode 100644
index 0000000..b1352a0
--- /dev/null
+++ b/docs/explanation/the-five-phases.md
@@ -0,0 +1,133 @@
+# The five phases
+
+The pipeline has exactly five phases between an idea and a ship-ready plan, followed by a terminal `handed-off` state. Each phase has a single job; each transition is gated.
+
+```
+intake → scoping → deciding → decomposing → handing-off → handed-off
+```
+
+This page explains what each phase accomplishes and why it exists.
+
+## Intake
+
+**Job:** Capture the idea.
+
+**Inputs:** a one-line idea, an optional PRD, an effort-level choice.
+
+**Outputs:** a `Project` object with title, description, and effort_level set, and everything else empty.
+
+**Gate to next phase:** title and description non-empty.
+
+**Why it exists:** to write the seed down. Until the idea has an `id` on disk, the wizard has nothing to read on subsequent turns. Intake is mechanical and fast.
+
+## Scoping
+
+**Job:** Pin the MVP perimeter.
+
+**Inputs:** the project description, optionally a PRD, optionally a `scope-statement` seed.
+
+**Outputs:**
+
+- `project.scope.in_scope` — capabilities the MVP MUST ship
+- `project.scope.out_of_scope` — explicit non-goals (this is the load-bearing list)
+- `project.scope.success_criteria` — measurable signals
+- `project.scope.nice_to_have` — optional capabilities
+- Under `mvp`/`full` presets: a `scope-statement` DR with a selected shape (lean / walking-skeleton / polished) and an argument
+
+**Gate to next phase:** `in_scope` and `success_criteria` non-empty. Under `mvp`/`full`, the scope DR has a passing review.
+
+**Why it exists:** without explicit scope, decisions and tasks expand silently. Pinning scope first means every decision evaluated against it has a clear target. The `out_of_scope` list, in particular, is the thing that prevents scope creep later — it records the explicit non-goals, so a capability that isn't on the `in_scope` list can't quietly slip into the plan.
+
+## Deciding
+
+**Job:** Resolve significant decisions.
+
+**Inputs:** the scoped project. Each decision area is a "would otherwise be re-litigated" choice — language, data store, auth, deployment target, agent contract, etc.
+
+**Outputs:** a set of `Decision` records, each with:
+
+- An issue framing
+- 2–4 positions with pros/cons
+- A `selected_position` and an `argument`
+- Under `full` preset: one `Review` entry per lens (operational, strategic, security, cost, user-impact)
+- Final `status: accepted` with a `sign_off`
+
+**Gate to next phase:** ≥ `min_decisions` count; every decision either `accepted` or `rejected` (no in-flight `proposed` or `rfc`); per-decision review passed if `review_required_per_decision`; no dangling decision dependencies.
+
+**Why it exists:** decisions made implicitly during decomposition are untraceable. Forcing them into first-class records means future-you (or future-them) can see why the team chose X. The `seed_origin` field also lets the agent learn from past projects without redeciding the obvious.
+
+## Decomposing
+
+**Job:** Turn decisions into a task graph.
+
+**Inputs:** accepted decisions + scope. Each task is a vertical slice that ships some user-visible behavior end-to-end, sized to fit under the preset's `max_task_estimate_hours`.
+
+**Outputs:** a set of `Task` records, each with:
+
+- A title and description
+- Acceptance criteria (concrete done-when statements)
+- An estimate (hours/days + confidence)
+- `decision_refs` linking back to the decisions it implements
+- `depends_on` for ordering
+
+**Gate to next phase:** ≥ `min_tasks`; no cycles; no orphan dependencies; every estimate within budget; every `decision_refs` resolves; under `mvp`/`full`, the decomposing phase has been reviewed.
+
+**Why it exists:** without explicit dependencies, the team works in arbitrary order and discovers blockers late. The dependency graph makes the order legible. The `decision_refs` make traceability automatic — if a decision changes, you can find every task affected.
+
+## Handing off
+
+**Job:** Finalize the plan into a target system.
+
+**Inputs:** the validated decision + task graph; a handoff target (Linear or filesystem).
+
+**Outputs:**
+
+- For Linear: a Linear Project, an Issue per decision (labeled `decision`), an Issue per task with priority/estimate/acceptance criteria, `blocks` relations for `depends_on`. Each task's local JSON gets an `external_ref` for traceability.
+- For filesystem: the `dr/` tree is finalized, `project.json.handoff` is set, mutations are halted.
+
+**Gate to next phase:** `project.handoff` set; sign-off provided.
+
+**Why it exists:** to mark the plan as complete and hand it to the execution system. After this point, the pipeline considers the work done; ongoing changes happen wherever the engineering team works.
+
+## Handed off (terminal)
+
+**Job:** Hold the final state.
+
+**Inputs:** the finished pipeline.
+
+**Outputs:** none. This is a terminal state — `dr_advance` from `handed-off` returns null.
+
+**Why it exists:** the pipeline has a clear "done." There is no post-handoff lifecycle in this system; that belongs in Linear/Plane/wherever.
+
+## Why exactly these five
+
+We tried a few alternative shapes:
+
+- **Three phases** (idea → plan → handoff) — too coarse; the agent had to make scope decisions and task decisions in the same step, and they collapsed into each other.
+- **Seven phases** (adding "research" before scope and "verification" before handoff) — felt heavier than the workload warranted. The agent can pull research into scoping; verification is what the gates already do.
+- **No explicit handoff phase** (just an export tool) — the export ended up being the implicit handoff, but without a phase boundary the gate machine couldn't enforce sign-off and completeness.
+
+The current shape is the smallest that gives each artifact a single owner and makes every transition load-bearing.
+
+## What happens between phases
+
+Between phases, the wizard:
+
+1. Reads the current state with `dr_status`.
+2. Evaluates the gate to the next phase.
+3. If passing and no human sign-off is required, calls `dr_advance` directly.
+4. If passing and human sign-off is required, prompts the user (or auto-confirms under `--yes`).
+5. If failing, surfaces the gate reasons and tries to make the agent fix them — usually by running the phase's sub-agent again.
+
+The phase machine is therefore not just "what's the next thing" — it's "what gate is blocking us, and what work closes that gate."
+
+## State-driven progression
+
+Critically: phase progression is **state-driven, not turn-driven**. The wizard doesn't say "we just finished scoping so I'll move to deciding." It says "scope is non-empty, the scope DR is reviewed, the gate passes, so I'll advance." This means:
+
+- The wizard can resume cleanly mid-phase.
+- Partial work isn't wasted.
+- A human can edit `project.json` between sessions and the wizard adapts.
+- Phase order is enforced by the gate machine, not by the agent's memory.
+
+That's the underlying primitive that makes the rest work.
diff --git a/docs/upstream-canon.md b/docs/explanation/why-decision-records.md
similarity index 100%
rename from docs/upstream-canon.md
rename to docs/explanation/why-decision-records.md
diff --git a/docs/how-to/calibrate-gates.md b/docs/how-to/calibrate-gates.md
new file mode 100644
index 0000000..6bffb99
--- /dev/null
+++ b/docs/how-to/calibrate-gates.md
@@ -0,0 +1,79 @@
+# Calibrate gates
+
+The pipeline is hard-gated — every phase transition checks a set of conditions, and refuses to advance if they're not met. The strictness of those conditions is set per-project by an **effort level** preset, with optional per-knob overrides.
+
+## Choose a preset
+
+```bash
+decision-record --idea "…" --effort poc    # loosest
+decision-record --idea "…" --effort mvp    # default
+decision-record --idea "…" --effort full   # strictest
+```
+
+| Knob | `poc` | `mvp` (default) | `full` |
+|---|---|---|---|
+| Minimum decisions to advance from deciding | 0 | 3 | 6 |
+| Minimum tasks to advance from decomposing | 3 | 8 | 15 |
+| Max hours per leaf task | 16 | 8 | 4 |
+| Phases that require reviewed scope/decisions/decomp | (none) | scoping, decomposing | scoping, deciding, decomposing |
+| Every DR reviewed individually (lens-rotating skeptic) | no | no | **yes** |
+| Phases that require human sign-off | handing-off | scoping, decomposing, handing-off | scoping, deciding, decomposing, handing-off |
+
+**When to use each:**
+
+- **`poc`** — weekend hacks, prototypes, internal-only spikes. Minimal ceremony.
+- **`mvp`** (default) — a real product slice. Scope and decomposition get scrutiny; individual decisions don't get a full review pass.
+- **`full`** — production work, regulated domains, anything where reading the decisions in six months matters. Every DR is reviewed by the lens-rotating skeptic before acceptance.
+
+## Override individual knobs
+
+Sometimes a preset is close but one knob is off. Start from the preset, then override just that knob:
+
+```bash
+# Goal: MVP defaults, but require only 5 tasks instead of 8.
+# (Override flags coming — for now, init with the preset, then use the MCP dr_update_project tool.)
+decision-record --idea "…" --effort mvp
+```
+
+> The CLI does not currently expose per-knob overrides as flags. You can override them by calling `dr_update_project` via the MCP server, or by editing `dr/project.json` directly (then re-running with `--resume`). A `--gate-override key=value` flag is a planned addition.
+
+### Override schema
+
+`project.json` has a `gate_config.overrides` object. Any knob you set there wins over the preset:
+
+```json
+{
+  "gate_config": {
+    "preset": "mvp",
+    "overrides": {
+      "min_tasks": 5,
+      "review_required_per_decision": true,
+      "max_task_estimate_hours": 6
+    }
+  }
+}
+```
+
+Available override knobs:
+
+| Key | Type | Effect |
+|---|---|---|
+| `decisions_required_status` | `"accepted"` \| `"any"` | What DR status counts toward the deciding gate. Use `"any"` to allow rejection without re-deciding. |
+| `review_required_phases` | `string[]` | Phases at which an antagonistic review must happen before advance. |
+| `review_required_per_decision` | `boolean` | If true, every DR needs a passing review before acceptance. |
+| `max_task_estimate_hours` | `number` | Leaf task estimate ceiling. |
+| `require_human_signoff_phases` | `string[]` | Phases that need human (not agent) sign-off to advance. |
+| `min_decisions` | `integer` | Minimum decisions to advance from deciding. |
+| `min_tasks` | `integer` | Minimum tasks to advance from decomposing. |
+
+## Inspect the effective config
+
+```bash
+cat <cwd>/.dr/state.json | jq '.effective_gate_config'
+```
+
+The `effective_gate_config` is the materialized preset + overrides; it's what the gate evaluator actually checks against. Edit `project.json` overrides, then re-run with `--resume` to see the change.
+
+## Why hard gates?
+
+Soft gates degrade. People learn to skip them. By refusing to emit a "ship-ready plan" until the criteria are met, the resulting artifact becomes trustworthy: if it exists, it's complete. See [the design rationale](../explanation/design-rationale.md) for the longer version.
diff --git a/docs/how-to/configure-providers.md b/docs/how-to/configure-providers.md
new file mode 100644
index 0000000..19e68f5
--- /dev/null
+++ b/docs/how-to/configure-providers.md
@@ -0,0 +1,103 @@
+# Configure LLM providers
+
+The CLI uses the **OpenAI-compatible** API surface. Anything that speaks that protocol works — OpenAI itself, OpenRouter, Ollama, vLLM, LiteLLM, etc.
+
+## OpenAI (the default)
+
+```bash
+export OPENAI_API_KEY=sk-…
+decision-record --idea "…"
+```
+
+Default model: `gpt-4o`. Override per-call:
+
+```bash
+decision-record --idea "…" --model gpt-4o-mini
+```
+
+Or persistently:
+
+```bash
+export OPENAI_MODEL=gpt-4o-mini
+```
+
+## OpenRouter
+
+[OpenRouter](https://openrouter.ai/) proxies many providers behind a single OpenAI-compatible endpoint.
+
+```bash
+export OPENAI_API_KEY=sk-or-…
+export OPENAI_BASE_URL=https://openrouter.ai/api/v1
+export OPENAI_MODEL=anthropic/claude-sonnet-4-6
+decision-record --idea "…"
+```
+
+## Ollama (local)
+
+[Ollama](https://ollama.com/) serves an OpenAI-compatible endpoint on `:11434`.
+
+```bash
+ollama pull llama3.1:70b      # one time
+ollama serve                  # if not already running
+```
+
+```bash
+export OPENAI_API_KEY=ollama   # any non-empty string works
+export OPENAI_BASE_URL=http://localhost:11434/v1
+export OPENAI_MODEL=llama3.1:70b
+decision-record --idea "…"
+```
+
+> **Tool calling matters.** The agents rely on the model emitting tool calls. Verify your local model supports OpenAI-style function calling before running a full pipeline. Smaller models often struggle here.
+
+## vLLM (self-hosted)
+
+[vLLM](https://github.com/vllm-project/vllm) exposes an OpenAI-compatible server.
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model meta-llama/Llama-3.1-70B-Instruct \
+  --port 8000
+```
+
+```bash
+export OPENAI_API_KEY=any-string
+export OPENAI_BASE_URL=http://localhost:8000/v1
+export OPENAI_MODEL=meta-llama/Llama-3.1-70B-Instruct
+```
+
+## LiteLLM proxy
+
+[LiteLLM](https://github.com/BerriAI/litellm) is a universal proxy that converts many providers to OpenAI format. Once running:
+
+```bash
+export OPENAI_API_KEY=sk-litellm-…
+export OPENAI_BASE_URL=http://localhost:4000/v1
+export OPENAI_MODEL=gpt-4o  # the alias you defined in litellm config
+```
+
+## Per-invocation overrides
+
+All env vars have CLI equivalents that take precedence:
+
+```bash
+decision-record \
+  --api-key sk-… \
+  --base-url https://openrouter.ai/api/v1 \
+  --model anthropic/claude-opus-4-7 \
+  --idea "…"
+```
+
+## Choosing a model
+
+The agents do a lot of tool calling and structured reasoning. Models that work well:
+
+| Model | Notes |
+|---|---|
+| `gpt-4o` | Default; reliable tool calling, good reasoning |
+| `gpt-4o-mini` | Faster and cheaper; works for `poc` and many `mvp` projects |
+| Claude Sonnet 4.6 via OpenRouter | Strong on long-form reasoning and skeptic critique |
+| Claude Opus 4.7 via OpenRouter | Highest-quality decisions and decompositions; slower and pricier |
+| Local Llama 3.1 70B+ | Workable if your tooling supports function calling; weaker on subtle critique |
+
+Pick based on the project's criticality. POC throwaway → `gpt-4o-mini`. Production decision that other people will read → `gpt-4o` or Sonnet/Opus.
diff --git a/docs/how-to/handoff-to-linear.md b/docs/how-to/handoff-to-linear.md
new file mode 100644
index 0000000..0364f20
--- /dev/null
+++ b/docs/how-to/handoff-to-linear.md
@@ -0,0 +1,83 @@
+# Hand off to Linear
+
+When the pipeline reaches the handoff phase, the wizard can push the finished plan into Linear — a Project containing one Issue per task and one Issue (labeled `decision`) per accepted DR, with `blocks` relations matching task dependencies.
+
+## One-time setup
+
+1. **Get a Linear API key.**
+   Settings → API → Personal API keys → "New". Copy the `lin_api_…` value.
+
+2. **Find your team ID.**
+   Two easy ways:
+   - In Linear, open any issue → look at the URL: `linear.app/<workspace>/issue/<TEAM-N>` — the `TEAM` prefix is the team key, not the ID. To get the UUID, use the GraphQL explorer at <https://studio.apollographql.com/public/Linear-API/> or [`linear teams`](https://linear.app/docs/cli) in their CLI.
+   - Or: `curl -X POST https://api.linear.app/graphql -H 'Content-Type: application/json' -H 'Authorization: lin_api_…' -d '{"query":"{ teams { nodes { id name key } } }"}'`
+
+3. **Set env vars:**
+   ```bash
+   export LINEAR_API_KEY=lin_api_…
+   export LINEAR_TEAM_ID=<the UUID>   # optional; you'll be prompted otherwise
+   ```
+
+## Run with handoff to Linear
+
+```bash
+decision-record --idea "…" --cwd ~/dev/my-project
+```
+
+When the wizard reaches the handoff phase, you'll see:
+
+```
+> LINEAR_API_KEY detected. Push the plan to Linear? [Y/n]
+```
+
+Answer yes. The wizard will:
+
+1. Run a **dry-run preview** — building the export plan locally without calling Linear.
+2. Show you the totals: `N issues (M decisions + K tasks)`.
+3. Ask **"Push to Linear now?"** Confirm to fire the real export.
+
+If you ran with `--yes`, both prompts auto-confirm.
+
+## What gets created
+
+| In decision-record | In Linear |
+|---|---|
+| Project manifest (`project.json`) | A new **Project** with the MVP manifest as the description |
+| Each accepted Decision | An **Issue** labeled `decision` + `dr:<variant>`, with the issue/argument/implications in the description |
+| Each Task | An **Issue** with priority + estimate + acceptance criteria as checkboxes |
+| Task `depends_on` relations | Linear `blocks` issue relations |
+| `LINEAR_TEAM_ID` | The team the Project and Issues are created in |
+
+After the export succeeds:
+
+- `dr/project.json` gets a `handoff` block recording the Linear project URL.
+- Each task's JSON gets an `external_ref: { system: "linear", id, url }` for traceability.
+- `dr/index.html` shows a Handoff banner linking to Linear.
+
+## Preview without pushing
+
+The wizard's interactive flow is preview-first by default, so you can see the export plan without calling Linear at all. If you want to script a preview only, invoke the MCP tool directly:
+
+```bash
+node dist/index.js   # start the MCP server, then call dr_export_linear with dry_run=true
+```
+
+Or just run without `--yes` (i.e., not in autonomous mode) and review the dry-run output before answering the confirm prompt. With `--yes`, the push is confirmed automatically.
+
+## Filesystem only
+
+If `LINEAR_API_KEY` is **not** set in the environment, the wizard skips the Linear branch entirely and exports to filesystem. The plan is still complete and shippable — engineers can pick it up from `dr/` directly and create issues themselves wherever they want.
+
+## When it fails partway
+
+The current Linear export is one-shot, not idempotent. If a `dr_export_linear` call fails after creating some issues:
+
+1. The wizard logs `export_failed` to `events.jsonl` and exits with code 1.
+2. **No reconciliation logic** — the partial Linear project exists, but a re-run will create a fresh project alongside it.
+3. Delete the partial project in Linear, then re-run with `--resume`.
+
+A reconciliation pass that detects and continues partial exports is a known follow-up.
+
+## Other handoff targets
+
+The data model is target-agnostic. To support Plane, GitHub Projects, Jira, etc., follow the pattern in `server/src/handoff/linear.ts` — a `buildExportPlan` function plus per-target API calls. PRs welcome.
diff --git a/docs/how-to/install.md b/docs/how-to/install.md
new file mode 100644
index 0000000..0a5e640
--- /dev/null
+++ b/docs/how-to/install.md
@@ -0,0 +1,80 @@
+# Install
+
+Two ways to use decision-record:
+
+1. **Standalone CLI** — fast to set up, no Claude Code dependency.
+2. **Claude Code plugin** — adds the `/plan` slash command and registers the MCP server with Claude Code.
+
+Both share the same MCP server binary, the same artifacts on disk, and the same gate machine.
+
+## Standalone CLI
+
+```bash
+git clone https://github.com/protoLabsAI/decision-record.git
+cd decision-record/server
+npm install
+npm run build
+```
+
+The build produces `dist/cli.js` (CLI) and `dist/index.js` (MCP server). Run the CLI directly:
+
+```bash
+export OPENAI_API_KEY=sk-…
+node dist/cli.js --help
+```
+
+Optionally, symlink it onto your PATH:
+
+```bash
+ln -s "$(pwd)/dist/cli.js" /usr/local/bin/decision-record
+chmod +x /usr/local/bin/decision-record
+decision-record --help
+```
+
+A published-to-npm release is on the roadmap — once shipped, `npx @protolabs/decision-record-server` will work without the clone.
+
+## Claude Code plugin
+
+The repo root contains a `.claude-plugin/plugin.json` and an `.mcp.json` that point Claude Code at the bundled server. To install locally:
+
+```bash
+git clone https://github.com/protoLabsAI/decision-record.git
+cd decision-record/server
+npm install
+npm run build
+cd ..
+
+# Symlink into the Claude plugins directory
+ln -s "$(pwd)" ~/.claude/plugins/decision-record
+```
+
+Restart Claude Code. You should see:
+
+- The `/plan` slash command available
+- The `decision-record` MCP server listed in `/mcp`
+- The `dr-wizard`, `dr-skeptic`, `dr-decomposer` sub-agents available
+
+Trigger a session:
+
+```
+/plan a CLI tool that converts QuickBooks CSV exports
+```
+
+A marketplace-published version is planned. When available, `/plugin install decision-record` will do everything above.
+
+## Verify
+
+```bash
+# Standalone
+node dist/cli.js --version
+# decision-record 0.1.0
+
+# Plugin (inside Claude Code)
+/mcp
+# should list `decision-record` with green status
+```
+
+## Next
+
+- [Run the CLI](run-the-cli.md) — first invocation patterns
+- [Configure LLM providers](configure-providers.md) — OpenAI, OpenRouter, Ollama, vLLM, LiteLLM
diff --git a/docs/how-to/run-the-cli.md b/docs/how-to/run-the-cli.md
new file mode 100644
index 0000000..40bebbe
--- /dev/null
+++ b/docs/how-to/run-the-cli.md
@@ -0,0 +1,114 @@
+# Run the CLI
+
+Four common invocation patterns:
+
+## 1. One-line idea
+
+```bash
+decision-record --idea "a CLI tool that normalizes accounting exports"
+```
+
+The wizard will derive a title from the idea text. The rest of the pipeline runs in the current directory (`.dr/` and `dr/` will appear there).
+
+To target a different directory:
+
+```bash
+decision-record --idea "…" --cwd ~/dev/my-project
+```
+
+## 2. From a PRD file
+
+```bash
+decision-record --prd ./docs/idea.md --cwd ~/dev/my-project
+```
+
+The PRD reader looks for:
+
+- The first `# heading` → title hint
+- The first non-heading paragraph → description hint
+
+The full PRD text is passed to the scoping agent as context. Combine `--prd` with `--idea` if you want to override the title hint:
+
+```bash
+decision-record --prd ./docs/idea.md --idea "ledger CLI" --cwd …
+```
+
+## 3. Resume an in-progress project
+
+If the CLI is interrupted (or you came back later), pick up where you left off:
+
+```bash
+decision-record --resume --cwd ~/dev/my-project
+```
+
+The wizard reads `.dr/state.json`, sees what phase you were in, and continues from there. State is durable across sessions.
+
+## 4. Fully autonomous
+
+The `--yes` flag bypasses every interactive checkpoint:
+
+```bash
+decision-record --idea "…" --effort poc --yes
+```
+
+Useful for CI, scripted runs, or benchmarks. **Read what gets produced** — the wizard will not stop to ask, including at gates that normally require human sign-off.
+
+## Common flags
+
+| Flag | Meaning |
+|---|---|
+| `--idea TEXT` | Free-form one-line idea |
+| `--title TEXT` | Explicit project title (overrides derivation) |
+| `--description TEXT` | Explicit description |
+| `--prd PATH` | Read a Markdown PRD as scope context |
+| `--cwd PATH` | Target project directory (default: `process.cwd()`) |
+| `--effort poc\|mvp\|full` | Gate strictness preset (default: `mvp`) |
+| `--model NAME` | Override `OPENAI_MODEL` |
+| `--api-key KEY` | Override `OPENAI_API_KEY` |
+| `--base-url URL` | Override `OPENAI_BASE_URL` |
+| `--resume` | Skip intake; resume the project in `--cwd` |
+| `--yes`, `-y` | Bypass interactive checkpoints |
+| `--verbose`, `-v` | Stream agent reasoning + tool calls to stderr |
+| `--help`, `-h` | Show full help |
+| `--version` | Print version |
+
+Full flag reference: [`docs/reference/cli.md`](../reference/cli.md).
+
+## Watching the wizard work
+
+Use `--verbose` (or `-v`) to see agent reasoning and every MCP tool call:
+
+```bash
+decision-record --idea "…" --effort poc --verbose
+```
+
+Output goes to **stderr**, so you can capture it in a log file while keeping stdout clean:
+
+```bash
+decision-record --idea "…" --yes 2>plan.log
+```
+
+## Exit codes
+
+| Code | Meaning |
+|---|---|
+| `0` | Pipeline completed successfully (reached `handed-off`) |
+| `1` | A phase failed (gate failure, agent error, export failure) |
+| `2` | Bad arguments or missing env (`OPENAI_API_KEY`) |
+
+## What lands on disk
+
+```
+<--cwd>/
+├── .dr/                 # internal (gitignored automatically)
+│   ├── state.json       # pipeline state
+│   └── events.jsonl     # append-only event log
+└── dr/                  # tracked — commit this
+    ├── project.json     # MVP manifest
+    ├── project.md       # rendered view
+    ├── decisions/       # one .json + .md per DR
+    ├── tasks/           # one .json + .md per task
+    └── index.html       # rendered project overview
+```
+
+JSON is the source of truth; `.md` and `index.html` are regenerated by the wizard. The `.dr/` directory's `.gitignore` is created automatically.
diff --git a/docs/quickstart.md b/docs/quickstart.md
deleted file mode 100644
index 4bbe69c..0000000
--- a/docs/quickstart.md
+++ /dev/null
@@ -1,80 +0,0 @@
-# Quickstart
-
-A five-minute walkthrough of taking an idea to a ship-ready MVP plan.
-
-## Prerequisites
-
-- Claude Code installed
-- Node 20+
-- (Optional) A Linear account and a personal API token if you want to push the final plan to Linear
-
-## Install (local dev)
-
-```bash
-git clone https://github.com/protoLabsAI/decision-record.git
-cd decision-record/server
-npm install
-npm run build
-```
-
-Then either:
-
-- **As a Claude Code plugin** — symlink the `decision-record` directory into `~/.claude/plugins/decision-record/`, restart Claude Code, and the `/plan` command + the `decision-record` MCP server should be available.
-- **As a bare MCP server** — point any MCP client at `node /path/to/decision-record/server/dist/index.js`.
-
-## Run
-
-In a target repository (the project you want to plan):
-
-```
-/plan a small CLI that converts QuickBooks CSV exports to a normalized ledger format
-```
-
-You'll see the `dr-wizard` agent take over. It will:
-
-1. Confirm the title, description, and effort level (default: `mvp`).
-2. Run `dr_init`, creating `.dr/` and `dr/` in your target repo.
-3. Advance to scoping and start asking about MVP boundaries.
-
-## What you'll do, in order
-
-1. **Scope it.** Three or four bullets each for in-scope, out-of-scope, and success criteria. The wizard will push back if you're vague.
-2. **Decide.** The wizard surfaces 3-6 significant decisions (language, data store, deployment, etc.), pulling from the seed library where it can. You pick a position and write a brief argument for each. The `dr-skeptic` agent will review them.
-3. **Decompose.** The `dr-decomposer` agent proposes a beads-style task graph. You review, refine, and lock it.
-4. **Hand off.** Push to Linear (with `LINEAR_API_KEY` and a team ID) or finalize to the filesystem.
-
-When the wizard reports `Phase: handed-off`, you have a complete plan. Open `dr/index.html` to see it rendered.
-
-## What you get
-
-In your target repo:
-
-```
-.dr/
-├── state.json           # pipeline state
-└── events.jsonl         # audit log
-dr/
-├── project.json         # the MVP manifest
-├── project.md           # human-readable view
-├── decisions/
-│   ├── 0001-*.json
-│   └── 0001-*.md
-├── tasks/
-│   ├── T0001-*.json
-│   └── T0001-*.md
-└── index.html           # rendered project overview
-```
-
-If you handed off to Linear, you also get:
-
-- A Linear Project named after your manifest
-- An Issue per decision (labeled `decision`)
-- An Issue per task (with priority, estimate, and labels)
-- `blocks` relations matching task dependencies
-
-## Common follow-ups
-
-- **Re-render after manual edits to JSON:** run the wizard again (`/plan`) and ask it to call `dr_render`.
-- **Resume an interrupted session:** just run `/plan` again. The wizard's first action is `dr_status`, which picks up where you left off.
-- **Loosen / tighten gates:** the wizard understands `gate_overrides` — ask it to "change `min_tasks` to 5" or similar.
-- **Add a new seed:** drop a JSON file in `server/seed/` following the shape of the existing entries; the wizard will find it on next search.
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
new file mode 100644
index 0000000..13149c2
--- /dev/null
+++ b/docs/reference/cli.md
@@ -0,0 +1,108 @@
+# CLI reference
+
+```
+decision-record [options]
+```
+
+## Synopsis
+
+```bash
+decision-record [--idea TEXT | --prd PATH | --resume] [options]
+```
+
+## Description
+
+Run the decision-record planning pipeline against a target project directory. By default, starts a new project from an idea string; with `--resume`, continues an existing project; with `--prd`, reads scope context from a Markdown file.
+
+The CLI orchestrates a phase state machine (intake → scoping → deciding → decomposing → handing-off → handed-off), running LLM-driven sub-agents for the actual planning work and stopping at human sign-off gates when configured.
+
+## Options
+
+### Project input
+
+| Flag | Type | Default | Description |
+|---|---|---|---|
+| `--idea TEXT` | string | — | Free-form one-line idea. Used to derive title + description. |
+| `--title TEXT` | string | derived from `--idea` or `--prd` | Explicit project title. Max 120 chars. |
+| `--description TEXT` | string | derived from `--idea` or `--prd` | Explicit project description. |
+| `--prd PATH` | string | — | Markdown PRD file; first H1 used as title hint, first paragraph as description hint, full text passed to scoping agent. |
+
+A positional argument can substitute for `--idea` if no other input flag is given.
+
+### Pipeline behavior
+
+| Flag | Type | Default | Description |
+|---|---|---|---|
+| `--cwd PATH` | string | `process.cwd()` | Target project directory. State lands under `.dr/` and `dr/`. |
+| `--effort poc\|mvp\|full` | string | `mvp` | Gate strictness preset. See [Calibrate gates](../how-to/calibrate-gates.md). |
+| `--resume` | flag | false | Skip intake; pick up the existing project in `--cwd`. |
+| `--yes`, `-y` | flag | false | Bypass interactive checkpoints (fully autonomous). |
+| `--verbose`, `-v` | flag | false | Stream agent reasoning and tool calls to stderr. |
+
+### LLM connection
+
+| Flag | Type | Default | Description |
+|---|---|---|---|
+| `--model NAME` | string | `$OPENAI_MODEL` or `gpt-4o` | OpenAI-compat model name. |
+| `--api-key KEY` | string | `$OPENAI_API_KEY` | OpenAI-compat API key. |
+| `--base-url URL` | string | `$OPENAI_BASE_URL` or OpenAI default | OpenAI-compat base URL (for OpenRouter, Ollama, vLLM, LiteLLM, etc.). |
+
+### Informational
+
+| Flag | Description |
+|---|---|
+| `--help`, `-h` | Print help and exit. |
+| `--version` | Print version (`decision-record X.Y.Z`) and exit. |
+
+## Environment variables
+
+| Variable | Required | Description |
+|---|---|---|
+| `OPENAI_API_KEY` | yes (unless `--api-key`) | API key for the LLM endpoint. |
+| `OPENAI_BASE_URL` | no | OpenAI-compatible base URL. Defaults to OpenAI's. |
+| `OPENAI_MODEL` | no | Default model. Defaults to `gpt-4o`. |
+| `LINEAR_API_KEY` | no | Enables the Linear handoff branch in the handoff phase. |
+| `LINEAR_TEAM_ID` | no | Pre-fills the team ID prompt at Linear handoff. |
+| `DR_LOG_LEVEL` | no | `debug` \| `info` \| `warn` \| `error`. Default `info`. Applies to the MCP server's stderr logs. |
+| `DR_SEED_DIR` | no | Override the seed library directory. Defaults to the bundled `server/seed/`. |
+
+## Exit codes
+
+| Code | Meaning |
+|---|---|
+| `0` | Pipeline completed successfully (final phase is `handed-off`, or the user declined to advance at a checkpoint and that was a clean stop). |
+| `1` | A phase failed: gate failure, agent error, validation failure, export failure. |
+| `2` | Bad arguments, missing required env (`OPENAI_API_KEY`), or precondition not met (e.g., `--resume` against a directory with no project). |
+
+## Output
+
+- **stdout** — minimal; mostly empty apart from `--version` output and final summaries.
+- **stderr** — all wizard progress, agent summaries, and checkpoint prompts. Redirect with `2>file` if you want to capture it.
+
+## Examples
+
+```bash
+# Minimal — uses cwd, derives title from idea
+decision-record --idea "a CLI to dedupe contact lists"
+
+# Specify everything explicitly
+decision-record \
+  --title "Contact deduper" \
+  --description "A CLI that reads CSVs of contacts and merges fuzzy duplicates" \
+  --effort mvp \
+  --cwd ~/dev/dedup \
+  --model gpt-4o \
+  --yes
+
+# From a PRD
+decision-record --prd ./docs/idea.md --cwd ~/dev/my-project
+
+# Resume after a break
+decision-record --resume --cwd ~/dev/my-project
+
+# Use OpenRouter
+decision-record \
+  --idea "…" \
+  --base-url https://openrouter.ai/api/v1 \
+  --model anthropic/claude-sonnet-4-6
+```
diff --git a/docs/reference/data-model.md b/docs/reference/data-model.md
new file mode 100644
index 0000000..420fc42
--- /dev/null
+++ b/docs/reference/data-model.md
@@ -0,0 +1,152 @@
+# Data model
+
+The pipeline stores five entity types. The JSON Schemas in [`../../schemas/`](../../schemas/) are the source of truth; the Zod mirrors used at runtime live in [`server/src/schemas/index.ts`](../../server/src/schemas/index.ts).
+
+## Filesystem layout
+
+```
+<target>/
+├── .dr/                    # internal, gitignored by default
+│   ├── state.json          # PipelineState
+│   ├── events.jsonl        # Event (one per line, append-only)
+│   └── cache/              # derived artifacts
+└── dr/                     # tracked
+    ├── project.json        # Project
+    ├── project.md          # rendered, derived
+    ├── decisions/
+    │   ├── 0001-*.json     # Decision
+    │   └── 0001-*.md       # rendered, derived
+    ├── tasks/
+    │   ├── T0001-*.json    # Task
+    │   └── T0001-*.md      # rendered, derived
+    └── index.html          # rendered, derived
+```
+
+JSON is source of truth; `.md` and `index.html` are regenerated by `dr_render`.
+
+## Project (`dr/project.json`)
+
+The MVP manifest.
+
+| Field | Type | Notes |
+|---|---|---|
+| `id` | string (kebab-slug) | Derived from title at init. |
+| `title` | string (1–120) | |
+| `description` | string? | |
+| `created_at`, `updated_at` | ISO datetime | |
+| `effort_level` | `"poc" \| "mvp" \| "full"` | Calibrates gates. |
+| `status` | phase enum | `intake \| scoping \| deciding \| decomposing \| handing-off \| handed-off`. |
+| `scope` | object? | `{ in_scope, out_of_scope, success_criteria, nice_to_have }` — each is `string[]`. |
+| `sign_offs` | array | `{ phase, by, actor?, at, notes? }`. |
+| `handoff` | object? | `{ target, target_id?, target_url?, exported_at, issue_count?, document_count? }`. Set after `dr_export_*`. |
+| `gate_config` | object | `{ preset, overrides? }`. See [Gates](gates.md). |
+| `tags` | string[] | |
+
+## Decision (`dr/decisions/<id>.json`)
+
+A single significant choice with context, alternatives, and rationale.
+
+| Field | Type | Notes |
+|---|---|---|
+| `id` | `"0001-slug"` | Composite identifier. |
+| `number` | integer ≥1 | Monotonic per project. |
+| `slug` | string | Kebab-case. |
+| `title` | string (1–80) | Imperative. |
+| `status` | enum | `rfc \| proposed \| accepted \| rejected \| deprecated \| superseded`. |
+| `template_variant` | enum | `canonical \| lightweight \| scoping \| vendor \| architecture \| data-model`. |
+| `created_at`, `updated_at` | ISO datetime | |
+| `summary` | string? | One-line. |
+| `issue` | string? | Why this decision needs to be made. |
+| `assumptions` | string[] | |
+| `constraints` | string[] | |
+| `positions` | Position[] | Candidate options. |
+| `opinions` | Opinion[] | Stakeholder views. |
+| `argument` | string? | Rationale for the selected position. |
+| `selected_position` | string? | Must match a Position title. |
+| `implications` | string[] | |
+| `depends_on` | DecisionId[] | Must be `accepted` before this can be. |
+| `related_decisions` | DecisionId[] | Referenced but not blocking. |
+| `related_artifacts` | string[] | URLs or repo paths. |
+| `review` | Review[] | Antagonistic-review entries. |
+| `sign_off` | object? | `{ by, actor?, at, notes? }`. Set when accepted. |
+| `superseded_by` | DecisionId? | If `status === "superseded"`. |
+| `seed_origin` | string? | Seed name if instantiated from one. |
+| `tags` | string[] | |
+
+### Position
+
+`{ title, description?, pros, cons, cost?, links }`. Each list defaults to `[]`.
+
+### Opinion
+
+`{ author, by: "agent" | "human", at, body, position_ref? }`.
+
+### Review
+
+`{ reviewer, lens, verdict: "pass" | "block", score (1-5)?, concerns, at }`. Lenses: `operational | strategic | security | cost | user-impact`.
+
+## Task (`dr/tasks/<id>.json`)
+
+A beads-style work unit.
+
+| Field | Type | Notes |
+|---|---|---|
+| `id` | `"T0001-slug"` | Composite identifier. |
+| `number` | integer ≥1 | Monotonic per project. |
+| `slug` | string | Kebab-case. |
+| `title` | string (1–120) | |
+| `description` | string? | |
+| `status` | enum | `open \| ready \| in_progress \| done \| blocked \| deferred`. |
+| `estimate` | object? | `{ unit: "hours" \| "days", value, confidence?: "low" \| "med" \| "high" }`. |
+| `acceptance_criteria` | string[] | |
+| `depends_on` | TaskId[] | Must be `done` before this can start. |
+| `decision_refs` | DecisionId[] | Decisions this task implements. |
+| `priority` | `"p0" \| "p1" \| "p2" \| "p3"` | Default `p2`. |
+| `labels` | string[] | |
+| `assignee_hint` | `"agent" \| "human" \| "either"`? | |
+| `external_ref` | object? | Set at handoff. `{ system: "linear" \| "github" \| "plane" \| "jira" \| "other", id, url? }`. |
+| `created_at`, `updated_at` | ISO datetime | |
+
+## PipelineState (`.dr/state.json`)
+
+Internal pipeline bookkeeping. Never edit by hand.
+
+| Field | Type | Notes |
+|---|---|---|
+| `schema_version` | semver string | Bumped on breaking layout changes. |
+| `project_id` | string | Matches `project.json.id`. |
+| `phase` | phase enum | Mirrors `project.status` but the pipeline writes this. |
+| `effective_gate_config` | object | Materialized preset + overrides. |
+| `next_decision_seq`, `next_task_seq` | integer ≥1 | Monotonic counters. |
+| `pending_questions` | array | Open questions the agent surfaced. |
+| `gate_failures` | array | History of failed advance attempts (for debugging). |
+| `last_event_at`, `last_render_at` | ISO datetime? | |
+
+## Event (`.dr/events.jsonl`)
+
+One JSON line per pipeline action. Append-only audit log.
+
+| Field | Type | Notes |
+|---|---|---|
+| `at` | ISO datetime | |
+| `actor` | `"agent" \| "human" \| "system"` | |
+| `actor_name` | string? | |
+| `kind` | enum | See below. |
+| `entity_kind` | `"project" \| "decision" \| "task" \| "phase" \| "question"`? | |
+| `entity_id` | string? | |
+| `payload` | object? | Event-specific. |
+| `correlation_id` | string? | Groups related events. |
+
+### Event kinds
+
+`project_initialized`, `phase_advanced`, `phase_advance_blocked`, `scope_updated`, `decision_proposed`, `decision_updated`, `decision_reviewed`, `decision_accepted`, `decision_rejected`, `task_proposed`, `task_updated`, `task_status_changed`, `graph_validated`, `gate_check_passed`, `gate_check_failed`, `question_asked`, `question_answered`, `seed_loaded`, `render_run`, `export_started`, `export_completed`, `export_failed`, `sign_off_recorded`.
+
+## ID conventions
+
+| Entity | Format | Example |
+|---|---|---|
+| Decision | `<4-digit>-<slug>` | `0003-define-the-agent-action-contract` |
+| Task | `T<4-digit>-<slug>` | `T0006-implement-the-tick-based-game-loop` |
+| Project | kebab-slug | `ai-driven-roguelike-poc` |
+
+Slugs are 2–64 chars, lower-case alphanumerics + dashes, no leading/trailing dash.
diff --git a/docs/reference/gates.md b/docs/reference/gates.md
new file mode 100644
index 0000000..e3dad85
--- /dev/null
+++ b/docs/reference/gates.md
@@ -0,0 +1,78 @@
+# Gates reference
+
+Every phase transition is checked by a set of gate conditions. The full evaluator lives at [`server/src/gateEval.ts`](../../server/src/gateEval.ts). This page documents what each gate checks and what each preset sets.
+
+## Phase machine
+
+```
+intake ─→ scoping ─→ deciding ─→ decomposing ─→ handing-off ─→ handed-off
+```
+
+`dr_advance` is the only way to move forward. It evaluates the gate for the **next** phase against current state, and either transitions (and emits `phase_advanced`) or records a `phase_advance_blocked` event with reasons.
+
+## What each gate checks
+
+| Advancing to | Conditions |
+|---|---|
+| `scoping` | Project title non-empty; description non-empty. |
+| `deciding` | `scope.in_scope` non-empty; `scope.success_criteria` non-empty; if `review_required_phases` includes `"scoping"`, a `scoping`-variant DR has a passing review. |
+| `decomposing` | Number of decisions ≥ `min_decisions`; if `decisions_required_status === "accepted"`, no decisions in `proposed`/`rfc`; if `review_required_per_decision`, every accepted decision has a passing review; if `review_required_phases` includes `"deciding"`, at least one decision has a passing review; no decisions reference missing dependency IDs. |
+| `handing-off` | Number of tasks ≥ `min_tasks`; no tasks reference missing dependency tasks; no cycles in the task dependency graph; every task has an estimate ≤ `max_task_estimate_hours` (days are normalized to hours at 8h/day); every task's `decision_refs` resolve. |
+| `handed-off` | `project.handoff` is set (i.e., `dr_export_filesystem` or `dr_export_linear` has run). |
+
+## Sign-off check (overlay)
+
+If the next phase is in the project's `require_human_signoff_phases`, the gate also requires `dr_advance` to be called with `sign_off_by: "human"`. Without it, the gate fails with a clear "Sign-off gate" reason.
+
+The orchestrator (CLI + dr-wizard) handles this automatically: it pauses at the relevant checkpoint, asks the user, then calls `dr_advance` with sign-off. Manual MCP callers must remember.
+
+## Preset matrix
+
+| Knob | `poc` | `mvp` | `full` |
+|---|---|---|---|
+| `decisions_required_status` | `accepted` | `accepted` | `accepted` |
+| `review_required_phases` | `[]` | `["scoping", "decomposing"]` | `["scoping", "deciding", "decomposing"]` |
+| `review_required_per_decision` | `false` | `false` | **`true`** |
+| `max_task_estimate_hours` | `16` | `8` | `4` |
+| `require_human_signoff_phases` | `["handing-off"]` | `["scoping", "decomposing", "handing-off"]` | `["scoping", "deciding", "decomposing", "handing-off"]` |
+| `min_decisions` | `0` | `3` | `6` |
+| `min_tasks` | `3` | `8` | `15` |
+
+## Override knobs
+
+Per-project overrides at `project.json → gate_config.overrides` take precedence per-key over the preset. Any of the seven keys above can be overridden; omitted keys inherit the preset.
+
+```json
+{
+  "gate_config": {
+    "preset": "mvp",
+    "overrides": {
+      "min_tasks": 5,
+      "review_required_per_decision": true
+    }
+  }
+}
+```
+
+The materialized result is at `state.effective_gate_config` — that's what the evaluator actually reads.
+
+## Inspecting gate state
+
+```bash
+# Current evaluation against the next phase
+node dist/index.js  # then call dr_status
+
+# Or directly:
+cat <cwd>/.dr/state.json | jq '.effective_gate_config'
+cat <cwd>/dr/project.json | jq '.gate_config'
+```
+
+`dr_status` returns a `gate_to_next` block: `{ pass, reasons[], next_phase }`. Read the reasons — they name the specific failing knob and the specific shortfall.
+
+## Why hard gates
+
+The system refuses to advance when gates fail. There is no `--force` flag, no admin override.
+
+The trade-off is intentional. Soft gates degrade — people learn to skip them, and the artifact stops being trustworthy. With hard gates, the rule is: if a plan exists and reached `handed-off`, every gate it crossed actually passed. The plan is real.
+
+If a gate is too strict, change the gate (override the knob in `project.json`). Don't bypass it.
diff --git a/docs/reference/mcp-tools.md b/docs/reference/mcp-tools.md
new file mode 100644
index 0000000..e5a1c47
--- /dev/null
+++ b/docs/reference/mcp-tools.md
@@ -0,0 +1,188 @@
+# MCP tools
+
+The MCP server exposes the planning pipeline as a set of tools an agent can call. The CLI uses the same registry in-process; nothing is CLI-only.
+
+Every tool accepts `cwd?: string` (the target project directory; defaults to the server's `process.cwd()`).
+
+## Pipeline tools
+
+### `dr_init`
+
+Initialize the pipeline in a target repo. Creates the `.dr/` and `dr/` layout and writes `state.json` and `project.json`. Fails if the project is already initialized.
+
+| Input | Type | Notes |
+|---|---|---|
+| `title` | string | Project title. |
+| `description` | string? | Intake description. |
+| `effort_level` | `"poc" \| "mvp" \| "full"` | Default `mvp`. |
+| `gate_overrides` | object? | Per-knob preset overrides. See [Gates reference](gates.md). |
+| `tags` | string[] | Free-form. |
+| `project_id` | string? | Override the derived slug. |
+
+Returns: `{ project_id, paths, project, state, next_phase }`.
+
+### `dr_status`
+
+Read pipeline status. Returns project metadata, current phase, gate evaluation against the next phase (what's blocking advance), counts, pending questions, effective gate config.
+
+### `dr_advance`
+
+Advance to the next pipeline phase if the gate passes. Records a sign-off and emits `phase_advanced`. If the gate fails, returns reasons without changing phase.
+
+| Input | Type | Notes |
+|---|---|---|
+| `sign_off_by` | `"agent" \| "human"`? | Required when the next phase requires human sign-off. |
+| `sign_off_actor` | string? | Identifier of the signing actor. |
+| `sign_off_notes` | string? | Free-form notes attached to the sign-off. |
+
+### `dr_update_project`
+
+Patch project metadata: `title`, `description`, `tags`, and `gate_overrides`. Cannot change the `effort_level` preset (re-init for that).
+
+### `dr_update_scope`
+
+Replace any/all of `in_scope`, `out_of_scope`, `success_criteria`, `nice_to_have`. Each list is fully replaced when provided; omitted lists are unchanged.
+
+## Decision tools
+
+### `dr_propose_decision`
+
+Create a new decision record (`status: "proposed"`).
+
+| Input | Type | Notes |
+|---|---|---|
+| `title` | string | Short imperative, max 80 chars. |
+| `template_variant` | `"canonical" \| "lightweight" \| "scoping" \| "vendor" \| "architecture" \| "data-model"` | Default `canonical`. |
+| `summary`, `issue`, `assumptions`, `constraints`, `positions`, `depends_on`, `tags`, `seed_origin`, `slug` | various | Optional initial content. |
+
+### `dr_update_decision`
+
+Patch any field. Pass only the fields you want to change. `add_opinion` appends a single opinion entry.
+
+### `dr_review_decision`
+
+Record an antagonistic-review pass.
+
+| Input | Type | Notes |
+|---|---|---|
+| `id` | string | Decision id. |
+| `reviewer` | string | e.g., `"dr-skeptic"`. |
+| `lens` | `"operational" \| "strategic" \| "security" \| "cost" \| "user-impact"` | The review lens. |
+| `verdict` | `"pass" \| "block"` | |
+| `score` | number (1–5) | Optional. |
+| `concerns` | string[] | Crisp one-line concerns. |
+
+### `dr_accept_decision`
+
+Move a decision to `accepted` and record sign-off. Requires `selected_position` and `argument` to be set. Requires a passing review if `review_required_per_decision` is true. Rejects if any blocking deps are unmet.
+
+### `dr_reject_decision`
+
+Move a decision to `rejected` with a reason and sign-off.
+
+### `dr_list_decisions`
+
+Filter by `status[]` and/or `template_variant[]`. Returns summaries.
+
+### `dr_get_decision`
+
+Fetch the full content of a decision by id.
+
+### `dr_ready_decisions`
+
+Return decisions whose `depends_on` are all `accepted` (or which have no deps). Used by the agent to pick the next DR to work on.
+
+## Task tools
+
+### `dr_propose_task`
+
+Create a new task node. Status defaults to `ready` if no deps, `open` otherwise.
+
+| Input | Type | Notes |
+|---|---|---|
+| `title`, `description` | string | |
+| `depends_on` | string[] | Task IDs. |
+| `decision_refs` | string[] | Decision IDs the task implements. |
+| `estimate` | `{ unit: "hours" \| "days", value, confidence? }` | |
+| `acceptance_criteria` | string[] | |
+| `priority` | `"p0" \| "p1" \| "p2" \| "p3"` | Default `p2`. |
+| `labels` | string[] | |
+| `assignee_hint` | `"agent" \| "human" \| "either"` | |
+
+### `dr_update_task`
+
+Patch fields. Use `dr_set_task_status` to change lifecycle state.
+
+### `dr_set_task_status`
+
+Change status: `open`, `ready`, `in_progress`, `done`, `blocked`, `deferred`.
+
+### `dr_list_tasks`, `dr_get_task`
+
+List tasks (optionally filtered) and fetch a single task, respectively.
+
+### `dr_ready_tasks`
+
+Tasks whose deps are all `done` (or no deps), sorted by priority. The beads-style "what's next" query.
+
+### `dr_validate_graph`
+
+Validate the full task graph: no cycles, no orphan dependencies, all estimates ≤ `max_task_estimate_hours`, all `decision_refs` resolve. Emits `graph_validated`. Returns `{ valid, errors[], warnings[], cycles[], orphans[], oversized[], missing_decision_refs[] }`.
+
+## Seed library tools
+
+### `dr_seed_search`
+
+Keyword search over the bundled seed library.
+
+| Input | Type | Notes |
+|---|---|---|
+| `query` | string | Matches on name, title, keywords, tags. |
+| `limit` | integer | Default 5. |
+
+### `dr_seed_list`
+
+List every seed.
+
+### `dr_seed_get`
+
+Fetch one seed's full content (including `notes_for_agent`).
+
+### `dr_seed_load`
+
+Instantiate a seed as a `proposed` DR. Pre-fills positions, assumptions, constraints, implications.
+
+| Input | Type | Notes |
+|---|---|---|
+| `seed_name` | string | E.g., `"language-choice"`. |
+| `title_override` | string? | Project-specific title. |
+| `slug_override` | string? | |
+| `depends_on` | string[] | Decision IDs this DR depends on. |
+| `tags` | string[] | |
+
+## Render
+
+### `dr_render`
+
+Regenerate Markdown + `index.html` from JSON. Idempotent.
+
+## Handoff
+
+### `dr_export_filesystem`
+
+Finalize the project to filesystem only. Records handoff metadata, transitions to `handed-off`, prevents further phase changes. Requires the project to be in `handing-off` phase.
+
+### `dr_export_linear`
+
+Push to Linear via the GraphQL API. Creates a Project, an Issue per decision (labeled `decision`), and an Issue per task, with `blocks` relations matching `depends_on`. Supports `dry_run: true` to preview without calling Linear.
+
+| Input | Type | Notes |
+|---|---|---|
+| `team_id` | string | Linear team UUID. |
+| `api_key` | string? | Defaults to `$LINEAR_API_KEY`. |
+| `dry_run` | boolean | Default `false`. |
+| `sign_off_by`, `sign_off_actor`, `sign_off_notes` | various | Sign-off metadata. |
+
+## Where the schemas live
+
+Every tool's input is validated by Zod at the server. JSON Schema mirrors for external consumers live in [`../../schemas/`](../../schemas/). The Zod source of truth is at [`server/src/schemas/index.ts`](../../server/src/schemas/index.ts).
diff --git a/docs/tutorials/your-first-plan.md b/docs/tutorials/your-first-plan.md
new file mode 100644
index 0000000..7f60435
--- /dev/null
+++ b/docs/tutorials/your-first-plan.md
@@ -0,0 +1,164 @@
+# Your first plan
+
+By the end of this tutorial you will have used decision-record to turn a one-line idea into a complete, scoped, decision-backed, task-decomposed MVP plan — and you will have looked at every artifact the system produces. This takes about 15 minutes.
+
+We will use a concrete idea — a CLI tool that converts QuickBooks CSV exports into a normalized double-entry ledger — which is a small but real planning problem, so you can see the system handle something other than `hello world`.
+
+## Before you start
+
+You need:
+
+1. **Node 20 or later** installed (`node --version` should print `v20.x` or higher).
+2. **An OpenAI-compatible API key.** This can be:
+   - An OpenAI API key (`OPENAI_API_KEY=sk-…`), or
+   - Any compatible endpoint — set `OPENAI_BASE_URL` and `OPENAI_MODEL`. See [Configure LLM providers](../how-to/configure-providers.md).
+3. **The repo cloned and built:**
+   ```bash
+   git clone https://github.com/protoLabsAI/decision-record.git
+   cd decision-record/server
+   npm install
+   npm run build
+   ```
+
+You do **not** need the Claude Code plugin installed for this tutorial. We will run the CLI directly.
+
+## Step 1: Pick a working directory
+
+The system writes artifacts into a target project directory. We will create a fresh one:
+
+```bash
+mkdir -p ~/dev/my-first-plan
+```
+
+Everything that follows lands in there. Nothing is written into the decision-record repo itself.
+
+## Step 2: Run the CLI
+
+From the `decision-record/server/` directory:
+
+```bash
+export OPENAI_API_KEY=sk-…   # if you haven't already
+
+node dist/cli.js \
+  --idea "a CLI tool that converts QuickBooks CSV exports into a normalized double-entry ledger" \
+  --effort poc \
+  --cwd ~/dev/my-first-plan --yes
+```
+
+You can also drop the `--idea` flag entirely and run interactively — but for a guided first run, this is cleaner.
+
+## Step 3: Watch the wizard work
+
+The CLI will print colored progress to stderr as each phase runs. You will see something like:
+
+```
+━━━ decision-record v0.1.0 ━━━
+  Target: /Users/you/dev/my-first-plan
+  Model: gpt-4o
+━━━ Phase: Intake ━━━
+✓ Initialized 'a-cli-tool-that-converts-quickbooks-csv-export…' at effort_level=poc
+✓ Advanced: intake → scoping
+━━━ Phase: Scoping ━━━
+  Running scoping agent…
+✓ Scoping agent finished (3 tool calls).
+────────────────────────────────────────────────────────────
+Scope set. in_scope: read QuickBooks CSV, parse rows…
+…
+────────────────────────────────────────────────────────────
+✓ Advanced: scoping → deciding
+━━━ Phase: Deciding ━━━
+  Running deciding agent (proposing decisions)…
+…
+━━━ Antagonistic review: 4 decisions × 5 lenses ━━━
+  operational: pass (4/5)
+  strategic: pass (4/5)
+…
+✓ Accepted 0001-…
+…
+━━━ Phase: Decomposing ━━━
+  Running decomposer agent (building task graph)…
+✓ Decomposer finished (28 tool calls). Graph validates.
+…
+━━━ Phase: Handoff ━━━
+✓ Artifacts rendered.
+> LINEAR_API_KEY detected. Push the plan to Linear? [Y/n] [auto-yes]
+✓ Plan finalized to filesystem.
+✓ Pipeline complete. Final phase: handed-off
+```
+
+Each phase shows what it did. Read the summaries — they tell you what the agent decided.
+
+> **About checkpoints:** Under the `poc` preset, only the **handoff** transition requires human sign-off. Because you passed `--yes`, the wizard auto-confirms; without it, you would be prompted before each gate that needs sign-off. See [Calibrate gates](../how-to/calibrate-gates.md) for the difference between `poc`, `mvp`, and `full`.
+
+## Step 4: Look at what got produced
+
+```bash
+ls ~/dev/my-first-plan/dr/
+```
+
+You should see:
+
+```
+project.json     # the MVP manifest — scope, status, sign-offs
+project.md       # human-readable view of project.json
+decisions/       # one .json + .md per decision
+tasks/           # one .json + .md per task
+index.html       # rendered overview — open in a browser
+```
+
+Open `~/dev/my-first-plan/dr/index.html` in a browser. You will see the full plan: scope, decisions with their selected positions, and the task graph.
+
+```bash
+open ~/dev/my-first-plan/dr/index.html   # macOS
+xdg-open ~/dev/my-first-plan/dr/index.html  # Linux
+```
+
+## Step 5: Inspect a decision
+
+Pick one. For example:
+
+```bash
+cat ~/dev/my-first-plan/dr/decisions/0001-*.md
+```
+
+You will see the full record: issue, positions considered, the selected position, the argument for why it won, the implications, and five lens reviews from the skeptic.
+
+```bash
+cat ~/dev/my-first-plan/dr/decisions/0001-*.json | jq .
+```
+
+Same content, machine-readable.
+
+## Step 6: Inspect a task
+
+```bash
+cat ~/dev/my-first-plan/dr/tasks/T0001-*.md
+```
+
+Tasks have: title, description, acceptance criteria (as a checkbox list), estimate, dependencies, and the decisions they implement (`decision_refs`). A developer can pick up T0001 and ship it.
+
+## Step 7: Look at the audit log
+
+```bash
+tail ~/dev/my-first-plan/.dr/events.jsonl | jq .
+```
+
+Every action the wizard took — phase advances, decisions proposed, reviews completed, tasks created, exports — is recorded as one JSON line. This is your replay log; it never gets rewritten.
+
+## You are done
+
+You ran a complete planning pipeline end-to-end. From a one-line idea you produced:
+
+- A scoped MVP manifest with success criteria and explicit non-goals
+- A set of accepted decisions, each with reviewed rationale
+- A dependency-aware task graph linked back to those decisions
+- Rendered Markdown and HTML for human review
+- An immutable event log
+
+## Next steps
+
+- **Hand off to Linear instead of filesystem** — [How-to: Hand off to Linear](../how-to/handoff-to-linear.md)
+- **Run with a PRD instead of a one-liner** — [How-to: Run the CLI](../how-to/run-the-cli.md)
+- **Use a different model** — [How-to: Configure LLM providers](../how-to/configure-providers.md)
+- **Understand what just happened** — [Explanation: The five phases](../explanation/the-five-phases.md) and [Design rationale](../explanation/design-rationale.md)
+- **Look up a specific flag** — [Reference: CLI](../reference/cli.md)
diff --git a/docs/usage.md b/docs/usage.md
deleted file mode 100644
index 30be959..0000000
--- a/docs/usage.md
+++ /dev/null
@@ -1,145 +0,0 @@
-# Usage
-
-A walk-through of how an `idea → ship-ready MVP plan` session goes with this plugin.
-
-## Setup
-
-### Install the plugin (when published)
-
-```bash
-# In Claude Code
-/plugin install decision-record
-```
-
-Until the plugin lands in a marketplace, you can use it locally:
-
-```bash
-git clone https://github.com/protoLabsAI/decision-record.git
-cd decision-record/server
-npm install
-npm run build
-```
-
-Then point Claude Code at the local plugin (settings → plugins, or symlink into `~/.claude/plugins/`).
-
-### Optional: configure Linear handoff
-
-If you want to push the final plan to Linear, set a personal API token in the environment of whichever shell launches the MCP server:
-
-```bash
-export LINEAR_API_KEY=lin_api_xxx
-```
-
-You'll pass your Linear team ID per-export at handoff time. Find it in Linear (Settings → API → Personal API keys; team IDs visible in the GraphQL explorer or team URL).
-
-Without Linear, everything still works — the plugin will hand off to the filesystem.
-
-## Running the pipeline
-
-In a target repository (fresh or template), open Claude Code and run:
-
-```
-/plan
-```
-
-Optionally pass a one-line idea:
-
-```
-/plan a CLI tool that converts CSV exports from QuickBooks into a normalized ledger format
-```
-
-The `dr-wizard` agent runs. It reads pipeline state from `.dr/state.json` (or initializes if missing) and drives forward one phase at a time.
-
-## The five phases
-
-### 1. Intake
-
-The wizard captures the raw idea: a title, a one-paragraph description, and an effort level.
-
-- **POC** — single-day spike. Light gates: ≥3 tasks, no required reviews, only the handoff requires human sign-off.
-- **MVP** (default) — a few weeks of work. Gates: scope and decomposing reviewed, ≥3 decisions, ≥8 tasks, ≤8h per leaf task.
-- **Full** — production-quality. Every gate reviewed, every DR reviewed individually, ≥6 decisions, ≥15 tasks, ≤4h per leaf task.
-
-You can override individual knobs at init or via `dr_update_project` — see [architecture.md#gate-configuration](architecture.md#gate-configuration).
-
-### 2. Scoping
-
-The most important phase, often skipped to everyone's regret. The wizard pushes you to commit to:
-
-- **In scope** — what the MVP MUST do.
-- **Out of scope** — what it explicitly WON'T do.
-- **Success criteria** — measurable signals it worked.
-- **Nice to have** — optional capabilities (won't block ship).
-
-In MVP and Full presets, the wizard also instantiates a `scope-statement` DR — a formal decision record about the scope choice (lean MVP vs walking-skeleton vs polished). The DR gets a human sign-off before advancing.
-
-### 3. Deciding
-
-The wizard surfaces *which decisions need to be made* for this project. It uses two signals:
-
-- **Seed library** — common decisions (language, runtime, auth, data store, CI/CD, etc.). The wizard searches with `dr_seed_search`, finds matches, and instantiates them with `dr_seed_load`.
-- **Project-specific decisions** — anything the seed library doesn't cover gets proposed fresh.
-
-For each decision, the wizard asks one question at a time, drives you to pick a position, write a brief argument, and (in MVP/Full presets) requests an antagonistic review from `dr-skeptic` before acceptance.
-
-Decisions can depend on each other (e.g., "runtime target" depends on "language choice"). The wizard calls `dr_ready_decisions` to find what's unblocked next.
-
-You leave this phase when every significant decision is `accepted` (or explicitly `rejected`), and the wizard advances with your sign-off.
-
-### 4. Decomposing
-
-The wizard delegates to `dr-decomposer`, which:
-
-1. Reads the project, scope, and accepted DRs.
-2. Proposes a beads-style task graph — tasks with titles, descriptions, acceptance criteria, estimates, dependencies, and `decision_refs` linking back to the DRs they implement.
-3. Calls `dr_validate_graph` to confirm: no cycles, no orphan deps, no oversized estimates, every `decision_refs` resolves.
-
-You then review with the wizard: split tasks that are too big, merge tasks that are too small, fix anything missing. When the graph is clean, advance with your sign-off.
-
-### 5. Handing off
-
-The wizard renders the artifacts (`dr_render` regenerates Markdown + the static `index.html`) and asks where to hand off:
-
-**Linear (preferred)** — provide your team ID. The wizard:
-- First runs `dr_export_linear { dry_run: true }` to show you the plan.
-- On your confirm, runs without dry_run: creates a Linear Project, an Issue per decision (labeled `decision`), an Issue per task, and `blocks` relations matching `depends_on`.
-- Updates each task's `external_ref` so the local file knows the Linear identifier.
-
-**Filesystem only** — `dr_export_filesystem` finalizes the plan in place. The team picks up where they want.
-
-The project transitions to `handed-off`. The plugin's work is done; ongoing project management lives wherever you want.
-
-## Resuming an in-progress project
-
-Just run `/plan` again. The wizard's first move is `dr_status`, which discovers the existing project and jumps to the right phase. The state in `.dr/` is durable across sessions — restart-safe, agent-restart safe, machine-reboot safe.
-
-## Inspecting state
-
-```bash
-# Read project
-cat dr/project.json | jq
-
-# Read events (everything that's happened)
-tail -f .dr/events.jsonl | jq
-
-# Re-render artifacts
-# (in Claude Code:)
-# Use the dr_render MCP tool, or just run /plan and let the wizard refresh.
-
-# Open the rendered index
-open dr/index.html
-```
-
-## Common situations
-
-**"The wizard wants me to write more decisions, but my project is simple."**
-You're probably running with the wrong effort level. Re-init with `effort_level: 'poc'`, or override `min_decisions` via `dr_update_project`'s `gate_overrides`.
-
-**"`dr_advance` keeps failing with vague reasons."**
-The wizard returns the gate failures verbatim. Read them. They name the specific knob and the specific shortfall.
-
-**"I want to change my mind about a decision after acceptance."**
-You can re-open a decision by marking it `superseded` and pointing it at a new DR. The old DR stays on file (immutability matters); the new one carries the current state.
-
-**"My Linear export failed partway."**
-Linear creates issues incrementally — partial state may exist. Either delete the partial project in Linear and re-run, or fix the underlying issue and call `dr_export_linear` again (Note: the current implementation doesn't reconcile — a fresh export creates a fresh project. PR welcome.).
diff --git a/server/package-lock.json b/server/package-lock.json
index de0c7f9..3ac4ccc 100644
--- a/server/package-lock.json
+++ b/server/package-lock.json
@@ -11,6 +11,7 @@
       "dependencies": {
         "@modelcontextprotocol/sdk": "^1.0.0",
         "nanoid": "^5.0.0",
+        "openai": "^6.38.0",
         "zod": "^3.23.0"
       },
       "bin": {
@@ -1913,6 +1914,27 @@
         "wrappy": "1"
       }
     },
+    "node_modules/openai": {
+      "version": "6.38.0",
+      "resolved": "https://registry.npmjs.org/openai/-/openai-6.38.0.tgz",
+      "integrity": "sha512-AoMplt2UalrpgUDMh3L09QWjNRlgJPipclQvA6sYAaeF6nHNBMgmikAZGmcYLn8on4d9sQY9Q8bOLfrBS7Lc8g==",
+      "license": "Apache-2.0",
+      "bin": {
+        "openai": "bin/cli"
+      },
+      "peerDependencies": {
+        "ws": "^8.18.0",
+        "zod": "^3.25 || ^4.0"
+      },
+      "peerDependenciesMeta": {
+        "ws": {
+          "optional": true
+        },
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/parseurl": {
       "version": "1.3.3",
       "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
diff --git a/server/package.json b/server/package.json
index 1ae0fca..2bd82ce 100644
--- a/server/package.json
+++ b/server/package.json
@@ -5,7 +5,8 @@
   "license": "MIT",
   "type": "module",
   "bin": {
-    "decision-record-mcp": "dist/index.js"
+    "decision-record-mcp": "dist/index.js",
+    "decision-record": "dist/cli.js"
   },
   "main": "dist/index.js",
   "exports": {
@@ -22,7 +23,9 @@
     "dev": "tsx watch src/index.ts",
     "start": "node dist/index.js",
     "typecheck": "tsc --noEmit",
-    "test": "tsx --test src/**/*.test.ts"
+    "test": "npm run test:unit && npm run test:flow",
+    "test:unit": "node --import tsx/esm --test tests/unit-*.test.ts",
+    "test:flow": "node --import tsx/esm --test tests/flow-*.test.ts"
   },
   "engines": {
     "node": ">=20"
@@ -30,6 +33,7 @@
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.0.0",
     "nanoid": "^5.0.0",
+    "openai": "^6.38.0",
     "zod": "^3.23.0"
   },
   "devDependencies": {
diff --git a/server/src/cli.ts b/server/src/cli.ts
new file mode 100644
index 0000000..5da4fea
--- /dev/null
+++ b/server/src/cli.ts
@@ -0,0 +1,2 @@
+// CLI entrypoint — imports cli/index.ts for its side effects so tsup builds it as a separate bundle.
+import "./cli/index.js";
diff --git a/server/src/cli/agents/deciding.ts b/server/src/cli/agents/deciding.ts
new file mode 100644
index 0000000..08a9c17
--- /dev/null
+++ b/server/src/cli/agents/deciding.ts
@@ -0,0 +1,56 @@
+import OpenAI from "openai";
+import { LLMConfig } from "../../llm/client.js";
+import { runAgentTurn } from "../../llm/agent.js";
+
+const SYSTEM = `You are the deciding phase of an idea-to-MVP planning pipeline.
+
+Your one job: identify every significant decision this project needs to make, propose options, pick winners, and record them. You do NOT accept decisions — the orchestrator does that after running antagonistic review. You leave them as 'proposed' with a selected_position and argument.
+
+Workflow:
+1. Call \`dr_status\` to read the project's current state, including scope and any pre-existing decisions.
+2. Call \`dr_list_decisions\` to see what's already on file.
+3. For each project, identify 3-8 significant decisions (or however many the gate requires — see status.effective_gate_config.min_decisions). Significant means: would otherwise be re-litigated, has multiple defensible options, and load-bearing for the MVP.
+
+   For each decision:
+   a. **Check the seed library first.** Call \`dr_seed_search\` with a query relevant to the decision topic (e.g., 'language', 'data store', 'auth'). If a seed matches, use \`dr_seed_load\` to instantiate it — this gives you well-thought-out starter content.
+   b. **If no seed matches**, call \`dr_propose_decision\` with title, issue, 2-4 positions (each with title, description, pros, cons), assumptions, and constraints.
+   c. **Pick a position.** Call \`dr_update_decision\` with selected_position (matching one of the position titles) and a 1-2 sentence argument for why it wins.
+
+4. After each decision is selected, the orchestrator runs antagonistic review. If a review blocks, you may be called again to revise — but for now, don't accept anything.
+
+Constraints:
+- Stay inside the project's scope. Don't propose decisions about out-of-scope capabilities.
+- One DR per significant choice. Don't fragment one decision into many tiny ones.
+- Set \`depends_on\` when a decision logically follows another (e.g., 'runtime target' depends on 'language choice').
+
+When you've covered all the decisions you think this project needs, return a brief plain-text summary:
+- Total decisions proposed (count).
+- A line per decision: \`<id> — <title> → <selected_position>\`.
+- Any decisions you intentionally left out (and why).
+
+Be decisive. The human reviews at the next checkpoint.`; // System prompt for the deciding agent; passed as `system` to runAgentTurn by runDecidingAgent.
+
+export interface DecidingResult {
+  summary: string; // the agent's final text for the turn (turn.text)
+  toolCallCount: number; // how many tool calls the agent made during the turn
+}
+
+// Runs the deciding agent as a single agent turn driven by the SYSTEM prompt above.
+// `cwd` is forwarded as the tool context and `verbose` is passed straight through.
+// The turn is capped at 60 iterations because many decisions mean many tool calls.
+// Resolves with the agent's final text (`summary`) and its tool-call count.
+export async function runDecidingAgent(
+  client: OpenAI,
+  config: LLMConfig,
+  cwd: string,
+  verbose: boolean
+): Promise<DecidingResult> {
+  const kickoff =
+    "Please identify and propose all the decisions this project needs to make. Use dr_status to read scope first.";
+
+  const { text, toolCalls } = await runAgentTurn(
+    { client, config, system: SYSTEM, toolContext: { cwd }, verbose, maxIterations: 60 },
+    kickoff
+  );
+  return { summary: text, toolCallCount: toolCalls.length };
+}
diff --git a/server/src/cli/agents/decomposer.ts b/server/src/cli/agents/decomposer.ts
new file mode 100644
index 0000000..4fdc652
--- /dev/null
+++ b/server/src/cli/agents/decomposer.ts
@@ -0,0 +1,70 @@
+import OpenAI from "openai";
+import { LLMConfig } from "../../llm/client.js";
+import { runAgentTurn } from "../../llm/agent.js";
+
+const SYSTEM = `You are the decomposing phase of an idea-to-MVP planning pipeline. You turn accepted decisions into a beads-style task graph.
+
+Workflow:
+1. Call \`dr_status\` to read the project's scope and gate config — specifically \`effective_gate_config.max_task_estimate_hours\` and \`min_tasks\`.
+2. Call \`dr_list_decisions\` with \`status: ['accepted']\` and read full content via \`dr_get_decision\` for any that look load-bearing.
+3. Plan the graph end-to-end:
+   - Start with foundations (repo bootstrap, dependencies, config).
+   - Build up to user-visible features.
+   - Each task is atomic — under \`max_task_estimate_hours\` of work.
+   - Each task has acceptance_criteria (concrete done-when statements).
+   - Each task has decision_refs (which DRs it implements).
+   - Each task has depends_on for ordering.
+4. Create tasks via \`dr_propose_task\`. Order matters — create dependencies before dependents.
+5. Call \`dr_validate_graph\`. If it returns errors (cycles, orphans, oversized estimates, missing refs), fix them by calling \`dr_update_task\` and re-validating until clean.
+
+Principles:
+- **Vertical slices, not horizontal layers.** A task that ships a feature end-to-end is better than three tasks that each touch one layer but ship nothing alone.
+- **Every task has decision_refs.** If you can't link a task to an accepted DR, the project's decisions are incomplete — flag it in your summary.
+- **Stay in scope.** Out-of-scope items must NOT become tasks. If something seems necessary but isn't in_scope, raise it in your summary — don't quietly add it.
+- **Estimate honestly.** When unsure, set \`confidence: 'low'\` rather than padding hours.
+
+After the graph validates, return a plain-text summary:
+- Total tasks (count).
+- A line per task: \`<id> — <title> (<estimate>) [pri:<priority>] depends on: <ids> | implements: <decision ids>\`.
+- The critical path (a chain of tasks that must complete in order).
+- Any tasks you couldn't link to a decision (flagged for the human).`; // System prompt for the decomposer agent; passed as `system` to runAgentTurn by runDecomposerAgent.
+
+export interface DecomposerResult {
+  summary: string; // the agent's final text for the turn (turn.text)
+  toolCallCount: number; // how many tool calls the agent made during the turn
+  validationPassed: boolean; // true only if the last dr_validate_graph call reported ok + data.valid
+}
+
+// Runs the decomposer agent as a single agent turn driven by the SYSTEM prompt above,
+// capped at 100 iterations because building a task graph can take many tool calls.
+// Afterwards the last dr_validate_graph call made during the turn is inspected:
+// validationPassed is true only if its result text parses as JSON with a truthy `ok`
+// and a truthy `data.valid`. No such call, or unparseable text, yields false.
+export async function runDecomposerAgent(
+  client: OpenAI,
+  config: LLMConfig,
+  cwd: string,
+  verbose: boolean
+): Promise<DecomposerResult> {
+  const request =
+    "Please decompose the accepted decisions into a beads-style task graph. End by validating the graph.";
+  const turn = await runAgentTurn(
+    { client, config, system: SYSTEM, toolContext: { cwd }, verbose, maxIterations: 100 },
+    request
+  );
+
+  // Only the most recent dr_validate_graph call decides the outcome.
+  const validations = turn.toolCalls.filter((call) => call.name === "dr_validate_graph");
+  const latest = validations[validations.length - 1];
+  let validationPassed = false;
+  if (latest) {
+    try {
+      const envelope = JSON.parse(latest.resultText) as { ok?: boolean; data?: { valid?: boolean } };
+      validationPassed = Boolean(envelope.ok && envelope.data?.valid);
+    } catch {
+      validationPassed = false;
+    }
+  }
+
+  return { summary: turn.text, toolCallCount: turn.toolCalls.length, validationPassed };
+}
diff --git a/server/src/cli/agents/scoping.ts b/server/src/cli/agents/scoping.ts
new file mode 100644
index 0000000..4f8434e
--- /dev/null
+++ b/server/src/cli/agents/scoping.ts
@@ -0,0 +1,58 @@
+import OpenAI from "openai";
+import { LLMConfig } from "../../llm/client.js";
+import { runAgentTurn } from "../../llm/agent.js";
+
+const SYSTEM = `You are the scoping phase of an idea-to-MVP planning pipeline.
+
+Your one job: turn a project description into a sharp MVP scope, written into the project's state.
+
+You have access to MCP tools. Use them. Specifically:
+1. Call \`dr_status\` first to learn the project's title, description, effort_level, and current scope (which may be partially populated already).
+2. Read any PRD context the user supplies in the initial message.
+3. Synthesize four lists:
+   - **in_scope**: 3-5 must-ship capabilities. Concrete, not aspirational.
+   - **success_criteria**: 2-4 measurable signals the MVP worked.
+   - **out_of_scope**: 2-5 deliberately deferred capabilities. Be explicit about what you're NOT building.
+   - **nice_to_have**: 0-3 optional items that may slip in if scope allows.
+4. Call \`dr_update_scope\` once with all four lists.
+5. If the project's effort_level is 'mvp' or 'full', also instantiate the \`scope-statement\` seed DR:
+   - Call \`dr_seed_load\` with seed_name='scope-statement'.
+   - Choose a position based on the project's nature: 'Lean MVP', 'Walking-skeleton MVP', or 'Polished MVP'.
+   - Call \`dr_update_decision\` to set \`selected_position\` and \`argument\` (one paragraph: why this shape fits this project).
+   - Do NOT accept it yet — leave status 'proposed'. The orchestrator handles acceptance after review.
+
+Once you've made every tool call, return a brief plain-text summary:
+- The chosen scope as four bullet lists.
+- For mvp/full presets: the scoping DR id and the selected position.
+
+Be decisive. Don't hedge. The orchestrator will surface your output to the human for sign-off; revisions happen there, not here.`; // System prompt for the scoping agent; passed as `system` to runAgentTurn by runScopingAgent.
+
+export interface ScopingResult {
+  summary: string; // the agent's final text for the turn (turn.text)
+  toolCallCount: number; // how many tool calls the agent made during the turn
+}
+
+// Runs the scoping agent as a single agent turn driven by the SYSTEM prompt above,
+// capped at 16 iterations. `cwd` is forwarded as the tool context.
+// When `prdContext` is a non-empty string it is appended to the opening user message;
+// otherwise (null or empty) the agent is told to work from dr_status alone.
+// Resolves with the agent's final text (`summary`) and its tool-call count.
+export async function runScopingAgent(
+  client: OpenAI,
+  config: LLMConfig,
+  cwd: string,
+  prdContext: string | null,
+  verbose: boolean
+): Promise<ScopingResult> {
+  let userMessage =
+    "Please scope this project. Read the project's current state with dr_status and produce the four-list scope.";
+  if (prdContext) {
+    userMessage = `Please scope this project. The project state already has a title and description; use dr_status to read them. Additional PRD context:\n\n${prdContext}`;
+  }
+
+  const { text, toolCalls } = await runAgentTurn(
+    { client, config, system: SYSTEM, toolContext: { cwd }, verbose, maxIterations: 16 },
+    userMessage
+  );
+  return { summary: text, toolCallCount: toolCalls.length };
+}
diff --git a/server/src/cli/agents/skeptic.ts b/server/src/cli/agents/skeptic.ts
new file mode 100644
index 0000000..e98daf7
--- /dev/null
+++ b/server/src/cli/agents/skeptic.ts
@@ -0,0 +1,103 @@
+import OpenAI from "openai";
+import { LLMConfig } from "../../llm/client.js";
+import { runAgentTurn } from "../../llm/agent.js";
+
+/** The review lenses a skeptic can apply; the `Lens` type is derived from this tuple. */
+const LENSES = ["operational", "strategic", "security", "cost", "user-impact"] as const;
+/** Union of the literal lens names listed in `LENSES`. */
+export type Lens = (typeof LENSES)[number];
+
+/** Exported alias of `LENSES` for callers that iterate over every lens. */
+export const ALL_LENSES = LENSES;
+
+/**
+ * Build the system prompt for a skeptic reviewing through a single lens.
+ *
+ * The prompt interpolates the lens name and that lens's guidance questions
+ * into a fixed review workflow that ends with a `dr_review_decision` call.
+ *
+ * @param lens The lens the reviewer should apply.
+ * @returns The complete system prompt text.
+ */
+function systemFor(lens: Lens): string {
+  // One set of guiding questions per lens; `Record<Lens, string>` makes the
+  // compiler require an entry for every lens.
+  const lensGuidance: Record<Lens, string> = {
+    operational:
+      "Can the team actually maintain this? What's the on-call cost? What breaks at 3am? Who owns each operational concern?",
+    strategic:
+      "Does this advance the business goal? Is it differentiated? Is the timing right? What's the opportunity cost?",
+    security:
+      "What's the attack surface? What data is exposed? What new compliance hooks? What's the worst-case breach impact?",
+    cost:
+      "Total cost of ownership over 12 months. Hidden costs (people, time, licenses). Migration costs if we're wrong.",
+    "user-impact":
+      "How does this feel to the user? Does it create friction? Could it break trust? Is the upgrade/migration painful?",
+  };
+
+  return `You are dr-skeptic — an antagonistic reviewer applying the ${lens} lens.
+
+${lensGuidance[lens]}
+
+Your job: stress-test the decision. Find what's wrong before it's locked in. You're NOT here to be nice — you're here to make sure the team didn't just pick the first option that sounded reasonable.
+
+Workflow:
+1. Call \`dr_get_decision\` with the decision id you're given.
+2. Examine: title, issue, assumptions, constraints, positions, selected_position, argument, implications.
+3. Stress-test the argument through the ${lens} lens:
+   - What assumptions are unstated?
+   - What positions were dismissed without serious consideration?
+   - What edge cases would break this choice?
+   - What's the cost of being wrong, and how easily is the decision reversible?
+4. Call \`dr_review_decision\` with:
+   - \`reviewer: 'dr-skeptic'\`
+   - \`lens: '${lens}'\`
+   - \`verdict: 'pass' | 'block'\`
+   - \`score: 1-5\` (1=blocking concerns, 5=enthusiastic)
+   - \`concerns: [...]\` (crisp one-line statements — concrete, actionable, not vague)
+
+Pass only if you genuinely tried to break the decision and failed. If \`argument\` is empty or weak, score it low and demand more.
+
+After the tool call, return one or two sentences summarizing your verdict.`;
+}
+
+/** One skeptic review of one decision through one lens, as returned by `runSkepticAgent`. */
+export interface SkepticReview {
+  /** Lens the review was run under. */
+  lens: Lens;
+  /** Whether the reviewer let the decision through or blocked it. */
+  verdict: "pass" | "block";
+  /** Score reported by the reviewer (the prompt asks for a 1-5 scale). */
+  score: number;
+  /** Concerns the reviewer raised; may be empty. */
+  concerns: string[];
+  /** Final assistant text of the review turn. */
+  summary: string;
+}
+
+/**
+ * Run one skeptic review of a decision through a single lens.
+ *
+ * The agent is limited to the read/list/review decision tools and is asked to
+ * record its verdict with `dr_review_decision`. The returned review is read
+ * back from the arguments of that tool call:
+ *
+ * - If the agent never called `dr_review_decision`, the review is a block with
+ *   score 1 and a concern explaining that the review is missing.
+ * - If the agent called it several times (for example, retrying after a
+ *   rejected call), the LAST call is used, since it is the agent's final word.
+ * - Arguments are validated at runtime because they come straight from the
+ *   model: anything other than verdict "pass" counts as "block", a missing or
+ *   non-numeric score becomes 1, scores are clamped to 1-5, and non-string
+ *   concerns are dropped.
+ *
+ * @param client OpenAI client handed through to the agent loop.
+ * @param config LLM configuration handed through to the agent loop.
+ * @param cwd Project directory, passed to tools as the tool context.
+ * @param decisionId Id of the decision to review.
+ * @param lens Lens to review through.
+ * @param verbose Forwarded to the agent loop's `verbose` option.
+ * @returns The verdict, score and concerns for this lens, plus the agent's closing text.
+ */
+export async function runSkepticAgent(
+  client: OpenAI,
+  config: LLMConfig,
+  cwd: string,
+  decisionId: string,
+  lens: Lens,
+  verbose: boolean
+): Promise<SkepticReview> {
+  const turn = await runAgentTurn(
+    {
+      client,
+      config,
+      system: systemFor(lens),
+      toolContext: { cwd },
+      verbose,
+      maxIterations: 8,
+      toolFilter: {
+        include: ["dr_get_decision", "dr_review_decision", "dr_list_decisions"],
+      },
+    },
+    `Review decision \`${decisionId}\` through the ${lens} lens. Record your verdict via dr_review_decision.`
+  );
+
+  // Use the last review call: earlier ones may be attempts the agent revised.
+  const reviewCalls = turn.toolCalls.filter((c) => c.name === "dr_review_decision");
+  const reviewCall = reviewCalls[reviewCalls.length - 1];
+  if (!reviewCall) {
+    return {
+      lens,
+      verdict: "block",
+      score: 1,
+      concerns: ["Skeptic agent did not call dr_review_decision — review missing."],
+      summary: turn.text || "Skeptic produced no output.",
+    };
+  }
+
+  // Model-supplied arguments are untrusted; narrow each field instead of casting.
+  const args: Record<string, unknown> = reviewCall.args;
+  const verdict: SkepticReview["verdict"] = args.verdict === "pass" ? "pass" : "block";
+  const rawScore = args.score;
+  const score =
+    typeof rawScore === "number" && Number.isFinite(rawScore)
+      ? Math.min(5, Math.max(1, rawScore))
+      : 1; // same floor as the missing-review fallback above
+  const concerns = Array.isArray(args.concerns)
+    ? args.concerns.filter((c): c is string => typeof c === "string")
+    : [];
+
+  return {
+    lens,
+    verdict,
+    score,
+    concerns,
+    summary: turn.text,
+  };
+}
diff --git a/server/src/cli/checkpoints.ts b/server/src/cli/checkpoints.ts
new file mode 100644
index 0000000..35f2e6c
--- /dev/null
+++ b/server/src/cli/checkpoints.ts
@@ -0,0 +1,82 @@
+import { createInterface } from "node:readline/promises";
+
+// ANSI escape sequences used to color and style the terminal output in this file.
+const GREEN = "\x1b[32m";
+const YELLOW = "\x1b[33m";
+const RED = "\x1b[31m";
+const BLUE = "\x1b[34m";
+const DIM = "\x1b[2m";
+const BOLD = "\x1b[1m";
+// Appended after styled text to switch all styling back off.
+const RESET = "\x1b[0m";
+
+/** Options shared by the interactive checkpoint prompts (`confirm` and `ask`). */
+export interface CheckpointOptions {
+  /** Skip interactive prompt and auto-confirm (for --yes / fully autonomous mode). */
+  autoYes: boolean;
+}
+
+/**
+ * Ask a yes/no question on stderr and read the answer from stdin.
+ *
+ * In auto-yes mode nothing is read: the prompt is echoed with an `[auto-yes]`
+ * marker and the answer is `true`. Otherwise an empty answer yields
+ * `defaultYes`, "y" or "yes" (any case, surrounding whitespace ignored) yields
+ * `true`, and every other answer yields `false`.
+ *
+ * @param prompt Question text shown to the user.
+ * @param options Checkpoint options; `autoYes` bypasses the prompt.
+ * @param defaultYes Answer used when the user enters nothing.
+ * @returns Whether the user confirmed.
+ */
+export async function confirm(
+  prompt: string,
+  options: CheckpointOptions,
+  defaultYes = true
+): Promise<boolean> {
+  if (options.autoYes) {
+    process.stderr.write(`${BLUE}>${RESET} ${prompt} ${DIM}[auto-yes]${RESET}\n`);
+    return true;
+  }
+
+  const choices = defaultYes ? "[Y/n]" : "[y/N]";
+  const reader = createInterface({ input: process.stdin, output: process.stderr });
+  try {
+    const reply = await reader.question(`${BLUE}>${RESET} ${prompt} ${choices} `);
+    const normalized = reply.trim().toLowerCase();
+    if (normalized.length === 0) return defaultYes;
+    return ["y", "yes"].includes(normalized);
+  } finally {
+    // Always close the interface, whether the question resolved or threw.
+    reader.close();
+  }
+}
+
+/**
+ * Ask a free-text question on stderr and read the answer from stdin.
+ *
+ * In auto-yes mode nothing is read: the prompt is echoed together with the
+ * fallback and the fallback is returned. Otherwise the trimmed answer is
+ * returned, or the fallback when the trimmed answer is empty.
+ *
+ * @param prompt Question text shown to the user.
+ * @param options Checkpoint options; `autoYes` bypasses the prompt.
+ * @param fallback Value used in auto-yes mode or for an empty answer.
+ * @returns The user's answer or the fallback.
+ */
+export async function ask(
+  prompt: string,
+  options: CheckpointOptions,
+  fallback = ""
+): Promise<string> {
+  if (options.autoYes) {
+    process.stderr.write(`${BLUE}>${RESET} ${prompt} ${DIM}[auto: '${fallback}']${RESET}\n`);
+    return fallback;
+  }
+
+  const reader = createInterface({ input: process.stdin, output: process.stderr });
+  try {
+    const reply = (await reader.question(`${BLUE}>${RESET} ${prompt} `)).trim();
+    return reply === "" ? fallback : reply;
+  } finally {
+    // Always close the interface, whether the question resolved or threw.
+    reader.close();
+  }
+}
+
+/** Write one line of terminal output to stderr, appending the newline. */
+function writeLine(line: string): void {
+  process.stderr.write(`${line}\n`);
+}
+
+/** Print a bold blue section banner, preceded by a blank line. */
+export function header(text: string): void {
+  writeLine(`\n${BOLD}${BLUE}━━━ ${text} ━━━${RESET}`);
+}
+
+/** Print a dimmed informational line. */
+export function info(text: string): void {
+  writeLine(`${DIM}${text}${RESET}`);
+}
+
+/** Print a line prefixed with a green check mark. */
+export function success(text: string): void {
+  writeLine(`${GREEN}✓${RESET} ${text}`);
+}
+
+/** Print a line prefixed with a yellow exclamation mark. */
+export function warn(text: string): void {
+  writeLine(`${YELLOW}!${RESET} ${text}`);
+}
+
+/** Print a line prefixed with a red cross. */
+export function error(text: string): void {
+  writeLine(`${RED}✗${RESET} ${text}`);
+}
+
+/** Print an indented bullet-point line. */
+export function bullet(text: string): void {
+  writeLine(`  ${DIM}•${RESET} ${text}`);
+}
+
+/** Print a dimmed horizontal rule, 60 characters wide. */
+export function divider(): void {
+  writeLine(`${DIM}${"─".repeat(60)}${RESET}`);
+}
diff --git a/server/src/cli/index.ts b/server/src/cli/index.ts
new file mode 100644
index 0000000..b3c1d90
--- /dev/null
+++ b/server/src/cli/index.ts
@@ -0,0 +1,232 @@
+import { resolve } from "node:path";
+import { makeClient, resolveConfig } from "../llm/client.js";
+import { registerAllTools } from "../tools/index.js";
+import { runPipeline } from "./orchestrator.js";
+import { readPRD, PRDDigest } from "./prd.js";
+import { error, header, info } from "./checkpoints.js";
+
+/** Command-line arguments after parsing; see `HELP` for the user-facing description of each flag. */
+interface ParsedArgs {
+  /** Free-form idea text (`--idea`, or the first positional argument). */
+  idea?: string;
+  /** Explicit project title (`--title`). */
+  title?: string;
+  /** Explicit project description (`--description`). */
+  description?: string;
+  /** Path to a PRD file (`--prd`). */
+  prdPath?: string;
+  /** Target project directory; `--cwd` resolved to an absolute path, else the process cwd. */
+  cwd: string;
+  /** Gate strictness preset (`--effort`); defaults to "mvp". */
+  effortLevel: "poc" | "mvp" | "full";
+  /** LLM model name override (`--model`). */
+  model?: string;
+  /** API key override (`--api-key`). */
+  apiKey?: string;
+  /** Base URL override (`--base-url`). */
+  baseURL?: string;
+  /** Resume the existing project instead of running intake (`--resume`). */
+  resume: boolean;
+  /** Bypass interactive checkpoints (`--yes` / `-y`). */
+  autoYes: boolean;
+  /** Verbose agent output (`--verbose` / `-v`). */
+  verbose: boolean;
+  /** Print help and exit (`--help` / `-h`). */
+  help: boolean;
+  /** Print the version and exit (`--version`). */
+  version: boolean;
+}
+
+/** CLI version string, printed by `--version` and in the startup banner. */
+const VERSION = "0.1.0";
+
+/** Usage text printed for `--help` (stdout) and after an argument-parsing error (stderr). */
+const HELP = `decision-record — idea-to-MVP planning CLI
+
+Usage:
+  decision-record [options]                 Start a new project (interactive)
+  decision-record --idea "..."              Start with a free-form idea
+  decision-record --prd <file>              Start from a PRD markdown file
+  decision-record --resume                  Resume the project in --cwd (or process.cwd())
+
+Options:
+  --idea TEXT             Free-form one-line idea (will derive title + description).
+  --title TEXT            Explicit project title.
+  --description TEXT      Explicit project description.
+  --prd PATH              Read a Markdown PRD as scope context. Combinable with --idea.
+  --cwd PATH              Target project directory (default: cwd). State lands under .dr/ and dr/.
+  --effort poc|mvp|full   Gate strictness preset (default: mvp).
+  --model NAME            LLM model name (default: $OPENAI_MODEL or gpt-4o).
+  --api-key KEY           OpenAI-compat API key (default: $OPENAI_API_KEY).
+  --base-url URL          OpenAI-compat base URL (default: $OPENAI_BASE_URL or api.openai.com).
+  --resume                Skip intake; pick up the existing project in --cwd.
+  --yes, -y               Bypass interactive checkpoints (fully autonomous).
+  --verbose, -v           Stream agent reasoning and tool calls to stderr.
+  --help, -h              Show this help.
+  --version               Print version.
+
+Environment:
+  OPENAI_API_KEY          Required unless --api-key is passed.
+  OPENAI_BASE_URL         Optional. Set for OpenRouter, vLLM, Ollama, LiteLLM, etc.
+  OPENAI_MODEL            Optional. Default model name.
+  LINEAR_API_KEY          Optional. Enables Linear handoff target.
+  LINEAR_TEAM_ID          Optional. Pre-fills the Linear team ID prompt.
+
+Examples:
+  decision-record --idea "a CLI for QuickBooks CSV → ledger normalization" --effort poc
+  decision-record --prd ./docs/idea.md --effort mvp --yes
+  decision-record --cwd ./my-project --resume
+`;
+
+/**
+ * Parse the CLI argument vector (without the node/script entries).
+ *
+ * Flags that take a value consume the following argument. The first bare
+ * positional argument is used as the idea when neither an idea nor a title
+ * has been set yet. Anything else that is not a known flag is rejected,
+ * including unknown short flags such as `-x`, which must never be mistaken
+ * for a positional idea.
+ *
+ * @param argv Arguments as given on the command line.
+ * @returns The parsed arguments with defaults filled in.
+ * @throws Error when a flag is missing its value, `--effort` has an invalid
+ *   value, or an argument is not recognized.
+ */
+function parseArgs(argv: string[]): ParsedArgs {
+  const out: ParsedArgs = {
+    cwd: process.cwd(),
+    effortLevel: "mvp",
+    resume: false,
+    autoYes: false,
+    verbose: false,
+    help: false,
+    version: false,
+  };
+  for (let i = 0; i < argv.length; i++) {
+    const a = argv[i];
+    // Consume and return the value that follows the current flag.
+    const next = () => {
+      const v = argv[++i];
+      if (v === undefined) throw new Error(`Missing value for ${a}`);
+      return v;
+    };
+    switch (a) {
+      case "--idea":
+        out.idea = next();
+        break;
+      case "--title":
+        out.title = next();
+        break;
+      case "--description":
+        out.description = next();
+        break;
+      case "--prd":
+        out.prdPath = next();
+        break;
+      case "--cwd":
+        out.cwd = resolve(next());
+        break;
+      case "--effort": {
+        const v = next();
+        if (v !== "poc" && v !== "mvp" && v !== "full") {
+          throw new Error(`--effort must be poc | mvp | full (got ${v})`);
+        }
+        out.effortLevel = v;
+        break;
+      }
+      case "--model":
+        out.model = next();
+        break;
+      case "--api-key":
+        out.apiKey = next();
+        break;
+      case "--base-url":
+        out.baseURL = next();
+        break;
+      case "--resume":
+        out.resume = true;
+        break;
+      case "--yes":
+      case "-y":
+        out.autoYes = true;
+        break;
+      case "--verbose":
+      case "-v":
+        out.verbose = true;
+        break;
+      case "--help":
+      case "-h":
+        out.help = true;
+        break;
+      case "--version":
+        out.version = true;
+        break;
+      default:
+        // First positional is treated as --idea when --idea isn't set.
+        // Anything starting with "-" is a flag, never a positional, so an
+        // unknown short flag is reported instead of becoming the idea.
+        if (a && !a.startsWith("-") && !out.idea && !out.title) {
+          out.idea = a;
+        } else if (a) {
+          throw new Error(`Unknown argument: ${a}`);
+        }
+    }
+  }
+  return out;
+}
+
+/**
+ * CLI entry point: parse arguments, load the optional PRD, derive the title
+ * and description, build the LLM client, and run the pipeline.
+ *
+ * @returns The process exit code: 0 for help/version, 2 for argument or
+ *   configuration errors, 1 when the PRD cannot be read, otherwise whatever
+ *   the pipeline reports.
+ */
+async function main(): Promise<number> {
+  let args: ParsedArgs;
+  try {
+    args = parseArgs(process.argv.slice(2));
+  } catch (err) {
+    // Bad usage: show the problem followed by the help text, on stderr.
+    error(err instanceof Error ? err.message : String(err));
+    process.stderr.write(HELP);
+    return 2;
+  }
+  if (args.help) {
+    process.stdout.write(HELP);
+    return 0;
+  }
+  if (args.version) {
+    process.stdout.write(`decision-record ${VERSION}\n`);
+    return 0;
+  }
+
+  registerAllTools();
+
+  let prd: PRDDigest | null = null;
+  if (args.prdPath) {
+    try {
+      prd = await readPRD(args.prdPath);
+      info(`Loaded PRD: ${args.prdPath} (${prd.raw.length} chars).`);
+    } catch (err) {
+      error(`Could not read PRD at ${args.prdPath}: ${err instanceof Error ? err.message : String(err)}`);
+      return 1;
+    }
+  }
+
+  // Derive title/description for a new project. Precedence for the title:
+  // --title, then the PRD's title hint, then the idea (shortened to 80 chars).
+  // Precedence for the description: --description, then the idea, then the
+  // PRD's description hint. Skipped entirely when resuming.
+  let title = args.title;
+  let description = args.description;
+  if (!args.resume) {
+    if (!title && prd?.title_hint) title = prd.title_hint;
+    if (!title && args.idea) {
+      title = args.idea.length > 80 ? args.idea.slice(0, 77) + "…" : args.idea;
+    }
+    if (!description) {
+      if (args.idea) description = args.idea;
+      else if (prd?.description_hint) description = prd.description_hint;
+    }
+  }
+
+  let config;
+  let client;
+  try {
+    // Only pass overrides that were actually given on the command line.
+    config = resolveConfig({
+      ...(args.model !== undefined && { model: args.model }),
+      ...(args.apiKey !== undefined && { apiKey: args.apiKey }),
+      ...(args.baseURL !== undefined && { baseURL: args.baseURL }),
+    });
+    client = makeClient(config);
+  } catch (err) {
+    error(err instanceof Error ? err.message : String(err));
+    return 2;
+  }
+
+  header(`decision-record v${VERSION}`);
+  info(`Target: ${args.cwd}`);
+  info(`Model: ${config.model}${config.baseURL ? ` @ ${config.baseURL}` : ""}`);
+  if (args.autoYes) info("Mode: autonomous (--yes; checkpoints bypassed)");
+
+  const outcome = await runPipeline(
+    {
+      cwd: args.cwd,
+      client,
+      config,
+      autoYes: args.autoYes,
+      verbose: args.verbose,
+    },
+    {
+      ...(title !== undefined && { title }),
+      ...(description !== undefined && { description }),
+      effortLevel: args.effortLevel,
+      prd,
+      resume: args.resume,
+    }
+  );
+
+  return outcome.exitCode;
+}
+
+// Run the CLI: exit with the code main() resolves to; a rejection is reported
+// on stderr and exits with status 1.
+main()
+  .then((code) => process.exit(code))
+  .catch((err) => {
+    error(err instanceof Error ? err.message : String(err));
+    process.exit(1);
+  });
diff --git a/server/src/cli/orchestrator.ts b/server/src/cli/orchestrator.ts
new file mode 100644
index 0000000..4828a0f
--- /dev/null
+++ b/server/src/cli/orchestrator.ts
@@ -0,0 +1,415 @@
+import OpenAI from "openai";
+import { LLMConfig } from "../llm/client.js";
+import { executeAgentTool } from "../llm/tools.js";
+import {
+  CheckpointOptions,
+  ask,
+  bullet,
+  confirm,
+  divider,
+  error,
+  header,
+  info,
+  success,
+  warn,
+} from "./checkpoints.js";
+import { PRDDigest } from "./prd.js";
+import { runScopingAgent } from "./agents/scoping.js";
+import { runDecidingAgent } from "./agents/deciding.js";
+import { ALL_LENSES, runSkepticAgent } from "./agents/skeptic.js";
+import { runDecomposerAgent } from "./agents/decomposer.js";
+
+/** Everything the pipeline needs to run: checkpoint behavior plus project and LLM wiring. */
+export interface OrchestratorOptions extends CheckpointOptions {
+  /** Project directory; passed to every tool call and agent. */
+  cwd: string;
+  /** OpenAI client handed to the agents. */
+  client: OpenAI;
+  /** LLM configuration handed to the agents. */
+  config: LLMConfig;
+  /** Forwarded to the agents' `verbose` parameter. */
+  verbose: boolean;
+}
+
+/** Result of `runPipeline`. */
+export interface RunOutcome {
+  /** Exit code for the process: 0 on completion, non-zero when a step failed or input was invalid. */
+  exitCode: number;
+  /** Phase the pipeline was in when it returned, or "(none)" when no project was available. */
+  finalPhase: string;
+}
+
+/**
+ * Drive a project through its phases until it is handed off, a step fails,
+ * or the user stops at a checkpoint.
+ *
+ * First the project is located: an existing project is always resumed (with a
+ * warning when `ctx.resume` was not requested); otherwise a new project is
+ * initialized from `ctx.title` / `ctx.description` / `ctx.effortLevel`.
+ * Then the loop reads the current phase from `dr_status`, runs the matching
+ * phase handler, and repeats.
+ *
+ * A handler can return exit code 0 without advancing the phase (the user
+ * declined a checkpoint or cancelled the export). The loop detects that the
+ * phase did not move and stops, instead of re-running the same phase — and
+ * its LLM agents — indefinitely.
+ *
+ * @param opts Project directory, LLM client/config and checkpoint options.
+ * @param ctx Intake data for a new project, the optional PRD, and the resume flag.
+ * @returns The exit code and the phase the pipeline stopped in.
+ * @throws Error when `dr_status` fails while the pipeline is walking phases.
+ */
+export async function runPipeline(
+  opts: OrchestratorOptions,
+  ctx: {
+    title?: string;
+    description?: string;
+    effortLevel?: "poc" | "mvp" | "full";
+    prd?: PRDDigest | null;
+    resume: boolean;
+  }
+): Promise<RunOutcome> {
+  // 1. Resume check
+  const status = await callTool(opts.cwd, "dr_status", {});
+  const hasProject = status.ok;
+
+  if (hasProject) {
+    if (!ctx.resume) {
+      warn(
+        `A project is already initialized in ${opts.cwd}. Treating this as a resume.`
+      );
+    } else {
+      info(`Resuming existing project in ${opts.cwd}.`);
+    }
+  } else {
+    if (ctx.resume) {
+      error(`No project found in ${opts.cwd}. Nothing to resume.`);
+      return { exitCode: 2, finalPhase: "(none)" };
+    }
+    if (!ctx.title) {
+      error("Title is required to start a new project (pass --title or --idea).");
+      return { exitCode: 2, finalPhase: "(none)" };
+    }
+    header("Phase: Intake");
+    const initRes = await callTool(opts.cwd, "dr_init", {
+      title: ctx.title,
+      description: ctx.description ?? "",
+      effort_level: ctx.effortLevel ?? "mvp",
+    });
+    if (!initRes.ok) {
+      error(`dr_init failed: ${(initRes.errors ?? []).join("; ")}`);
+      return { exitCode: 1, finalPhase: "intake" };
+    }
+    const initData = initRes.data as { project: { id: string; effort_level: string } };
+    success(`Initialized '${initData.project.id}' at effort_level=${initData.project.effort_level}`);
+  }
+
+  // 2. Walk forward through phases.
+  let previousPhase: string | null = null;
+  while (true) {
+    const cur = await getStatus(opts.cwd);
+    const phase = cur.state.phase;
+    const nextPhase = cur.state.next_phase;
+    if (!nextPhase || phase === "handed-off") {
+      success(`Pipeline complete. Final phase: ${phase}`);
+      return { exitCode: 0, finalPhase: phase };
+    }
+    if (phase === previousPhase) {
+      // The last handler reported success but the phase did not move: the
+      // user declined a checkpoint or cancelled. Stop here rather than
+      // re-running the same phase in an endless loop.
+      info(`Stopped at phase '${phase}'. Re-run to resume.`);
+      return { exitCode: 0, finalPhase: phase };
+    }
+    previousPhase = phase;
+
+    info(`Current phase: ${phase} → next: ${nextPhase}`);
+    let workResult: { exitCode: number };
+    switch (phase) {
+      case "intake":
+        workResult = await advanceIntake(opts, cur, nextPhase);
+        break;
+      case "scoping":
+        workResult = await advanceScoping(opts, ctx.prd ?? null);
+        break;
+      case "deciding":
+        workResult = await advanceDeciding(opts);
+        break;
+      case "decomposing":
+        workResult = await advanceDecomposing(opts);
+        break;
+      case "handing-off":
+        workResult = await advanceHandoff(opts);
+        break;
+      default:
+        error(`Unknown phase '${phase}'`);
+        return { exitCode: 1, finalPhase: phase };
+    }
+    if (workResult.exitCode !== 0) {
+      return { exitCode: workResult.exitCode, finalPhase: phase };
+    }
+  }
+}
+
+/**
+ * Leave the intake phase: advance to scoping, asking for human sign-off first
+ * when the gate configuration requires it for `nextPhase`.
+ *
+ * @param opts Orchestrator options (project directory and checkpoint behavior).
+ * @param status Current `dr_status` payload, used for its gate configuration.
+ * @param nextPhase Phase that follows intake according to the status.
+ * @returns Exit code of the advance attempt.
+ */
+async function advanceIntake(
+  opts: OrchestratorOptions,
+  status: StatusData,
+  nextPhase: string
+): Promise<{ exitCode: number }> {
+  const signoffRequired = needsHumanSignoffFor(status, nextPhase);
+  return advancePhase(opts, "intake → scoping", signoffRequired);
+}
+
+/**
+ * Run the scoping phase: let the scoping agent fill in the scope, print its
+ * summary, then advance to deciding if the gate allows it.
+ *
+ * The PRD, when present, is passed to the agent truncated to its first 4000
+ * characters. Sign-off-only gate failures are ignored here because sign-off
+ * is collected by `advancePhase`.
+ *
+ * @param opts Orchestrator options.
+ * @param prd Parsed PRD to give the agent as context, or `null`.
+ * @returns Exit code 1 when the gate still has real failures, otherwise the
+ *   result of the advance attempt.
+ * @throws Error when `dr_status` fails after the agent has run.
+ */
+async function advanceScoping(
+  opts: OrchestratorOptions,
+  prd: PRDDigest | null
+): Promise<{ exitCode: number }> {
+  header("Phase: Scoping");
+  info("Running scoping agent…");
+  const prdContext = prd
+    ? `PRD (excerpt):\n${prd.raw.slice(0, 4000)}${prd.raw.length > 4000 ? "\n…[truncated]" : ""}`
+    : null;
+  const result = await runScopingAgent(opts.client, opts.config, opts.cwd, prdContext, opts.verbose);
+  success(`Scoping agent finished (${result.toolCallCount} tool calls).`);
+  divider();
+  process.stderr.write(result.summary + "\n");
+  divider();
+
+  // Use getStatus (like the other phases) so a failed dr_status surfaces as a
+  // clear error instead of a crash on a missing payload.
+  const status = await getStatus(opts.cwd);
+  const failures = realGateFailures(status);
+  if (failures.length > 0) {
+    // What follows are the unmet gate conditions, not the agent's output.
+    warn("Scoping gate is not yet passable. Unmet gate conditions:");
+    for (const r of failures) bullet(r);
+    return { exitCode: 1 };
+  }
+  return advancePhase(opts, "scoping → deciding", needsHumanSignoffFor(status, "deciding"));
+}
+
+/**
+ * Run the deciding phase: let the deciding agent propose decisions, put every
+ * proposed decision through a skeptic review under each lens, accept or
+ * reject it, then advance to decomposing if the gate allows it.
+ *
+ * A decision with no blocking review is accepted automatically. A decision
+ * with at least one block is rejected unless the user types `accept` at the
+ * override prompt (in auto-yes mode the prompt's fallback is `reject`).
+ * Every accept/reject tool call is checked, so a failed call is reported as
+ * a warning instead of being announced as a success.
+ *
+ * @param opts Orchestrator options.
+ * @returns Exit code 1 when the gate still has real failures, otherwise the
+ *   result of the advance attempt.
+ * @throws Error when `dr_status` fails after the reviews.
+ */
+async function advanceDeciding(opts: OrchestratorOptions): Promise<{ exitCode: number }> {
+  header("Phase: Deciding");
+  info("Running deciding agent (proposing decisions)…");
+  const result = await runDecidingAgent(opts.client, opts.config, opts.cwd, opts.verbose);
+  success(`Deciding agent finished (${result.toolCallCount} tool calls).`);
+  divider();
+  process.stderr.write(result.summary + "\n");
+  divider();
+
+  // Lens-rotating review for every proposed decision.
+  const proposed = await listDecisions(opts.cwd, "proposed");
+  if (proposed.length === 0) {
+    warn("No decisions in 'proposed' state to review.");
+  } else {
+    header(`Antagonistic review: ${proposed.length} decisions × ${ALL_LENSES.length} lenses`);
+    for (const d of proposed) {
+      info(`Reviewing ${d.id} — ${d.title}`);
+      let anyBlock = false;
+      for (const lens of ALL_LENSES) {
+        const review = await runSkepticAgent(
+          opts.client,
+          opts.config,
+          opts.cwd,
+          d.id,
+          lens,
+          opts.verbose
+        );
+        if (review.verdict === "block") {
+          anyBlock = true;
+          warn(`  ${lens}: BLOCK (${review.score}/5) — ${review.concerns.join("; ")}`);
+        } else {
+          info(`  ${lens}: pass (${review.score}/5)`);
+        }
+      }
+      if (anyBlock) {
+        warn(`${d.id} has blocking concerns. Will not auto-accept.`);
+        const decision = await ask(
+          `Override and accept ${d.id} anyway? (type 'accept' to override, anything else to reject)`,
+          opts,
+          "reject"
+        );
+        if (decision === "accept") {
+          const accept = await callTool(opts.cwd, "dr_accept_decision", {
+            id: d.id,
+            sign_off_by: "human",
+            sign_off_actor: "cli-user",
+            sign_off_notes: "Accepted with blocking review concerns overridden.",
+          });
+          if (accept.ok) {
+            success(`Accepted ${d.id} with human override.`);
+          } else {
+            warn(`Could not accept ${d.id}: ${(accept.errors ?? []).join("; ")}`);
+          }
+        } else {
+          const reject = await callTool(opts.cwd, "dr_reject_decision", {
+            id: d.id,
+            reason: "Skeptic review blocked; not overridden.",
+            sign_off_by: "human",
+            sign_off_actor: "cli-user",
+          });
+          if (reject.ok) {
+            warn(`Rejected ${d.id}.`);
+          } else {
+            warn(`Could not reject ${d.id}: ${(reject.errors ?? []).join("; ")}`);
+          }
+        }
+      } else {
+        const accept = await callTool(opts.cwd, "dr_accept_decision", {
+          id: d.id,
+          sign_off_by: "human",
+          sign_off_actor: "cli-user",
+          sign_off_notes: `All ${ALL_LENSES.length} lens reviews passed.`,
+        });
+        if (accept.ok) {
+          success(`Accepted ${d.id}.`);
+        } else {
+          warn(`Could not accept ${d.id}: ${(accept.errors ?? []).join("; ")}`);
+        }
+      }
+    }
+  }
+
+  const status = await getStatus(opts.cwd);
+  const failures = realGateFailures(status);
+  if (failures.length > 0) {
+    warn("Deciding gate still failing:");
+    for (const r of failures) bullet(r);
+    return { exitCode: 1 };
+  }
+  return advancePhase(opts, "deciding → decomposing", needsHumanSignoffFor(status, "decomposing"));
+}
+
+/**
+ * Run the decomposing phase: let the decomposer agent build the task graph,
+ * report whether the graph validated, print the agent's summary, then advance
+ * to handing-off if the gate has no real failures.
+ *
+ * @param opts Orchestrator options.
+ * @returns Exit code 1 when the gate still has real failures, otherwise the
+ *   result of the advance attempt.
+ */
+async function advanceDecomposing(opts: OrchestratorOptions): Promise<{ exitCode: number }> {
+  header("Phase: Decomposing");
+  info("Running decomposer agent (building task graph)…");
+  const outcome = await runDecomposerAgent(opts.client, opts.config, opts.cwd, opts.verbose);
+  if (!outcome.validationPassed) {
+    warn(`Decomposer finished (${outcome.toolCallCount} tool calls) but graph did not validate.`);
+  } else {
+    success(`Decomposer finished (${outcome.toolCallCount} tool calls). Graph validates.`);
+  }
+  divider();
+  process.stderr.write(outcome.summary + "\n");
+  divider();
+
+  const current = await getStatus(opts.cwd);
+  const blockers = realGateFailures(current);
+  if (blockers.length === 0) {
+    const signoffRequired = needsHumanSignoffFor(current, "handing-off");
+    return advancePhase(opts, "decomposing → handing-off", signoffRequired);
+  }
+  warn("Decomposing gate still failing:");
+  for (const blocker of blockers) bullet(blocker);
+  return { exitCode: 1 };
+}
+
+/**
+ * Run the hand-off phase: render the artifacts, export the plan either to
+ * Linear or to the filesystem, then render again.
+ *
+ * Linear is offered only when `LINEAR_API_KEY` is set; the team id prompt is
+ * pre-filled from `LINEAR_TEAM_ID`, and a dry run is shown before the real
+ * push. Each export is preceded by a confirmation; declining leaves the
+ * project in 'handing-off' and returns exit code 0. A failure of the final
+ * re-render is reported as a warning and does not fail the hand-off.
+ *
+ * @param opts Orchestrator options.
+ * @returns Exit code 0 on success or user cancellation, 2 when the Linear
+ *   team id is missing, 1 when a render or export call fails.
+ */
+async function advanceHandoff(opts: OrchestratorOptions): Promise<{ exitCode: number }> {
+  header("Phase: Handoff");
+  info("Rendering Markdown + HTML artifacts…");
+  const renderRes = await callTool(opts.cwd, "dr_render", {});
+  if (!renderRes.ok) {
+    error(`Render failed: ${(renderRes.errors ?? []).join("; ")}`);
+    return { exitCode: 1 };
+  }
+  success("Artifacts rendered.");
+
+  const linearAvailable = Boolean(process.env.LINEAR_API_KEY);
+  let target: "linear" | "filesystem" = "filesystem";
+  if (linearAvailable) {
+    const wantsLinear = await confirm(
+      "LINEAR_API_KEY detected. Push the plan to Linear?",
+      opts,
+      true
+    );
+    target = wantsLinear ? "linear" : "filesystem";
+  }
+
+  if (target === "linear") {
+    const teamId = await ask(
+      "Linear team ID:",
+      opts,
+      process.env.LINEAR_TEAM_ID ?? ""
+    );
+    if (!teamId) {
+      error("Linear team ID is required.");
+      return { exitCode: 2 };
+    }
+    // Preview first so the user sees how many issues would be created.
+    info("Running dry-run preview…");
+    const dry = await callTool(opts.cwd, "dr_export_linear", {
+      team_id: teamId,
+      dry_run: true,
+    });
+    if (!dry.ok) {
+      error(`Linear dry-run failed: ${(dry.errors ?? []).join("; ")}`);
+      return { exitCode: 1 };
+    }
+    const totals = (dry.data as { totals: { issues: number; decisions: number; tasks: number } }).totals;
+    info(`Dry-run plan: ${totals.issues} issues (${totals.decisions} decisions + ${totals.tasks} tasks)`);
+    const proceed = await confirm("Push to Linear now?", opts, true);
+    if (!proceed) {
+      warn("Linear push cancelled. Project remains in 'handing-off'.");
+      return { exitCode: 0 };
+    }
+    const push = await callTool(opts.cwd, "dr_export_linear", {
+      team_id: teamId,
+      dry_run: false,
+      sign_off_by: "human",
+      sign_off_actor: "cli-user",
+    });
+    if (!push.ok) {
+      error(`Linear export failed: ${(push.errors ?? []).join("; ")}`);
+      return { exitCode: 1 };
+    }
+    const data = push.data as { linear_project: { url?: string }; issues_created: number };
+    success(`Pushed ${data.issues_created} issues to Linear.`);
+    if (data.linear_project.url) info(`Project URL: ${data.linear_project.url}`);
+  } else {
+    const proceed = await confirm("Finalize plan to filesystem?", opts, true);
+    if (!proceed) {
+      warn("Filesystem export cancelled. Project remains in 'handing-off'.");
+      return { exitCode: 0 };
+    }
+    const exportRes = await callTool(opts.cwd, "dr_export_filesystem", {
+      sign_off_by: "human",
+      sign_off_actor: "cli-user",
+    });
+    if (!exportRes.ok) {
+      error(`Filesystem export failed: ${(exportRes.errors ?? []).join("; ")}`);
+      return { exitCode: 1 };
+    }
+    success("Plan finalized to filesystem.");
+  }
+  // Re-render so artifacts reflect the final 'handed-off' state. The export
+  // itself already succeeded, so a failure here is only worth a warning.
+  const finalRender = await callTool(opts.cwd, "dr_render", {});
+  if (!finalRender.ok) {
+    warn(`Final re-render failed: ${(finalRender.errors ?? []).join("; ")}`);
+  }
+  return { exitCode: 0 };
+}
+
+/**
+ * Move the project to its next phase via `dr_advance`.
+ *
+ * When human sign-off is needed, the user is asked to confirm first;
+ * declining stops with exit code 0 and no tool call. On confirmation the
+ * sign-off fields are sent along with the advance request.
+ *
+ * @param opts Orchestrator options.
+ * @param label Human-readable transition name used in messages.
+ * @param needsHumanSignoff Whether to ask for confirmation and send sign-off fields.
+ * @returns Exit code 0 when advanced or declined, 1 when `dr_advance` failed.
+ */
+async function advancePhase(
+  opts: OrchestratorOptions,
+  label: string,
+  needsHumanSignoff: boolean
+): Promise<{ exitCode: number }> {
+  let advanceArgs: Record<string, unknown> = {};
+  if (needsHumanSignoff) {
+    divider();
+    info(`Next transition (${label}) requires human sign-off.`);
+    const approved = await confirm("Advance?", opts, true);
+    if (!approved) {
+      warn(`Halting before ${label}. Re-run to resume.`);
+      return { exitCode: 0 };
+    }
+    advanceArgs = { sign_off_by: "human", sign_off_actor: "cli-user" };
+  }
+
+  const outcome = await callTool(opts.cwd, "dr_advance", advanceArgs);
+  if (outcome.ok) {
+    success(`Advanced: ${label}`);
+    return { exitCode: 0 };
+  }
+  error(`dr_advance failed for ${label}:`);
+  for (const reason of outcome.errors ?? []) bullet(reason);
+  return { exitCode: 1 };
+}
+
+/**
+ * Tell whether entering `nextPhase` is listed as requiring human sign-off in
+ * the status's effective gate configuration.
+ */
+function needsHumanSignoffFor(status: StatusData, nextPhase: string): boolean {
+  const { require_human_signoff_phases: gatedPhases } = status.effective_gate_config;
+  return gatedPhases.includes(nextPhase);
+}
+
+/**
+ * Collect the gate-failure reasons that are genuine blockers.
+ *
+ * Reasons starting with "Sign-off gate:" are left out: dr_status is called
+ * without sign-off context, and the orchestrator gathers sign-off itself in
+ * advancePhase, so a sign-off-only failure does not block the transition.
+ */
+function realGateFailures(status: StatusData): string[] {
+  const blockers: string[] = [];
+  for (const reason of status.gate_to_next.reasons) {
+    if (reason.startsWith("Sign-off gate:")) continue;
+    blockers.push(reason);
+  }
+  return blockers;
+}
+
+/** Shape of the `dr_status` tool payload as consumed by the orchestrator. */
+interface StatusData {
+  /** Current phase and the phase that follows it (`null` when there is none). */
+  state: { phase: string; next_phase: string | null };
+  /** Gate evaluation for moving to the next phase; `reasons` lists what is failing. */
+  gate_to_next: { pass: boolean; reasons: string[]; next_phase: string | null };
+  effective_gate_config: {
+    /** Phases whose entry requires human sign-off. */
+    require_human_signoff_phases: string[];
+    [k: string]: unknown;
+  };
+  // NOTE(review): presumably the number of decisions and tasks in the project;
+  // not read anywhere in this file — confirm against the dr_status tool.
+  counts: { decisions: number; tasks: number };
+}
+
+/**
+ * Fetch the project status via the `dr_status` tool.
+ *
+ * @param cwd Project directory.
+ * @returns The status payload.
+ * @throws Error carrying the tool's error messages when the call is not ok.
+ */
+async function getStatus(cwd: string): Promise<StatusData> {
+  const { ok, data, errors } = await callTool(cwd, "dr_status", {});
+  if (ok) return data as StatusData;
+  throw new Error(`dr_status failed: ${(errors ?? []).join("; ")}`);
+}
+
+/**
+ * List the decisions that are in the given status via `dr_list_decisions`.
+ *
+ * @param cwd Project directory.
+ * @param status Decision status to filter by.
+ * @returns The matching decisions' ids and titles; an empty list when the
+ *   tool call is not ok or the payload has no `decisions` field.
+ */
+async function listDecisions(
+  cwd: string,
+  status: "rfc" | "proposed" | "accepted" | "rejected" | "deprecated" | "superseded"
+): Promise<{ id: string; title: string }[]> {
+  const response = await callTool(cwd, "dr_list_decisions", { status: [status] });
+  if (!response.ok) return [];
+  const payload = response.data as { decisions?: { id: string; title: string }[] };
+  return payload.decisions ?? [];
+}
+
+/**
+ * Invoke a tool by name for the project at `cwd`.
+ *
+ * `cwd` is added to the tool arguments (overriding any `cwd` already in
+ * `args`) and is also supplied as the invocation context.
+ *
+ * @param cwd Project directory.
+ * @param name Tool name.
+ * @param args Tool arguments, without `cwd`.
+ * @returns The tool's result envelope.
+ */
+async function callTool(
+  cwd: string,
+  name: string,
+  args: Record<string, unknown>
+): Promise<{ ok: boolean; data?: unknown; errors?: string[] }> {
+  const toolArgs = { ...args, cwd };
+  const context = { cwd };
+  return executeAgentTool(name, toolArgs, context);
+}
diff --git a/server/src/cli/prd.ts b/server/src/cli/prd.ts
new file mode 100644
index 0000000..930f54c
--- /dev/null
+++ b/server/src/cli/prd.ts
@@ -0,0 +1,36 @@
+import { readFile } from "node:fs/promises";
+
+/** A PRD file's raw text plus the title/description hints extracted from it by `digest`. */
+export interface PRDDigest {
+  /** Raw PRD content. */
+  raw: string;
+  /** First H1 if present — used as a title hint. */
+  title_hint?: string;
+  /** First non-heading paragraph, capped at 800 characters — used as a description hint. */
+  description_hint?: string;
+}
+
+/**
+ * Read a PRD file as UTF-8 text and digest it.
+ *
+ * @param path Path of the PRD file.
+ * @returns The raw text together with the extracted hints.
+ * @throws Whatever `readFile` rejects with when the file cannot be read.
+ */
+export async function readPRD(path: string): Promise<PRDDigest> {
+  const contents = await readFile(path, { encoding: "utf8" });
+  return digest(contents);
+}
+
+/**
+ * Extract a title hint and a description hint from Markdown PRD text.
+ *
+ * - Title: the text of the first level-1 heading (`# Title`).
+ * - Description: the first paragraph of body text, capped at 800 characters
+ *   (with a trailing "…" when cut). Heading lines at the top of a paragraph
+ *   block are stripped rather than discarding the whole block, so text
+ *   written directly under a heading with no blank line in between is still
+ *   found. Only real ATX headings (1-6 `#` followed by whitespace or end of
+ *   line) count as headings, so a paragraph such as "#1 priority…" is kept.
+ *
+ * Hints that cannot be found are omitted from the result.
+ *
+ * @param raw Raw PRD content.
+ * @returns The raw text plus whichever hints were found.
+ */
+export function digest(raw: string): PRDDigest {
+  let title_hint: string | undefined;
+  for (const line of raw.split("\n")) {
+    // Trimming also removes the "\r" left over from CRLF line endings.
+    const match = /^#\s+(.+)$/.exec(line.trim());
+    if (match) {
+      title_hint = match[1];
+      break;
+    }
+  }
+
+  // Paragraph blocks are separated by one or more blank lines.
+  let description_hint: string | undefined;
+  const blocks = raw.split(/\n\s*\n/).map((b) => b.trim()).filter((b) => b.length > 0);
+  for (const block of blocks) {
+    const blockLines = block.split("\n");
+    const firstBodyLine = blockLines.findIndex((l) => !isHeadingLine(l));
+    if (firstBodyLine === -1) continue; // block consists of headings only
+    const body = blockLines.slice(firstBodyLine).join("\n").trim();
+    if (body.length === 0) continue;
+    description_hint = body.length > 800 ? body.slice(0, 800) + "…" : body;
+    break;
+  }
+  return { raw, ...(title_hint && { title_hint }), ...(description_hint && { description_hint }) };
+}
+
+/** True when the line is a Markdown ATX heading: 1-6 `#` then whitespace or end of line. */
+function isHeadingLine(line: string): boolean {
+  return /^#{1,6}(?:\s|$)/.test(line.trim());
+}
diff --git a/server/src/llm/agent.ts b/server/src/llm/agent.ts
new file mode 100644
index 0000000..931ccbc
--- /dev/null
+++ b/server/src/llm/agent.ts
@@ -0,0 +1,161 @@
+import OpenAI from "openai";
+import { LLMConfig } from "./client.js";
+import {
+  executeAgentTool,
+  listOpenAITools,
+  ToolFilter,
+  ToolInvocationContext,
+} from "./tools.js";
+import { log } from "../log.js";
+
+export interface AgentOptions {
+  client: OpenAI; // client whose chat.completions.create is called each iteration
+  config: LLMConfig; // supplies model, maxTokens and temperature for every request
+  system: string; // system prompt placed first in a new conversation
+  toolFilter?: ToolFilter; // restricts which registered tools are offered to the model
+  toolContext: ToolInvocationContext; // passed to every tool execution
+  /** Max tool-use iterations before giving up. Defaults to 32. */
+  maxIterations?: number;
+  /** Write assistant text, tool-call arguments and truncated tool results to stderr. */
+  verbose?: boolean;
+}
+
+export interface AgentTurn {
+  /** Content of the last assistant message, or a bracketed placeholder when the loop gave up. */
+  text: string;
+  /** Tool calls executed during the loop, in order, each with its JSON-serialized result. */
+  toolCalls: { name: string; args: Record<string, unknown>; resultText: string }[];
+  /** Reason the loop terminated; "refusal" is reported for a content_filter finish. */
+  stopReason: "end_turn" | "max_iterations" | "refusal" | "length";
+  /** Number of completion requests made (equals the iteration cap on "max_iterations"). */
+  iterations: number;
+  /** Token usage summed over every completion that reported usage. */
+  usage: { prompt: number; completion: number };
+}
+
+/**
+ * Start a fresh conversation (system prompt followed by one user message) and
+ * run the tool-using agent loop on it until that loop returns.
+ */
+export async function runAgentTurn(
+  options: AgentOptions,
+  userMessage: string
+): Promise<AgentTurn> {
+  const system: OpenAI.Chat.ChatCompletionMessageParam = { role: "system", content: options.system };
+  return runAgentLoop(options, [system, { role: "user", content: userMessage }]);
+}
+
+/** Append `userMessage` as a user turn to `messages` — the caller's array is
+ *  mutated in place — then run the agent loop over that same array. */
+export async function continueAgentConversation(
+  options: AgentOptions, messages: OpenAI.Chat.ChatCompletionMessageParam[], userMessage: string
+): Promise<AgentTurn> {
+  const next: OpenAI.Chat.ChatCompletionMessageParam = { role: "user", content: userMessage };
+  messages.push(next);
+  return runAgentLoop(options, messages);
+}
+
+/**
+ * Drive the completion/tool loop over `messages`, which is mutated in place.
+ *
+ * Each iteration requests one completion, appends the assistant message, then
+ * either stops (truncated output, content filter, or no tool calls) or runs
+ * every requested tool and appends its JSON result as a `tool` message.
+ * Every tool call receives a reply — unsupported or unexecuted ones get an
+ * `ok: false` placeholder — because the chat API rejects a transcript in which
+ * an assistant `tool_calls` entry has no matching `tool` message.
+ *
+ * @param options  Client, model settings, tool filter/context and loop limits.
+ * @param messages Conversation so far; new messages are pushed onto this array.
+ * @returns Final text, executed tool calls, stop reason, iteration count and usage.
+ * @throws Error when a completion contains no choices.
+ */
+async function runAgentLoop(
+  options: AgentOptions,
+  messages: OpenAI.Chat.ChatCompletionMessageParam[]
+): Promise<AgentTurn> {
+  const tools = listOpenAITools(options.toolFilter);
+  const maxIter = options.maxIterations ?? 32;
+  const toolCalls: AgentTurn["toolCalls"] = [];
+  const usage = { prompt: 0, completion: 0 };
+
+  // Assemble the result around the shared accumulators.
+  const finish = (text: string, stopReason: AgentTurn["stopReason"], iterations: number): AgentTurn => ({
+    text, toolCalls, stopReason, iterations, usage,
+  });
+  // Answer a tool call that will not be executed.
+  const refuse = (id: string, why: string): void => {
+    messages.push({ role: "tool", tool_call_id: id, content: JSON.stringify({ ok: false, errors: [why] }) });
+  };
+
+  for (let i = 0; i < maxIter; i++) {
+    const completion = await options.client.chat.completions.create({
+      model: options.config.model,
+      messages,
+      tools: tools.length > 0 ? tools : undefined,
+      max_tokens: options.config.maxTokens,
+      temperature: options.config.temperature,
+    });
+    if (completion.usage) {
+      usage.prompt += completion.usage.prompt_tokens;
+      usage.completion += completion.usage.completion_tokens;
+    }
+    const choice = completion.choices[0];
+    if (!choice) {
+      throw new Error("LLM returned no choices");
+    }
+    const msg = choice.message;
+    // Record the assistant turn before acting on it.
+    messages.push(msg as OpenAI.Chat.ChatCompletionMessageParam);
+    if (options.verbose && msg.content) {
+      process.stderr.write(`\n[agent] ${msg.content}\n`);
+    }
+
+    const calls = msg.tool_calls ?? [];
+    const reason = choice.finish_reason;
+    if (reason === "length" || reason === "content_filter") {
+      // Stopping early: reply to any requested calls so the transcript stays usable.
+      for (const call of calls) refuse(call.id, `Tool call not executed (finish_reason=${reason})`);
+      return reason === "length"
+        ? finish(msg.content ?? "", "length", i + 1)
+        : finish(msg.content ?? "[content filtered]", "refusal", i + 1);
+    }
+    if (calls.length === 0) {
+      return finish(msg.content ?? "", "end_turn", i + 1);
+    }
+
+    // Execute the requested tools in the order the model listed them.
+    for (const call of calls) {
+      const callId = call.id; // read before the `type` check narrows `call`
+      if (call.type !== "function") {
+        refuse(callId, "Unsupported tool call type");
+        continue;
+      }
+      const name = call.function.name;
+      const argsStr = call.function.arguments;
+      if (options.verbose) {
+        process.stderr.write(`[agent→${name}] ${argsStr}\n`);
+      }
+      const result = await executeAgentTool(name, argsStr, options.toolContext);
+      const resultText = JSON.stringify(result, null, 2);
+      toolCalls.push({ name, args: safeJson(argsStr), resultText });
+      messages.push({ role: "tool", tool_call_id: callId, content: resultText });
+      if (options.verbose) {
+        const head = resultText.length > 300 ? resultText.slice(0, 300) + "…" : resultText;
+        process.stderr.write(`[tool→${name}] ${head}\n`);
+      }
+    }
+  }
+
+  // The model kept requesting tools until the iteration cap was reached.
+  log.warn(`Agent loop hit max_iterations=${maxIter} without ending`);
+  return finish("[agent stopped: max iterations reached]", "max_iterations", maxIter);
+}
+
+/** Parse `s` as a JSON object; invalid JSON, arrays, primitives and null all become `{ _raw: s }`. */
+function safeJson(s: string): Record<string, unknown> {
+  let parsed: unknown;
+  try { parsed = JSON.parse(s); } catch { parsed = undefined; }
+  const isPlainObject = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
+  return isPlainObject ? (parsed as Record<string, unknown>) : { _raw: s };
+}
diff --git a/server/src/llm/client.ts b/server/src/llm/client.ts
new file mode 100644
index 0000000..ac71d27
--- /dev/null
+++ b/server/src/llm/client.ts
@@ -0,0 +1,34 @@
+import OpenAI from "openai";
+
+export interface LLMConfig {
+  apiKey?: string; // handed to the OpenAI client constructor
+  baseURL?: string; // alternative endpoint, handed to the OpenAI client constructor
+  model: string; // model name sent with each completion request
+  maxTokens?: number; // sent as `max_tokens` on each completion request
+  temperature?: number; // sent as `temperature` on each completion request
+}
+
+/**
+ * Build an LLMConfig from explicit overrides, falling back to OPENAI_API_KEY,
+ * OPENAI_BASE_URL and OPENAI_MODEL (default model "gpt-4o"). Empty strings count as unset.
+ * @throws Error when no API key is available from either source.
+ */
+export function resolveConfig(overrides: Partial<LLMConfig> = {}): LLMConfig {
+  // `||` rather than `??`: a value exported as "" must not shadow the fallback.
+  const apiKey = overrides.apiKey || process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    throw new Error(
+      "OPENAI_API_KEY is required (or pass --api-key). Set OPENAI_BASE_URL for non-default endpoints (Ollama, vLLM, OpenRouter, LiteLLM, etc.)."
+    );
+  }
+  const baseURL = overrides.baseURL || process.env.OPENAI_BASE_URL || undefined;
+  const model = overrides.model || process.env.OPENAI_MODEL || "gpt-4o";
+  return { apiKey, baseURL, model, maxTokens: overrides.maxTokens, temperature: overrides.temperature };
+}
+
+/** Construct an OpenAI SDK client from the config's `apiKey` and `baseURL`. */
+export function makeClient(config: LLMConfig): OpenAI {
+  const { apiKey, baseURL } = config;
+  const client = new OpenAI({ apiKey, baseURL });
+  return client;
+}
diff --git a/server/src/llm/tools.ts b/server/src/llm/tools.ts
new file mode 100644
index 0000000..431bfee
--- /dev/null
+++ b/server/src/llm/tools.ts
@@ -0,0 +1,94 @@
+import { getTool, listTools } from "../tools/registry.js";
+import { zodToJsonSchema } from "../jsonSchema.js";
+import { z } from "zod";
+import OpenAI from "openai";
+
+export interface ToolFilter {
+  /** If set, only tools whose name is in this list are exposed (an empty list exposes none). */
+  include?: string[];
+  /** If set, tools whose name is in this list are hidden; applied after `include`. */
+  exclude?: string[];
+}
+
+/** Convert every registered tool that passes `filter` into an OpenAI function-tool
+ *  definition whose `parameters` is the JSON Schema of the tool's input schema. */
+export function listOpenAITools(filter: ToolFilter = {}): OpenAI.Chat.ChatCompletionTool[] {
+  const { include, exclude } = filter;
+  const exposed: OpenAI.Chat.ChatCompletionTool[] = [];
+  for (const t of listTools()) {
+    if (include && !include.includes(t.name)) continue;
+    if (exclude && exclude.includes(t.name)) continue;
+    const parameters = zodToJsonSchema(t.inputSchema) as Record<string, unknown>;
+    exposed.push({ type: "function", function: { name: t.name, description: t.description, parameters } });
+  }
+  return exposed;
+}
+
+export interface ToolInvocationContext {
+  /** Target project cwd. Injected into a tool call only when the tool's schema has `cwd` and the arguments lack it. */
+  cwd: string;
+}
+
+export interface ToolCallResult {
+  ok: boolean; // false for unknown tools, unparseable arguments, validation failures and thrown errors
+  data?: unknown; // tool-specific payload — its shape is not defined in this module
+  errors?: string[]; // human-readable messages accompanying `ok: false`
+  warnings?: string[]; // NOTE(review): never set in this module; presumably produced by tool handlers
+}
+
+/**
+ * Execute a registered tool by name with the agent's chosen input.
+ *
+ * `rawArgs` may be a JSON string (as produced by the model) or an already-parsed
+ * object; the object form is shallow-copied so the caller's value is not mutated.
+ * When the tool's input schema is an object schema with a `cwd` key and the
+ * arguments carry none, `ctx.cwd` is injected.
+ *
+ * Unknown tools, unparseable or non-object arguments, schema violations and
+ * handler exceptions are all returned as `ok: false` so the agent can recover.
+ */
+export async function executeAgentTool(
+  name: string,
+  rawArgs: string | Record<string, unknown>,
+  ctx: ToolInvocationContext
+): Promise<ToolCallResult> {
+  const tool = getTool(name);
+  if (!tool) return { ok: false, errors: [`Unknown tool: ${name}`] };
+  let parsed: unknown;
+  try {
+    // Treat an empty argument string as `{}` so zero-argument calls still validate.
+    parsed = typeof rawArgs !== "string" ? rawArgs : rawArgs.trim() === "" ? {} : JSON.parse(rawArgs);
+  } catch (err) {
+    const detail = err instanceof Error ? err.message : String(err);
+    return { ok: false, errors: [`Failed to parse tool arguments as JSON: ${detail}`] };
+  }
+  // Valid JSON may still be null or a primitive (the `in` check below would throw) or an array.
+  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+    return { ok: false, errors: ["Tool arguments must be a JSON object"] };
+  }
+  const args: Record<string, unknown> = { ...(parsed as Record<string, unknown>) };
+
+  // Inject cwd automatically when the tool has a `cwd` field in its schema
+  // and the agent didn't pass one.
+  if (toolAcceptsCwd(tool.inputSchema) && !("cwd" in args)) {
+    args.cwd = ctx.cwd;
+  }
+
+  try {
+    const validated = tool.inputSchema.parse(args);
+    return (await tool.handler(validated)) as ToolCallResult;
+  } catch (err) {
+    if (err instanceof z.ZodError) {
+      // `issues` is ZodError's list of validation problems.
+      return { ok: false, errors: err.issues.map((e) => `${e.path.join(".") || "(root)"}: ${e.message}`) };
+    }
+    return { ok: false, errors: [err instanceof Error ? err.message : String(err)] };
+  }
+}
+
+/** Report whether `schema` is a Zod object schema (by `_def.typeName`) whose shape has a `cwd` key. */
+function toolAcceptsCwd(schema: z.ZodTypeAny): boolean {
+  const { typeName } = (schema as unknown as { _def: { typeName: string } })._def;
+  if (typeName !== "ZodObject") return false;
+  return "cwd" in (schema as z.ZodObject<z.ZodRawShape>).shape;
+}
diff --git a/server/src/schemas/index.ts b/server/src/schemas/index.ts
index 9e0acb5..9fc5fdb 100644
--- a/server/src/schemas/index.ts
+++ b/server/src/schemas/index.ts
@@ -254,7 +254,7 @@ export const GateFailureSchema = z.object({
 export type GateFailure = z.infer<typeof GateFailureSchema>;
 
 export const PipelineStateSchema = z.object({
-  schema_version: z.string(),
+  schema_version: z.string().regex(/^[0-9]+\.[0-9]+\.[0-9]+$/, "must be semver"),
   project_id: SlugSchema,
   phase: PhaseSchema,
   effective_gate_config: EffectiveGateConfigSchema,
diff --git a/server/tests/flow-poc-pipeline.test.ts b/server/tests/flow-poc-pipeline.test.ts
new file mode 100644
index 0000000..5203f4a
--- /dev/null
+++ b/server/tests/flow-poc-pipeline.test.ts
@@ -0,0 +1,406 @@
+import { describe, it, before, after } from "node:test";
+import assert from "node:assert/strict";
+import { existsSync, readFileSync, readdirSync } from "node:fs";
+import { join } from "node:path";
+import { makeTmpProject } from "./helpers/tmp-project.js";
+import { makeMockOpenAI, ScriptedResponse } from "./helpers/mock-openai.js";
+import { registerAllTools } from "../src/tools/index.js";
+import { runPipeline } from "../src/cli/orchestrator.js";
+
+/**
+ * End-to-end pipeline test using a scripted mock LLM.
+ *
+ * This test drives the full intake → scoping → deciding → decomposing → handoff
+ * flow without any real API calls. The mock LLM is told exactly what tool calls to
+ * make at each phase, and we assert the artifacts on disk match expectations.
+ */
+describe("Flow: POC happy path (mock LLM)", () => {
+  let toolsRegistered = false;
+
+  before(() => {
+    if (!toolsRegistered) {
+      registerAllTools();
+      toolsRegistered = true;
+    }
+  });
+
+  it("runs intake → scoping → deciding → decomposing → handoff (filesystem)", async () => {
+    const project = makeTmpProject("dr-flow-poc-");
+    try {
+      const script: ScriptedResponse[] = [
+        // ── Scoping agent ──────────────────────────────────────────────
+        // Turn 1: read status
+        { toolCalls: [{ name: "dr_status", args: {} }] },
+        // Turn 2: set scope
+        {
+          toolCalls: [
+            {
+              name: "dr_update_scope",
+              args: {
+                in_scope: ["thing A", "thing B"],
+                success_criteria: ["it works", "it ships"],
+                out_of_scope: ["far-future feature"],
+                nice_to_have: [],
+              },
+            },
+          ],
+        },
+        // Turn 3: final summary
+        { text: "Scope set. in_scope: A, B. success: it works, it ships." },
+
+        // ── Deciding agent ─────────────────────────────────────────────
+        // Turn 1: read status
+        { toolCalls: [{ name: "dr_status", args: {} }] },
+        // Turn 2: search seeds
+        { toolCalls: [{ name: "dr_seed_search", args: { query: "language" } }] },
+        // Turn 3: load seed
+        {
+          toolCalls: [{ name: "dr_seed_load", args: { seed_name: "language-choice" } }],
+        },
+        // Turn 4: pick a position + argument
+        {
+          toolCalls: [
+            {
+              name: "dr_update_decision",
+              args: {
+                id: "0001-choose-the-primary-implementation-language",
+                selected_position: "TypeScript",
+                argument: "Team has deep TS expertise and the project is web-facing.",
+              },
+            },
+          ],
+        },
+        // Turn 5: final summary
+        { text: "Decided: 0001-* → TypeScript." },
+
+        // ── Skeptic (5 lenses × 1 decision = 5 invocations × 2 turns each) ──
+        // Each skeptic invocation: 1 review tool call + 1 summary
+        // operational
+        {
+          toolCalls: [
+            {
+              name: "dr_review_decision",
+              args: {
+                id: "0001-choose-the-primary-implementation-language",
+                reviewer: "dr-skeptic",
+                lens: "operational",
+                verdict: "pass",
+                score: 4,
+                concerns: [],
+              },
+            },
+          ],
+        },
+        { text: "Operational review: pass (4/5)." },
+        // strategic
+        {
+          toolCalls: [
+            {
+              name: "dr_review_decision",
+              args: {
+                id: "0001-choose-the-primary-implementation-language",
+                reviewer: "dr-skeptic",
+                lens: "strategic",
+                verdict: "pass",
+                score: 4,
+                concerns: [],
+              },
+            },
+          ],
+        },
+        { text: "Strategic review: pass." },
+        // security
+        {
+          toolCalls: [
+            {
+              name: "dr_review_decision",
+              args: {
+                id: "0001-choose-the-primary-implementation-language",
+                reviewer: "dr-skeptic",
+                lens: "security",
+                verdict: "pass",
+                score: 5,
+                concerns: [],
+              },
+            },
+          ],
+        },
+        { text: "Security review: pass." },
+        // cost
+        {
+          toolCalls: [
+            {
+              name: "dr_review_decision",
+              args: {
+                id: "0001-choose-the-primary-implementation-language",
+                reviewer: "dr-skeptic",
+                lens: "cost",
+                verdict: "pass",
+                score: 4,
+                concerns: [],
+              },
+            },
+          ],
+        },
+        { text: "Cost review: pass." },
+        // user-impact
+        {
+          toolCalls: [
+            {
+              name: "dr_review_decision",
+              args: {
+                id: "0001-choose-the-primary-implementation-language",
+                reviewer: "dr-skeptic",
+                lens: "user-impact",
+                verdict: "pass",
+                score: 5,
+                concerns: [],
+              },
+            },
+          ],
+        },
+        { text: "User-impact review: pass." },
+
+        // ── Decomposer agent ───────────────────────────────────────────
+        { toolCalls: [{ name: "dr_status", args: {} }] }, // Turn 1: read status
+        { toolCalls: [{ name: "dr_list_decisions", args: { status: ["accepted"] } }] }, // Turn 2: list accepted decisions
+        {
+          toolCalls: [
+            {
+              name: "dr_propose_task",
+              args: {
+                title: "Bootstrap repository",
+                description: "Init repo, install deps, scaffold config.",
+                acceptance_criteria: ["repo initialized", "tsconfig in place"],
+                estimate: { unit: "hours", value: 2, confidence: "high" },
+                decision_refs: ["0001-choose-the-primary-implementation-language"],
+                priority: "p0",
+              },
+            },
+          ],
+        },
+        {
+          toolCalls: [
+            {
+              name: "dr_propose_task",
+              args: {
+                title: "Implement core feature",
+                description: "Build the main thing.",
+                acceptance_criteria: ["feature works", "tests pass"],
+                estimate: { unit: "hours", value: 6, confidence: "med" },
+                depends_on: ["T0001-bootstrap-repository"], // presumably the id assigned to the first proposed task — TODO confirm
+                decision_refs: ["0001-choose-the-primary-implementation-language"],
+                priority: "p0",
+              },
+            },
+          ],
+        },
+        {
+          toolCalls: [
+            {
+              name: "dr_propose_task",
+              args: {
+                title: "Ship and document",
+                description: "Build artifact and write README.",
+                acceptance_criteria: ["binary built", "README complete"],
+                estimate: { unit: "hours", value: 2, confidence: "high" },
+                depends_on: ["T0002-implement-core-feature"],
+                decision_refs: ["0001-choose-the-primary-implementation-language"],
+                priority: "p1",
+              },
+            },
+          ],
+        },
+        { toolCalls: [{ name: "dr_validate_graph", args: {} }] }, // Turn 6: validate the task graph
+        { text: "3 tasks: bootstrap → implement → ship. Graph validates." }, // Turn 7: final summary
+      ];
+
+      const client = makeMockOpenAI(script);
+
+      const outcome = await runPipeline(
+        {
+          cwd: project.cwd,
+          client,
+          config: { apiKey: "mock", model: "mock" },
+          autoYes: true,
+          verbose: false,
+        },
+        {
+          title: "Flow POC Test",
+          description: "A test project for the flow harness.",
+          effortLevel: "poc",
+          prd: null,
+          resume: false,
+        }
+      );
+
+      assert.equal(outcome.exitCode, 0, "pipeline should exit cleanly");
+      assert.equal(outcome.finalPhase, "handed-off", "should reach handed-off");
+
+      // Artifacts on disk
+      assert.ok(project.exists("dr/project.json"), "project.json exists");
+      assert.ok(project.exists(".dr/state.json"), "state.json exists");
+      assert.ok(project.exists("dr/index.html"), "index.html rendered");
+
+      const projectJson = project.readJson<{
+        status: string;
+        handoff?: { target: string };
+        scope?: { in_scope: string[] };
+      }>("dr/project.json");
+      assert.equal(projectJson.status, "handed-off");
+      assert.equal(projectJson.handoff?.target, "filesystem");
+      assert.deepEqual(projectJson.scope?.in_scope, ["thing A", "thing B"]);
+
+      const decisions = project.list("dr/decisions").filter((f) => f.endsWith(".json"));
+      assert.equal(decisions.length, 1, "exactly one decision");
+      const decision = project.readJson<{ status: string; review: unknown[] }>(
+        join("dr/decisions", decisions[0]!)
+      );
+      assert.equal(decision.status, "accepted");
+      assert.equal(decision.review.length, 5, "5 lens reviews recorded");
+
+      const tasks = project.list("dr/tasks").filter((f) => f.endsWith(".json"));
+      assert.equal(tasks.length, 3, "three tasks");
+
+      // Event log — verify all major lifecycle events were captured.
+      // Note: this test uses a seed-loaded decision, which emits 'seed_loaded'
+      // instead of 'decision_proposed'.
+      const events = project.events();
+      const kinds = new Set(events.map((e) => e.kind as string));
+      assert.ok(kinds.has("project_initialized"), "project_initialized event");
+      assert.ok(kinds.has("scope_updated"), "scope_updated event");
+      assert.ok(kinds.has("seed_loaded"), "seed_loaded event (seed-instantiated DR)");
+      assert.ok(kinds.has("decision_reviewed"), "decision_reviewed event");
+      assert.ok(kinds.has("decision_accepted"), "decision_accepted event");
+      assert.ok(kinds.has("task_proposed"), "task_proposed event");
+      assert.ok(kinds.has("export_completed"), "export_completed event");
+      assert.ok(kinds.has("phase_advanced"), "phase_advanced event");
+
+      // Index HTML sanity
+      const html = readFileSync(join(project.cwd, "dr/index.html"), "utf8");
+      assert.ok(html.includes("Flow POC Test"));
+      assert.ok(html.includes("handed-off"));
+    } finally {
+      project.dispose();
+    }
+  });
+
+  it("rejects a decision when skeptic blocks and no override given", async () => {
+    const project = makeTmpProject("dr-flow-block-");
+    try {
+      // Pre-initialize via direct tool calls so we land mid-pipeline quickly.
+      const { executeAgentTool } = await import("../src/llm/tools.js");
+      await executeAgentTool(
+        "dr_init",
+        { title: "Block Test", description: "test", effort_level: "poc" },
+        { cwd: project.cwd }
+      );
+      await executeAgentTool("dr_advance", {}, { cwd: project.cwd });
+      await executeAgentTool(
+        "dr_update_scope",
+        { in_scope: ["x"], success_criteria: ["y"] },
+        { cwd: project.cwd }
+      );
+      await executeAgentTool("dr_advance", {}, { cwd: project.cwd });
+
+      const script: ScriptedResponse[] = [
+        // Deciding agent
+        { toolCalls: [{ name: "dr_status", args: {} }] },
+        {
+          toolCalls: [
+            {
+              name: "dr_propose_decision",
+              args: {
+                title: "Pick a thing",
+                issue: "We need to pick a thing.",
+                positions: [{ title: "A" }, { title: "B" }],
+              },
+            },
+          ],
+        },
+        {
+          toolCalls: [
+            {
+              name: "dr_update_decision",
+              args: { id: "0001-pick-a-thing", selected_position: "A", argument: "Because A." },
+            },
+          ],
+        },
+        { text: "Decided A." },
+
+        // 5 skeptic reviews — first one blocks
+        {
+          toolCalls: [
+            {
+              name: "dr_review_decision",
+              args: {
+                id: "0001-pick-a-thing",
+                reviewer: "dr-skeptic",
+                lens: "operational",
+                verdict: "block",
+                score: 2,
+                concerns: ["this would burn the team out"],
+              },
+            },
+          ],
+        },
+        { text: "Operational: block." },
+        // The remaining four lenses still run; each is scripted under its own lens name.
+        ...(["strategic", "security", "cost", "user-impact"] as const).flatMap((lens) => [
+          {
+            toolCalls: [
+              {
+                name: "dr_review_decision",
+                args: {
+                  id: "0001-pick-a-thing",
+                  reviewer: "dr-skeptic",
+                  lens,
+                  verdict: "pass",
+                  score: 3,
+                  concerns: [],
+                },
+              },
+            ],
+          },
+          { text: "pass." },
+        ]),
+        // After rejection, the orchestrator advances to decomposing (poc min_decisions=0).
+        // Script the decomposer to do nothing — gate fails on min_tasks, pipeline returns 1.
+        { toolCalls: [{ name: "dr_status", args: {} }] },
+        { toolCalls: [{ name: "dr_list_decisions", args: { status: ["accepted"] } }] },
+        { text: "No accepted decisions; producing no tasks." },
+      ];
+
+      const client = makeMockOpenAI(script);
+
+      // autoYes: true means the override prompt receives "" (fallback "reject"),
+      // so the orchestrator will reject the blocked decision.
+      const outcome = await runPipeline(
+        {
+          cwd: project.cwd,
+          client,
+          config: { apiKey: "mock", model: "mock" },
+          autoYes: true,
+          verbose: false,
+        },
+        { resume: true, prd: null }
+      );
+
+      // The poc preset has min_decisions=0, so the deciding gate may pass even with the only
+      // decision rejected; the exit code is therefore allowed to be 0 or 1 below. What this
+      // test pins down is that the blocked decision ends up in the 'rejected' state.
+      const listRes = await executeAgentTool(
+        "dr_list_decisions",
+        { status: ["rejected"] },
+        { cwd: project.cwd }
+      );
+      assert.ok(listRes.ok, `dr_list_decisions failed: ${(listRes.errors ?? []).join("; ")}`);
+      const rejected = (listRes.data as { decisions: { id: string }[] }).decisions;
+      assert.equal(rejected.length, 1, "the blocked decision should be rejected");
+      assert.equal(rejected[0]?.id, "0001-pick-a-thing");
+      assert.ok([0, 1].includes(outcome.exitCode), "pipeline should exit cleanly or stall");
+    } finally {
+      project.dispose();
+    }
+  });
+});
diff --git a/server/tests/helpers/index.ts b/server/tests/helpers/index.ts
new file mode 100644
index 0000000..18a8ead
--- /dev/null
+++ b/server/tests/helpers/index.ts
@@ -0,0 +1,2 @@
+export { McpClient, withMcp, type ToolResponse, type McpClientOptions } from "./mcp-client.js";
+export { makeTmpProject, withTmpProject, type TmpProject } from "./tmp-project.js";
diff --git a/server/tests/helpers/mcp-client.ts b/server/tests/helpers/mcp-client.ts
new file mode 100644
index 0000000..020d30e
--- /dev/null
+++ b/server/tests/helpers/mcp-client.ts
@@ -0,0 +1,194 @@
+import { spawn, ChildProcessWithoutNullStreams } from "node:child_process";
+import { resolve } from "node:path";
+
+interface PendingCall {
+  resolve: (value: ToolResponse) => void; // settles the caller's promise with the decoded response
+  reject: (error: Error) => void; // used for JSON-RPC errors, missing content and server exit
+  timeout: NodeJS.Timeout; // per-request timer; cleared once the request settles
+}
+
+export interface ToolResponse<T = unknown> {
+  ok: boolean; // success flag; callOk throws when false, callFail throws when true
+  data?: T; // payload on success (or the raw text when the server's reply was not JSON)
+  errors?: string[]; // failure messages; callFail returns these
+  warnings?: string[]; // NOTE(review): not read anywhere in this helper
+}
+
+export interface McpClientOptions {
+  /** Absolute path to the built server entrypoint. Defaults to dist/index.js three directories above this file's directory. */
+  serverPath?: string;
+  /** Per-call timeout in ms (also applied to initialize). Defaults to 8000. */
+  timeoutMs?: number;
+  /** Forward server stderr to parent (debugging). Defaults to false. */
+  verboseStderr?: boolean;
+  /** Environment for the spawned server. Merged over process.env, so these values win. */
+  env?: Record<string, string>;
+}
+
+// Built server entrypoint, resolved three directories above this file's directory.
+// `URL#pathname` is percent-encoded (a space becomes "%20"), so decode it before
+// handing it to `resolve`; otherwise a checkout path with spaces never matches.
+// NOTE(review): Windows drive-letter URLs would additionally need `fileURLToPath`.
+const DEFAULT_SERVER_PATH = resolve(
+  decodeURIComponent(new URL(".", import.meta.url).pathname),
+  "../../../dist/index.js"
+);
+
+/**
+ * Test-harness JSON-RPC client: spawns the server under `node` and exchanges
+ * newline-delimited JSON messages over the child's stdin/stdout.
+ */
+export class McpClient {
+  private proc: ChildProcessWithoutNullStreams;
+  private nextId = 1;
+  private pending = new Map<number, PendingCall>();
+  private buf = "";
+  private readonly timeoutMs: number;
+  private closed = false;
+  /** Id of the `initialize` request, whose result carries no tool content. */
+  private initId: number | undefined;
+
+  constructor(opts: McpClientOptions = {}) {
+    this.timeoutMs = opts.timeoutMs ?? 8000;
+    const serverPath = opts.serverPath ?? DEFAULT_SERVER_PATH;
+    this.proc = spawn("node", [serverPath], {
+      stdio: ["pipe", "pipe", "pipe"],
+      env: { ...process.env, ...(opts.env ?? {}) },
+    });
+    // Decode as a stream so a multi-byte character split across chunks survives.
+    this.proc.stdout.setEncoding("utf8");
+    this.proc.stdout.on("data", (d: string) => this.onStdout(d));
+    this.proc.stderr.on("data", (d) => {
+      if (opts.verboseStderr) process.stderr.write(d);
+    });
+    // An 'error' event without a listener is thrown. Stdin write failures are
+    // ignored here because the child's exit then rejects whatever is pending.
+    this.proc.stdin.on("error", () => undefined);
+    this.proc.on("error", (err) => this.failAll(`MCP server process error: ${err.message}`));
+    this.proc.on("exit", () => this.failAll("MCP server exited before responding"));
+  }
+
+  /** Mark the client closed and reject every in-flight request with `reason`. */
+  private failAll(reason: string): void {
+    this.closed = true;
+    for (const [, p] of this.pending) {
+      clearTimeout(p.timeout);
+      p.reject(new Error(reason));
+    }
+    this.pending.clear();
+  }
+
+  /** Buffer stdout and settle the pending request answered by each complete line. */
+  private onStdout(chunk: string): void {
+    this.buf += chunk;
+    let idx: number;
+    while ((idx = this.buf.indexOf("\n")) >= 0) {
+      const line = this.buf.slice(0, idx).trim();
+      this.buf = this.buf.slice(idx + 1);
+      if (!line) continue;
+      let msg: { id?: number; result?: { content?: { text: string }[]; isError?: boolean }; error?: { message: string } };
+      try {
+        msg = JSON.parse(line);
+      } catch {
+        continue; // not a JSON line — ignore it
+      }
+      if (typeof msg.id !== "number") continue;
+      const pending = this.pending.get(msg.id);
+      if (!pending) continue;
+      this.pending.delete(msg.id);
+      clearTimeout(pending.timeout);
+      if (msg.error) {
+        pending.reject(new Error(`JSON-RPC error: ${msg.error.message}`));
+        continue;
+      }
+      if (msg.id === this.initId) {
+        // The handshake result is server metadata, not a tool payload.
+        pending.resolve({ ok: true, data: msg.result });
+        continue;
+      }
+      const text = msg.result?.content?.[0]?.text;
+      if (text === undefined) {
+        pending.reject(new Error("Tool response had no content text"));
+        continue;
+      }
+      try {
+        pending.resolve(JSON.parse(text) as ToolResponse);
+      } catch {
+        pending.resolve({ ok: false, errors: ["non-JSON response"], data: text } as ToolResponse);
+      }
+    }
+  }
+
+  private send(method: string, params: Record<string, unknown>): number {
+    if (this.closed) throw new Error("MCP client is closed");
+    const id = this.nextId++;
+    this.proc.stdin.write(JSON.stringify({ jsonrpc: "2.0", id, method, params }) + "\n");
+    return id;
+  }
+
+  /** Send one request and keep it pending until a response, the timeout, or process exit settles it. */
+  private request(method: string, params: Record<string, unknown>, onTimeout: string): Promise<ToolResponse> {
+    return new Promise<ToolResponse>((resolveFn, rejectFn) => {
+      const id = this.send(method, params);
+      if (method === "initialize") this.initId = id;
+      const timeout = setTimeout(() => {
+        this.pending.delete(id);
+        rejectFn(new Error(onTimeout));
+      }, this.timeoutMs);
+      this.pending.set(id, { resolve: resolveFn, reject: rejectFn, timeout });
+    });
+  }
+
+  /** Perform the `initialize` handshake; resolves once the server answers (no `notifications/initialized` is sent afterwards). */
+  async initialize(): Promise<void> {
+    const params = { protocolVersion: "2024-11-05", capabilities: {}, clientInfo: { name: "dr-test-harness", version: "0" } };
+    await this.request("initialize", params, "initialize timed out");
+  }
+
+  /** Invoke `tool` via `tools/call` and resolve with its decoded response envelope. */
+  async call<T = unknown>(tool: string, args: Record<string, unknown> = {}): Promise<ToolResponse<T>> {
+    const onTimeout = `tool '${tool}' timed out after ${this.timeoutMs}ms`;
+    return (await this.request("tools/call", { name: tool, arguments: args }, onTimeout)) as ToolResponse<T>;
+  }
+
+  /** Same as call(), but throws when ok=false (test ergonomics). */
+  async callOk<T = unknown>(tool: string, args: Record<string, unknown> = {}): Promise<T> {
+    const res = await this.call<T>(tool, args);
+    if (!res.ok) {
+      throw new Error(`Expected ok call for ${tool}, got errors: ${(res.errors ?? []).join("; ")}`);
+    }
+    return res.data as T;
+  }
+
+  /** Same as call(), but throws when ok=true (used to assert gate failures). */
+  async callFail(tool: string, args: Record<string, unknown> = {}): Promise<string[]> {
+    const res = await this.call(tool, args);
+    if (res.ok) {
+      throw new Error(
+        `Expected ${tool} to fail, but it succeeded with: ${JSON.stringify(res.data).slice(0, 200)}`
+      );
+    }
+    return res.errors ?? [];
+  }
+
+  /** Send SIGTERM and wait for the child to exit (or error); a no-op once the client is closed. */
+  async close(): Promise<void> {
+    if (this.closed) return;
+    this.closed = true;
+    this.proc.kill("SIGTERM");
+    await new Promise<void>((done) => { this.proc.once("exit", () => done()); this.proc.once("error", () => done()); });
+  }
+}
+
+/** Spawn a client, initialize it, run `fn` with it and always close it afterwards; resolves with `fn`'s result. */
+export async function withMcp<T>(
+  fn: (mcp: McpClient) => Promise<T>,
+  opts?: McpClientOptions
+): Promise<T> {
+  const client = new McpClient(opts);
+  const session = (async (): Promise<T> => {
+    await client.initialize();
+    return fn(client);
+  })();
+  return session.finally(() => client.close());
+}
diff --git a/server/tests/helpers/mock-openai.ts b/server/tests/helpers/mock-openai.ts
new file mode 100644
index 0000000..0d2ce6a
--- /dev/null
+++ b/server/tests/helpers/mock-openai.ts
@@ -0,0 +1,82 @@
+import OpenAI from "openai";
+
+/**
+ * Scripted response — a single completion the mock will return.
+ * If `toolCalls` is non-empty, the model is asking for those tools to be executed (finish_reason "tool_calls").
+ * If `text` is non-empty AND no toolCalls, this terminates the agent loop (finish_reason "stop"; content is null if omitted).
+ */
+export interface ScriptedResponse {
+  text?: string;
+  toolCalls?: { name: string; args: Record<string, unknown> }[];
+}
+
+/**
+ * Build a mock OpenAI client that replays `script` in order: each chat.completions.create call
+ * consumes one entry, and a call past the end rejects with an Error naming the last user/tool message.
+ */
+export function makeMockOpenAI(script: ScriptedResponse[]): OpenAI {
+  let i = 0;
+  const queue = [...script]; // snapshot: later edits to the caller's array do not affect the mock
+  let nextId = 1;
+
+  const create = async (params: OpenAI.Chat.ChatCompletionCreateParams) => {
+    const entry = queue[i++];
+    if (!entry) {
+      const lastUser = [...params.messages]
+        .reverse()
+        .find((m) => m.role === "user" || m.role === "tool");
+      const lastUserSummary = lastUser
+        ? `last ${lastUser.role}: ${
+            typeof lastUser.content === "string"
+              ? lastUser.content.slice(0, 120)
+              : "[structured content]"
+          }`
+        : "no user/tool messages found";
+      throw new Error(
+        `Mock OpenAI exhausted after ${i - 1} calls (${queue.length} scripted). ${lastUserSummary}`
+      );
+    }
+    if (process.env.DR_MOCK_DEBUG) {
+      process.stderr.write(`[mock #${i}] ${JSON.stringify(entry).slice(0, 200)}\n`);
+    }
+    const toolCalls = (entry.toolCalls ?? []).map((c) => ({
+      id: `call_${nextId++}`, // ids keep counting across responses, so they are unique per mock
+      type: "function" as const,
+      function: { name: c.name, arguments: JSON.stringify(c.args) },
+    }));
+    const message: OpenAI.Chat.ChatCompletionMessage = {
+      role: "assistant",
+      content: entry.text ?? null,
+      refusal: null,
+      ...(toolCalls.length > 0 && { tool_calls: toolCalls }),
+    };
+    return {
+      id: `cmpl_mock_${i}`,
+      object: "chat.completion",
+      created: Math.floor(Date.now() / 1000), // Unix seconds, as in real chat.completion objects (Date.now() is ms)
+      model: "mock",
+      choices: [
+        {
+          index: 0,
+          message,
+          finish_reason: toolCalls.length > 0 ? "tool_calls" : "stop",
+          logprobs: null,
+        },
+      ],
+      usage: { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150 },
+    } as unknown as OpenAI.Chat.ChatCompletion;
+  };
+
+  // Build a minimal object that quacks like OpenAI for our agent loop.
+  const mock = {
+    chat: {
+      completions: { create },
+    },
+  } as unknown as OpenAI;
+  return mock;
+}
+
+export function remainingMockCalls(client: OpenAI, expectedTotal: number): number {
+  // NOTE(review): placeholder — `client` is ignored and `expectedTotal` is returned unchanged, so this cannot yet show whether the script was fully consumed.
+  return expectedTotal;
+}
diff --git a/server/tests/helpers/tmp-project.ts b/server/tests/helpers/tmp-project.ts
new file mode 100644
index 0000000..a44e0f3
--- /dev/null
+++ b/server/tests/helpers/tmp-project.ts
@@ -0,0 +1,44 @@
+import { mkdtempSync, rmSync, existsSync, readFileSync, readdirSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+export interface TmpProject { // handle to a throwaway project directory, as built by makeTmpProject
+  cwd: string; // path of the temp directory itself
+  dispose: () => void; // removes the directory recursively
+  exists: (relative: string) => boolean; // `relative` is resolved against cwd, here and below
+  read: (relative: string) => string; // file contents decoded as utf8
+  readJson: <T = unknown>(relative: string) => T; // JSON.parse of the file; T is asserted by the caller, not validated
+  list: (relative: string) => string[]; // directory entries of cwd/relative
+  events: () => Array<Record<string, unknown>>; // parsed lines of .dr/events.jsonl, [] when the file is absent
+}
+
+export function makeTmpProject(prefix = "dr-test-"): TmpProject { // creates a unique temp dir plus sync helpers scoped to it
+  const cwd = mkdtempSync(join(tmpdir(), prefix));
+  const abs = (relative: string): string => join(cwd, relative); // anchor a relative path at the temp root
+  const slurp = (relative: string): string => readFileSync(abs(relative), "utf8");
+  return {
+    cwd,
+    dispose: () => rmSync(cwd, { recursive: true, force: true }),
+    exists: (relative) => existsSync(abs(relative)),
+    read: slurp,
+    readJson: (relative) => JSON.parse(slurp(relative)),
+    list: (relative) => readdirSync(abs(relative)),
+    events: () => {
+      if (!existsSync(abs(".dr/events.jsonl"))) return []; // no log file yet → empty event list
+      const lines = slurp(".dr/events.jsonl").split("\n");
+      return lines.filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+    },
+  };
+}
+
+export async function withTmpProject<T>( // scoped helper: temp project for the duration of `fn`
+  fn: (project: TmpProject) => Promise<T>,
+  prefix?: string
+): Promise<T> {
+  const project = makeTmpProject(prefix); // undefined prefix falls back to makeTmpProject's default
+  try {
+    return await fn(project); // awaited so cleanup happens only after fn settles
+  } finally {
+    project.dispose(); // directory is removed whether fn resolved or threw
+  }
+}
diff --git a/server/tests/unit-gate.test.ts b/server/tests/unit-gate.test.ts
new file mode 100644
index 0000000..2d10f0a
--- /dev/null
+++ b/server/tests/unit-gate.test.ts
@@ -0,0 +1,438 @@
+import { describe, it } from "node:test";
+import assert from "node:assert/strict";
+import { presetFor, resolveEffectiveGateConfig } from "../src/gate.js";
+import { evaluateAdvance, nextPhaseOf } from "../src/gateEval.js";
+import {
+  Decision,
+  PipelineState,
+  Project,
+  SCHEMA_VERSION,
+  Task,
+} from "../src/schemas/index.js";
+
+const NOW = "2026-05-17T00:00:00.000Z";
+
+function makeProject(overrides: Partial<Project> = {}): Project {
+  // Baseline fixture: a poc-preset project still in intake; `overrides` wins field by field.
+  const defaults: Project = {
+    id: "test-project",
+    title: "Test Project",
+    description: "An idea worth shipping.",
+    created_at: NOW, updated_at: NOW,
+    effort_level: "poc",
+    status: "intake",
+    sign_offs: [],
+    gate_config: { preset: "poc" },
+    tags: [],
+  };
+  return { ...defaults, ...overrides };
+}
+
+function makeState(overrides: Partial<PipelineState> = {}): PipelineState {
+  // Baseline fixture: intake phase under the poc gate preset, sequences at 1, nothing pending.
+  const defaults: PipelineState = {
+    schema_version: SCHEMA_VERSION,
+    project_id: "test-project",
+    phase: "intake",
+    effective_gate_config: presetFor("poc"),
+    next_decision_seq: 1, next_task_seq: 1,
+    pending_questions: [],
+    gate_failures: [],
+  };
+  return { ...defaults, ...overrides };
+}
+
+function makeDecision(overrides: Partial<Decision> = {}): Decision {
+  // Baseline fixture: an accepted canonical decision with one position "A" selected and no reviews.
+  const defaults: Decision = {
+    id: "0001-test",
+    number: 1,
+    slug: "test",
+    title: "Test decision",
+    status: "accepted",
+    template_variant: "canonical",
+    created_at: NOW, updated_at: NOW,
+    assumptions: [],
+    constraints: [],
+    positions: [{ title: "A", pros: [], cons: [], links: [] }],
+    opinions: [],
+    selected_position: "A",
+    argument: "Because A.",
+    implications: [],
+    depends_on: [],
+    related_decisions: [],
+    related_artifacts: [],
+    review: [],
+    tags: [],
+  };
+  return { ...defaults, ...overrides };
+}
+
+function makeTask(overrides: Partial<Task> = {}): Task {
+  // Baseline fixture: a ready p2 task estimated at 2 hours with no dependencies or decision refs.
+  const defaults: Task = {
+    id: "T0001-test",
+    number: 1,
+    slug: "test",
+    title: "Test task",
+    status: "ready",
+    estimate: { unit: "hours", value: 2 },
+    acceptance_criteria: ["criteria 1"],
+    depends_on: [],
+    decision_refs: [],
+    priority: "p2",
+    labels: [],
+    created_at: NOW, updated_at: NOW,
+  };
+  return { ...defaults, ...overrides };
+}
+
+describe("gate / preset resolution", () => { // resolveEffectiveGateConfig + presetFor: baselines and per-knob overrides
+  it("returns the preset baseline when no overrides", () => {
+    const cfg = resolveEffectiveGateConfig({ preset: "mvp" });
+    assert.equal(cfg.min_decisions, 3);
+    assert.equal(cfg.min_tasks, 8);
+    assert.equal(cfg.max_task_estimate_hours, 8);
+    assert.equal(cfg.review_required_per_decision, false);
+    assert.deepEqual(cfg.review_required_phases, ["scoping", "decomposing"]);
+  });
+
+  it("applies overrides per-knob without affecting other preset values", () => {
+    const cfg = resolveEffectiveGateConfig({
+      preset: "mvp",
+      overrides: { min_tasks: 5, review_required_per_decision: true },
+    });
+    assert.equal(cfg.min_tasks, 5);
+    assert.equal(cfg.review_required_per_decision, true);
+    assert.equal(cfg.min_decisions, 3, "min_decisions still preset default");
+    assert.equal(cfg.max_task_estimate_hours, 8, "max_task_estimate_hours still preset default");
+  });
+
+  it("preset 'poc' is loosest, 'full' is strictest", () => {
+    const poc = presetFor("poc");
+    const mvp = presetFor("mvp");
+    const full = presetFor("full");
+    assert.ok(poc.min_tasks <= mvp.min_tasks); // minimums may only grow from poc → mvp → full
+    assert.ok(mvp.min_tasks <= full.min_tasks);
+    assert.ok(poc.min_decisions <= mvp.min_decisions);
+    assert.ok(mvp.min_decisions <= full.min_decisions);
+    assert.ok(poc.max_task_estimate_hours >= mvp.max_task_estimate_hours); // the estimate ceiling may only shrink
+    assert.ok(mvp.max_task_estimate_hours >= full.max_task_estimate_hours);
+  });
+});
+
+describe("nextPhaseOf", () => { // pins the order of pipeline phases
+  it("walks the linear pipeline", () => {
+    assert.equal(nextPhaseOf("intake"), "scoping");
+    assert.equal(nextPhaseOf("scoping"), "deciding");
+    assert.equal(nextPhaseOf("deciding"), "decomposing");
+    assert.equal(nextPhaseOf("decomposing"), "handing-off");
+    assert.equal(nextPhaseOf("handing-off"), "handed-off");
+    assert.equal(nextPhaseOf("handed-off"), null); // terminal phase: nothing follows
+  });
+});
+
+describe("evaluateAdvance: intake → scoping", () => {
+  it("passes with title + description", () => {
+    const project = makeProject();
+    const state = makeState({ phase: "intake" });
+    const result = evaluateAdvance(project, state, [], [], null); // presumably (project, state, decisions, tasks, sign-off); none supplied here
+    assert.equal(result.pass, true);
+    assert.equal(result.next_phase, "scoping");
+  });
+
+  it("blocks when description empty", () => {
+    const project = makeProject({ description: "" });
+    const state = makeState({ phase: "intake" });
+    const result = evaluateAdvance(project, state, [], [], null);
+    assert.equal(result.pass, false);
+    assert.ok(
+      result.reasons.some((r) => r.includes("description")), // matched by substring, not exact wording
+      `expected description-blocked reason; got: ${result.reasons.join(" | ")}`
+    );
+  });
+});
+
+describe("evaluateAdvance: scoping → deciding", () => {
+  it("passes with non-empty in_scope and success_criteria (poc)", () => {
+    const project = makeProject({
+      status: "scoping",
+      scope: {
+        in_scope: ["thing 1"],
+        success_criteria: ["measurable 1"],
+        out_of_scope: [],
+        nice_to_have: [],
+      },
+    });
+    const state = makeState({ phase: "scoping" });
+    const result = evaluateAdvance(project, state, [], [], null);
+    assert.equal(result.pass, true);
+  });
+
+  it("blocks when in_scope is empty", () => {
+    const project = makeProject({
+      status: "scoping",
+      scope: {
+        in_scope: [],
+        success_criteria: ["x"],
+        out_of_scope: [],
+        nice_to_have: [],
+      },
+    });
+    const state = makeState({ phase: "scoping" });
+    const result = evaluateAdvance(project, state, [], [], null);
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("in_scope")));
+  });
+
+  it("blocks when success_criteria is empty", () => {
+    const project = makeProject({
+      status: "scoping",
+      scope: {
+        in_scope: ["x"],
+        success_criteria: [],
+        out_of_scope: [],
+        nice_to_have: [],
+      },
+    });
+    const state = makeState({ phase: "scoping" });
+    const result = evaluateAdvance(project, state, [], [], null);
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("success_criteria")));
+  });
+
+  it("under mvp preset, requires a scoping DR with passing review", () => { // three stages: no DR, unreviewed DR, reviewed DR
+    const project = makeProject({
+      effort_level: "mvp",
+      status: "scoping",
+      scope: {
+        in_scope: ["x"],
+        success_criteria: ["y"],
+        out_of_scope: [],
+        nice_to_have: [],
+      },
+      gate_config: { preset: "mvp" },
+    });
+    const state = makeState({
+      phase: "scoping",
+      effective_gate_config: presetFor("mvp"),
+    });
+    const noScopingDr = evaluateAdvance( // stage 1: no decisions at all
+      project,
+      state,
+      [],
+      [],
+      { by: "human" }
+    );
+    assert.equal(noScopingDr.pass, false);
+    assert.ok(noScopingDr.reasons.some((r) => r.includes("scoping decision")));
+
+    const unreviewedScopingDr = makeDecision({ // stage 2: scoping DR exists but has no review entries
+      id: "0001-scope",
+      slug: "scope",
+      template_variant: "scoping",
+      status: "proposed",
+      review: [],
+    });
+    const stillBlocked = evaluateAdvance(
+      project,
+      state,
+      [unreviewedScopingDr],
+      [],
+      { by: "human" }
+    );
+    assert.equal(stillBlocked.pass, false);
+    assert.ok(stillBlocked.reasons.some((r) => r.includes("no passing review")));
+
+    const reviewedScopingDr = makeDecision({ // stage 3: same DR, still "proposed", now with one "pass" review
+      id: "0001-scope",
+      slug: "scope",
+      template_variant: "scoping",
+      status: "proposed",
+      review: [
+        {
+          reviewer: "dr-skeptic",
+          lens: "operational",
+          verdict: "pass",
+          score: 4,
+          concerns: [],
+          at: NOW,
+        },
+      ],
+    });
+    const passes = evaluateAdvance(
+      project,
+      state,
+      [reviewedScopingDr],
+      [],
+      { by: "human" }
+    );
+    assert.equal(passes.pass, true, `expected pass, got: ${passes.reasons.join("; ")}`);
+  });
+});
+
+describe("evaluateAdvance: deciding → decomposing", () => {
+  it("blocks when fewer decisions than min_decisions", () => {
+    const project = makeProject({ status: "deciding", effort_level: "mvp" });
+    const state = makeState({ phase: "deciding", effective_gate_config: presetFor("mvp") });
+    const result = evaluateAdvance(project, state, [makeDecision()], [], { by: "human" }); // one decision supplied under the mvp preset
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("decisions")));
+  });
+
+  it("blocks when any decision is still 'proposed'", () => {
+    const project = makeProject({ status: "deciding" });
+    const state = makeState({ phase: "deciding" });
+    const ds = [
+      makeDecision({ id: "0001-a", slug: "a" }),
+      makeDecision({ id: "0002-b", slug: "b", status: "proposed", selected_position: undefined, argument: undefined }),
+    ];
+    const result = evaluateAdvance(project, state, ds, [], { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("not 'accepted'")));
+  });
+
+  it("passes when all decisions accepted and deps resolved (poc)", () => {
+    const project = makeProject({ status: "deciding" });
+    const state = makeState({ phase: "deciding" });
+    const ds = [makeDecision()];
+    const result = evaluateAdvance(project, state, ds, [], { by: "human" });
+    assert.equal(result.pass, true, `expected pass, got: ${result.reasons.join("; ")}`);
+  });
+
+  it("blocks when decision dependencies are missing", () => {
+    const project = makeProject({ status: "deciding" });
+    const state = makeState({ phase: "deciding" });
+    const ds = [
+      makeDecision({ id: "0001-a", slug: "a", depends_on: ["0999-missing"] }), // depends on an id not present in `ds`
+    ];
+    const result = evaluateAdvance(project, state, ds, [], { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("missing dependencies")));
+  });
+
+  it("under full preset, requires every accepted decision to have a passing review", () => { // only the blocked case is exercised here
+    const project = makeProject({
+      status: "deciding",
+      effort_level: "full",
+      gate_config: { preset: "full" },
+    });
+    const state = makeState({
+      phase: "deciding",
+      effective_gate_config: presetFor("full"),
+    });
+    // 6 accepted decisions; min_decisions = 6 for full
+    const ds = Array.from({ length: 6 }, (_, i) =>
+      makeDecision({
+        id: `${String(i + 1).padStart(4, "0")}-d${i}`,
+        slug: `d${i}`,
+        number: i + 1,
+      })
+    );
+    const noReview = evaluateAdvance(project, state, ds, [], { by: "human" });
+    assert.equal(noReview.pass, false);
+    assert.ok(
+      noReview.reasons.some((r) => r.includes("lack a passing review")),
+      `expected per-decision-review blocker; got: ${noReview.reasons.join(" | ")}`
+    );
+  });
+});
+
+describe("evaluateAdvance: decomposing → handing-off", () => { // every case supplies three tasks and a human sign-off
+  it("passes with deps satisfied and estimates in budget", () => {
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1, decision_refs: [] }),
+      makeTask({ id: "T0002-b", slug: "b", number: 2, depends_on: ["T0001-a"] }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3, depends_on: ["T0002-b"] }),
+    ];
+    const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(result.pass, true, `expected pass, got: ${result.reasons.join("; ")}`);
+  });
+
+  it("blocks on cycles", () => {
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1, depends_on: ["T0003-c"] }), // a → c → b → a
+      makeTask({ id: "T0002-b", slug: "b", number: 2, depends_on: ["T0001-a"] }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3, depends_on: ["T0002-b"] }),
+    ];
+    const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("cycles")));
+  });
+
+  it("blocks on orphan dependencies", () => {
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1, depends_on: ["T0999-missing"] }), // no task with this id exists
+      makeTask({ id: "T0002-b", slug: "b", number: 2 }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3 }),
+    ];
+    const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("missing dependencies")));
+  });
+
+  it("blocks when task estimate exceeds max", () => {
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1, estimate: { unit: "hours", value: 100 } }),
+      makeTask({ id: "T0002-b", slug: "b", number: 2 }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3 }),
+    ];
+    const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("estimate")));
+  });
+
+  it("blocks when task has no estimate", () => {
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1 }),
+      makeTask({ id: "T0002-b", slug: "b", number: 2 }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3, estimate: undefined }), // explicit undefined replaces the factory's 2h default
+    ];
+    const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("missing or oversized")));
+  });
+
+  it("blocks when task references a missing decision", () => {
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1, decision_refs: ["0999-missing"] }), // only "0001-test" is supplied below
+      makeTask({ id: "T0002-b", slug: "b", number: 2 }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3 }),
+    ];
+    const result = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(result.pass, false);
+    assert.ok(result.reasons.some((r) => r.includes("missing decisions")));
+  });
+});
+
+describe("evaluateAdvance: sign-off requirement", () => {
+  it("requires human sign-off for handing-off under poc preset", () => { // same inputs twice; only the sign-off author differs
+    const project = makeProject({ status: "decomposing" });
+    const state = makeState({ phase: "decomposing" });
+    const tasks = [
+      makeTask({ id: "T0001-a", slug: "a", number: 1 }),
+      makeTask({ id: "T0002-b", slug: "b", number: 2 }),
+      makeTask({ id: "T0003-c", slug: "c", number: 3 }),
+    ];
+    const agentOnly = evaluateAdvance(project, state, [makeDecision()], tasks, {
+      by: "agent",
+    });
+    assert.equal(agentOnly.pass, false);
+    assert.ok(agentOnly.reasons.some((r) => r.includes("human sign-off")));
+
+    const human = evaluateAdvance(project, state, [makeDecision()], tasks, { by: "human" });
+    assert.equal(human.pass, true, `expected pass, got: ${human.reasons.join("; ")}`);
+  });
+});
diff --git a/server/tests/unit-schemas.test.ts b/server/tests/unit-schemas.test.ts
new file mode 100644
index 0000000..3ab2764
--- /dev/null
+++ b/server/tests/unit-schemas.test.ts
@@ -0,0 +1,273 @@
+import { describe, it } from "node:test";
+import assert from "node:assert/strict";
+import {
+  DecisionIdSchema,
+  DecisionSchema,
+  EventSchema,
+  GateConfigSchema,
+  PipelineStateSchema,
+  ProjectSchema,
+  SCHEMA_VERSION,
+  SlugSchema,
+  TaskIdSchema,
+  TaskSchema,
+} from "../src/schemas/index.js";
+
+const NOW = "2026-05-17T00:00:00.000Z";
+
+describe("SlugSchema", () => { // kebab-case slug validation
+  it("accepts well-formed kebab-case", () => {
+    assert.doesNotThrow(() => SlugSchema.parse("project-name"));
+    assert.doesNotThrow(() => SlugSchema.parse("a1")); // digits are allowed
+    assert.doesNotThrow(() => SlugSchema.parse("multi-word-thing"));
+  });
+
+  it("rejects upper-case, underscores, leading/trailing dashes", () => {
+    assert.throws(() => SlugSchema.parse("Project"));
+    assert.throws(() => SlugSchema.parse("snake_case"));
+    assert.throws(() => SlugSchema.parse("-leading"));
+    assert.throws(() => SlugSchema.parse("trailing-"));
+    assert.throws(() => SlugSchema.parse("")); // the empty string is rejected too
+  });
+});
+
+describe("DecisionIdSchema", () => { // decision ids: four digits, a dash, then a slug
+  it("requires 0000-slug shape", () => {
+    assert.doesNotThrow(() => DecisionIdSchema.parse("0001-language-choice"));
+    assert.doesNotThrow(() => DecisionIdSchema.parse("9999-ab"));
+  });
+
+  it("rejects malformed prefixes", () => {
+    assert.throws(() => DecisionIdSchema.parse("1-foo")); // number not zero-padded to four digits
+    assert.throws(() => DecisionIdSchema.parse("0001")); // no slug part
+    assert.throws(() => DecisionIdSchema.parse("T0001-foo")); // task-style prefix
+    assert.throws(() => DecisionIdSchema.parse("0001-")); // empty slug
+  });
+});
+
+describe("TaskIdSchema", () => { // task ids: capital T, four digits, a dash, then a slug
+  it("requires T0000-slug shape", () => {
+    assert.doesNotThrow(() => TaskIdSchema.parse("T0001-bootstrap"));
+  });
+
+  it("rejects decision-style IDs", () => {
+    assert.throws(() => TaskIdSchema.parse("0001-foo"));
+    assert.throws(() => TaskIdSchema.parse("t0001-foo")); // the prefix is case-sensitive
+  });
+});
+
+describe("GateConfigSchema", () => { // preset name plus optional per-knob overrides
+  it("accepts preset-only", () => {
+    assert.doesNotThrow(() => GateConfigSchema.parse({ preset: "poc" }));
+    assert.doesNotThrow(() => GateConfigSchema.parse({ preset: "mvp" }));
+    assert.doesNotThrow(() => GateConfigSchema.parse({ preset: "full" }));
+  });
+
+  it("accepts preset + overrides", () => {
+    const parsed = GateConfigSchema.parse({
+      preset: "mvp",
+      overrides: { min_tasks: 5, review_required_per_decision: true },
+    });
+    assert.equal(parsed.overrides?.min_tasks, 5); // overrides survive parsing unchanged
+    assert.equal(parsed.overrides?.review_required_per_decision, true);
+  });
+
+  it("rejects unknown preset values", () => {
+    assert.throws(() => GateConfigSchema.parse({ preset: "rapid" }));
+  });
+});
+
+describe("ProjectSchema", () => {
+  const validProject = { // minimal object the first test expects to parse; other tests corrupt one field each
+    id: "demo",
+    title: "Demo",
+    description: "", // an empty description is accepted by the schema
+    created_at: NOW,
+    updated_at: NOW,
+    effort_level: "poc" as const,
+    status: "intake" as const,
+    sign_offs: [],
+    gate_config: { preset: "poc" as const },
+    tags: [],
+  };
+
+  it("round-trips a minimal project", () => {
+    const parsed = ProjectSchema.parse(validProject);
+    assert.equal(parsed.id, "demo");
+    assert.equal(parsed.status, "intake");
+  });
+
+  it("rejects unknown status values", () => {
+    assert.throws(() => ProjectSchema.parse({ ...validProject, status: "launching" }));
+  });
+
+  it("rejects bogus id slugs", () => {
+    assert.throws(() => ProjectSchema.parse({ ...validProject, id: "Invalid_Id" }));
+  });
+
+  it("rejects invalid effort_level", () => {
+    assert.throws(() => ProjectSchema.parse({ ...validProject, effort_level: "rapid" }));
+  });
+});
+
+describe("DecisionSchema", () => {
+  const validDecision = { // only the fields the first test shows to be sufficient; list fields are omitted
+    id: "0001-xx",
+    number: 1,
+    slug: "xx",
+    title: "X",
+    status: "proposed" as const,
+    template_variant: "canonical" as const,
+    created_at: NOW,
+    updated_at: NOW,
+  };
+
+  it("accepts minimal valid decision", () => {
+    const parsed = DecisionSchema.parse(validDecision);
+    assert.equal(parsed.id, "0001-xx");
+    assert.deepEqual(parsed.positions, []); // omitted lists come back as empty arrays
+    assert.deepEqual(parsed.review, []);
+  });
+
+  it("rejects mismatched id format", () => {
+    assert.throws(() => DecisionSchema.parse({ ...validDecision, id: "T0001-xx" }));
+  });
+
+  it("rejects invalid template_variant", () => {
+    assert.throws(() =>
+      DecisionSchema.parse({ ...validDecision, template_variant: "novel" })
+    );
+  });
+
+  it("parses full structure with positions, review, sign_off", () => {
+    const full = {
+      ...validDecision,
+      status: "accepted" as const,
+      positions: [{ title: "A", pros: ["fast"], cons: [], links: [] }],
+      selected_position: "A",
+      argument: "speed matters",
+      implications: ["follow-up"],
+      review: [
+        {
+          reviewer: "dr-skeptic",
+          lens: "operational" as const,
+          verdict: "pass" as const,
+          score: 5,
+          concerns: [],
+          at: NOW,
+        },
+      ],
+      sign_off: { by: "human" as const, at: NOW },
+    };
+    const parsed = DecisionSchema.parse(full);
+    assert.equal(parsed.selected_position, "A");
+    assert.equal(parsed.review[0]?.verdict, "pass");
+    assert.equal(parsed.sign_off?.by, "human");
+  });
+});
+
+describe("TaskSchema", () => {
+  const validTask = { // minimal task with no estimate; the first test expects it to parse
+    id: "T0001-xx",
+    number: 1,
+    slug: "xx",
+    title: "X task",
+    status: "open" as const,
+    acceptance_criteria: [],
+    depends_on: [],
+    decision_refs: [],
+    priority: "p2" as const,
+    labels: [],
+    created_at: NOW,
+    updated_at: NOW,
+  };
+
+  it("round-trips a minimal task", () => {
+    const parsed = TaskSchema.parse(validTask);
+    assert.equal(parsed.status, "open");
+    assert.equal(parsed.priority, "p2");
+  });
+
+  it("accepts estimate with confidence", () => {
+    const parsed = TaskSchema.parse({
+      ...validTask,
+      estimate: { unit: "hours", value: 4, confidence: "med" },
+    });
+    assert.equal(parsed.estimate?.confidence, "med");
+  });
+
+  it("rejects negative estimate", () => {
+    assert.throws(() =>
+      TaskSchema.parse({
+        ...validTask,
+        estimate: { unit: "hours", value: -1 },
+      })
+    );
+  });
+
+  it("rejects unknown priority", () => {
+    assert.throws(() => TaskSchema.parse({ ...validTask, priority: "p4" }));
+  });
+});
+
+describe("PipelineStateSchema", () => {
+  const validState = { // gate config is spelled out by hand here rather than taken from a preset helper
+    schema_version: SCHEMA_VERSION,
+    project_id: "demo",
+    phase: "intake" as const,
+    effective_gate_config: {
+      decisions_required_status: "accepted" as const,
+      review_required_phases: [],
+      review_required_per_decision: false,
+      max_task_estimate_hours: 16,
+      require_human_signoff_phases: ["handing-off"],
+      min_decisions: 0,
+      min_tasks: 3,
+    },
+    next_decision_seq: 1,
+    next_task_seq: 1,
+    pending_questions: [],
+    gate_failures: [],
+  };
+
+  it("round-trips and defaults", () => {
+    const parsed = PipelineStateSchema.parse(validState);
+    assert.equal(parsed.phase, "intake");
+    assert.equal(parsed.next_decision_seq, 1);
+  });
+
+  it("rejects non-semver schema_version", () => {
+    assert.throws(() =>
+      PipelineStateSchema.parse({ ...validState, schema_version: "0.1" }) // two-part version string
+    );
+  });
+});
+
+describe("EventSchema", () => { // event records: at, actor, kind, plus optional entity fields and payload
+  it("accepts a minimal event", () => {
+    const parsed = EventSchema.parse({
+      at: NOW,
+      actor: "agent",
+      kind: "project_initialized",
+    });
+    assert.equal(parsed.kind, "project_initialized");
+  });
+
+  it("accepts a payload of arbitrary shape", () => {
+    const parsed = EventSchema.parse({
+      at: NOW,
+      actor: "human",
+      kind: "decision_accepted",
+      entity_kind: "decision",
+      entity_id: "0001-x",
+      payload: { reason: "fine", nested: { key: "value" } },
+    });
+    assert.equal(parsed.payload?.["reason"], "fine"); // payload keys are preserved by parsing
+  });
+
+  it("rejects unknown event kinds", () => {
+    assert.throws(() =>
+      EventSchema.parse({ at: NOW, actor: "agent", kind: "totally_made_up" })
+    );
+  });
+});
diff --git a/server/tsup.config.ts b/server/tsup.config.ts
index b32b759..ce9b473 100644
--- a/server/tsup.config.ts
+++ b/server/tsup.config.ts
@@ -1,7 +1,7 @@
 import { defineConfig } from "tsup";
 
 export default defineConfig({
-  entry: ["src/index.ts"],
+  entry: ["src/index.ts", "src/cli.ts"],
   format: ["esm"],
   target: "node20",
   clean: true,