From 322e60968e3acb5970d540a550d28a530e9eec2a Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 09:53:38 -0700 Subject: [PATCH 01/18] =?UTF-8?q?roadmap(0203):=20fo-efficiency=20shaping?= =?UTF-8?q?=20=E2=80=94=20j9=20backbone=20+=20boot=20forensics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit j9 = backbone (3 phases): contract split (enabler + contract-audit), lazy-TeamCreate (~89k lever), shallow-boot-then-greet. Phase-1 + Phase-2/3 ideation spikes both VIABLE. T3 prose-audit files along. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/roadmap/0203-fo-efficiency/README.md | 70 +++++ .../roadmap/0203-fo-efficiency/T1-ideation.md | 184 ++++++++++++ .../0203-fo-efficiency/boot-analysis.md | 131 +++++++++ .../j9-phases-2-3-ideation.md | 262 ++++++++++++++++++ 4 files changed, 647 insertions(+) create mode 100644 docs/roadmap/0203-fo-efficiency/README.md create mode 100644 docs/roadmap/0203-fo-efficiency/T1-ideation.md create mode 100644 docs/roadmap/0203-fo-efficiency/boot-analysis.md create mode 100644 docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md diff --git a/docs/roadmap/0203-fo-efficiency/README.md b/docs/roadmap/0203-fo-efficiency/README.md new file mode 100644 index 00000000..172a366c --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/README.md @@ -0,0 +1,70 @@ +# 0203 — FO Efficiency: shallow boot + lazy contract + +**Milestone:** 0.20.3 +**Status:** shaping (design approved 2026-06-13; T1 ideation in flight) +**Theme:** make the first officer cheap to boot and run. 
+ +## Why + +Boot forensics on a live FO session (`/tmp/boot-analysis-spacedock-v1.md`) measured **~160k peak context and ~13.6 min** to reach an interactive greet — **with no team created and no worker dispatched.** The cost is structural, not a bug: the FO reads its entire contract (both reference files ~16k), the workflow README, and both mod files up front, then renders the full status table — most of it unused on a boot that never dispatches. Generation latency scales with loaded context, so the wall-clock is dominated by thinking at 100k+. + +## Goal (success criterion) + +An FO reaches interactive readiness — greet + state summary + *able to present a gate* — in seconds at **< ~60k** context, versus today's minutes at 126k+. Proven by a live FO-boot drive that observes correct behavior with the deferred modules unloaded — never by a grep over the restructured contract. + +## Cost levers (ranked — why j9 is the backbone) + +| Lever | ~boot cost removed | Needs the split? | +|-------|-------------------:|------------------| +| Lazy-TeamCreate (defer the team-mode prefix re-cache) | **~89k cache-creation** | no | +| Defer contract reads at greet | ~16k | yes (minimal) | +| Defer the human status-table render | ~8.7k | no | +| Defer mod-file reads | ~6.5k | no | + +The 89k lazy-TeamCreate dwarfs the rest and sits on the critical path *before the dispatch gate* — so **`j9` (lazy-TeamCreate + shallow-boot) is the backbone.** The contract split is the enabling refactor for the one lever that needs it, and doubles as the contract-cleanup ask. + +## The cut — reorganize the contract by *when* it is needed + +**Boot-resident** (read on every FO start): contract-gate/startup, discovery, `status --boot --json`, status viewer, ID styles, single-entity mode, write scope, captain interaction, event loop, gate-presentation entry, clarification, working principles. Exactly enough to greet, report state, and present a gate. 
+ +**Deferred — loaded only when its phase begins:** +- **Dispatch/team module** (first dispatch): team creation, dispatch adapter, worker resolution, reuse conditions, standing teammates, degraded mode, context budget. ≈70% of `claude-first-officer-runtime.md` plus shared-core's reuse/standing-teammate sections. The bulk — and the biggest pure boot waste today. +- **Merge module** (terminal boundary): merge-and-cleanup, ship-local, teardown, mod-block enforcement. +- **Already lazy** (the precedent we extend): `present-gate`, `feedback-rejection-flow`, `using-claude-team` skills. + +## Mechanism + +The `spacedock:first-officer` skill reads only the boot-resident core at startup. The team/dispatch content folds behind the existing lazily-loaded `using-claude-team` skill (already invoked at first dispatch); the merge content becomes a lazily-loaded reference invoked at terminalization. No new pattern — we extend the one already in the codebase. + +## Boot flow (the j9 shallow-boot) + +contract-gate → discovery → `status --boot --json` → **greet and stop for input.** Team creation, mod-file reads, the dispatch/merge modules, and the human status table all defer to the moment they are needed. + +## Test plan (honors the proof policy) + +- **New live shared-runtime scenario `shallow-boot`:** the FO boots, greets, and presents a gate with the dispatch/merge modules *not* loaded — verified through the live `internal/ensigncycle` harness on durable behavior. The win is correct behavior at lower loaded context; behavioral and live, not a contract grep. +- **Regression:** existing `gate-guardrail` / `rejection-flow` / `merge-hook-guardrail` scenarios still pass — the deferred modules load correctly when a real dispatch/merge happens. +- **Structural guard** (in the allowed `internal/contractlint` quarantine): a reference-closure check that the boot-resident core has no dependency on deferred-only content. Structural, not prose-grep. 
+ +## Tasks + +**`j9` is the backbone — one task, three phases.** Reshape the existing backlog entity; do not fragment it. The contract split runs first because it is the behavior-preserving enabler (and the contract-audit ask); the 89k lazy-TeamCreate is the headline lever. + +**j9 — Lazy-TeamCreate + shallow-boot-then-greet:** +- **Phase 1 — contract structural split** (enabler + the "audit and cleanup the fo contract" ask): extract boot-resident vs deferred into a lazy dispatch ref + a lazy merge ref; slim the boot-resident core + the skill loader. Behavior-preserving. *Proof: existing `gate-guardrail`/`rejection-flow`/`merge-hook-guardrail` live scenarios still pass + an `internal/contractlint` reference-closure guard.* Shaped in `T1-ideation.md` (labeled T1 there — it is j9 Phase 1; spike verdict VIABLE, ~70% boot-read cut). +- **Phase 2 — lazy-TeamCreate**: defer the `TeamCreate` call (the ~89k cache-creation) off the boot/greet path to first-dispatch-need. Needs no split. +- **Phase 3 — shallow-boot-then-greet**: greet off `status --boot --json`; defer mod-reads, the human status-table render, and the (now-split) deferred contract modules. Folds C3 (mod-defer) + C4 (status-render discipline). *Proof: new live `shallow-boot` scenario.* + +**T3 — residual-prose audit + comm-officer polish** (file along, post-Phase-1; the cut-list does not exist until the split lands). + +Boot-report habits (scope greps to headings; delegate bulk reconciliation reads to a subagent) tighten existing "Probe and Ideation Discipline" prose and ride along in Phase 1 / T3. + +## Out of scope (parked, not 0.20.3) + +- **p2 / vc** — `spacedock pr complete` + `reconcile --act`: the binary-simplification line (higher ROI, heavier lift) → 0.20.4. +- **xp** — cross-session FO↔Commander comms: the coordination infra that makes multi-FO safe. Its own track. +- **ey** — proof-policy port to shipped scaffolding: adjacent, separate. 
+ +## Operational landing + +This doc lives in the main repo (collision-free). Filing T1 and reshaping `j9` into `docs/dev/.spacedock-state` are coordinated with the active Commander session via path-scoped commits (disjoint entities → safe under the multi-writer protocol). diff --git a/docs/roadmap/0203-fo-efficiency/T1-ideation.md b/docs/roadmap/0203-fo-efficiency/T1-ideation.md new file mode 100644 index 00000000..64cd3688 --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/T1-ideation.md @@ -0,0 +1,184 @@ +# T1 — FO contract structural split + lazy-load + +**Milestone:** 0.20.3 (0203 FO efficiency) +**Stage:** ideation +**Blocks:** T2 (shallow-boot-then-greet); T3 files along after. + +Restructure the first-officer contract so a boot reads only what it needs to greet, report state, and present a gate. Extract the dispatch/team material and the merge material into already-lazy load points, leaving a slim boot-resident core. This is the structural split that T2's boot-flow reshape and T3's prose audit both depend on — neither cut-list exists until the split lands. + +## Problem + +Boot forensics on a live FO session (`/tmp/boot-analysis-spacedock-v1.md`) measured the cost of reaching an interactive greet **with no team created and no worker dispatched** — a 100% pre-dispatch session: + +- Peak context **160,594 tokens** (event 148); the session crossed 100k at event 97 (~363s) and never fell back. +- Wall-clock to the captain-facing gate question **~511s (~8.5 min)**; to the end of the captured session ~13.6 min. The wall-clock is dominated by generation latency that grows with loaded context — the two slowest think-turns (128.6s, 100.1s) both fired *above 97k context*. 
+ +The single biggest avoidable structural waste in that picture is the two FO reference files, **read whole, back-to-back, at events 23 and 25 (t≈8–9s), immediately after the skill loaded, before any work began:** + +| Forensics rank | Tokens | File | +|---|---|---| +| #1 | ~9,310 (37,249 ch) | `first-officer-shared-core.md` — single biggest ingest | +| #3 | ~6,900 (27,602 ch) | `claude-first-officer-runtime.md` — read back-to-back with #1 | + +That is **~16,200 tokens** of contract read on every boot. The forensics call out the runtime adapter specifically: "gate `claude-first-officer-runtime.md` behind 'am I creating a team this turn' — which never happened this session, so ~6.9k was pure waste." The runtime adapter is ~70% team/dispatch material that a boot which never dispatches never uses, yet it loads first. + +The cost is structural, not a bug: the loader reads the entire contract up front. Generation latency scales with loaded context, so trimming the boot read trims the wall-clock directly. + +## Spike (riskiest-first) — is the two-tier split structurally sound? + +**The riskiest unknown:** does the boot-resident path have a hard dependency on deferred content that would break if that content is not loaded at boot? This is static mechanism analysis — dependency tracing over the contract sections plus confirming the lazy hooks already exist. It is the right depth for ideation. The *live* FO-boot drive that proves the FO still greets and presents a gate correctly with the modules unloaded is the implementation/validation proof (the `shallow-boot` scenario in AC-1), not this spike. + +### Spike step 1 — every top-level section mapped to a tier + +Token estimates are char/4 over the section body. The two files total **~9,014 + ~6,700 ≈ 15,700 tokens**, matching the forensics' ~16.2k. 
+ +**`first-officer-shared-core.md` (17 sections, ~9,014 tok)** + +| Section | Lines | ~tok | Tier | +|---|--:|--:|---| +| Startup | 24 | 963 | **boot-resident** | +| Status Viewer | 39 | 655 | **boot-resident** | +| ID Styles | 10 | 256 | **boot-resident** | +| Single-Entity Mode | 12 | 188 | **boot-resident** | +| Working Directory | 4 | 44 | **boot-resident** | +| Dispatch | 24 | 746 | **dispatch-deferred** | +| Completion and Gates | 44 | 1340 | **split** — gate-decision spine boot-resident; reuse-conditions dispatch-deferred (see step 3, C5) | +| Merge and Cleanup (+ Ship-Local, Worktree-removal) | 49 | 1785 | **merge-deferred** | +| State Management | 6 | 66 | **boot-resident** (rebase-conflict halt referenced by Startup) | +| Worktree Ownership (+ Split-Root Worktree Contract) | 30 | 750 | **dispatch-deferred** | +| FO Write Scope | 15 | 430 | **boot-resident** | +| Mod Hook Convention (+ Mod-Block Enforcement) | 21 | 390 | **merge-deferred** (mod-block is a merge-ceremony concept) | +| Standing Teammates | 9 | 402 | **dispatch-deferred** (folds behind `using-claude-team`) | +| Clarification and Communication | 6 | 115 | **boot-resident** | +| Working Principles | 13 | 524 | **boot-resident** | +| Probe and Ideation Discipline | 7 | 308 | **boot-resident** | +| Issue Filing | 4 | 19 | **boot-resident** | + +**`claude-first-officer-runtime.md` (11 sections, ~6,700 tok)** + +| Section | Lines | ~tok | Tier | +|---|--:|--:|---| +| Team Creation (+ standing-teammate discovery/lazy-spawn/declaration) | 44 | 1622 | **dispatch-deferred** → `using-claude-team` | +| Worker Resolution | 10 | 153 | **dispatch-deferred** | +| Dispatch Adapter (+ break-glass) | 58 | 1941 | **dispatch-deferred** | +| Degraded Mode (spacedock seams) | 7 | 159 | **dispatch-deferred** → `using-claude-team` | +| Context Budget and Dead Ensign Handling | 23 | 586 | **dispatch-deferred** | +| Captain Interaction | 16 | 468 | **split** — gate-guardrail + greet boot-resident; team-mode chat hint + 
single-entity gate-resolution dispatch-deferred | +| Feedback Rejection Flow (bare mode) | 6 | 86 | **dispatch-deferred** (the routing skill `feedback-rejection-flow` is already lazy) | +| Event Loop (incl. reconcile sweep step 0) | 24 | 988 | **dispatch-deferred** | +| Mod-Block Enforcement at Terminal Transitions | 23 | 503 | **merge-deferred** | +| Agent Back-off | 6 | 84 | **boot-resident** (cheap; captain-interaction adjacent) | +| Entity-Body Inspection | 4 | 82 | **boot-resident** (points at shared-core Probe discipline) | + +**Tier totals (approximate):** + +- **Boot-resident:** shared-core ~3,800 tok (Startup, Status Viewer, ID Styles, Single-Entity Mode, Working Directory, State Management, FO Write Scope, Clarification, Working Principles, Probe discipline, Issue Filing, plus the gate-decision spine of Completion-and-Gates) + runtime ~700 tok (Captain Interaction greet/guardrail, Agent Back-off, Entity-Body Inspection) ≈ **~4,500 tok.** +- **Dispatch-deferred:** ~6,000 tok (the bulk of the runtime adapter + shared-core Dispatch/Worktree-Ownership/Standing-Teammates/reuse-conditions). +- **Merge-deferred:** ~2,700 tok (Merge-and-Cleanup, Ship-Local, Mod-Hook/Mod-Block, runtime Mod-Block-at-Terminal). + +**Boot read drops from ~15,700 tok to ~4,500 tok — roughly an 11k-token cut on every boot, ~70% of the contract-read cost.** (Net win is slightly less than the gross because the deferred modules re-load when a real dispatch/merge happens — but on a no-dispatch boot, the session the forensics measured, the full ~11k is saved.) + +### Spike step 2 — loader claim confirmed + +`skills/first-officer/SKILL.md` instructs reading **both** reference files at startup: + +- Line 18: `@references/first-officer-shared-core.md` — the `@`-directive inlines the shared core into the skill body at load. +- Lines 23–25: "Load the runtime adapter for your platform: … read `references/claude-first-officer-runtime.md`" (Claude branch). 
+- Line 27: "Then begin the Startup procedure from the shared core." + +The live forensics timeline corroborates this exactly: row 1 = `Skill spacedock:first-officer` (event 13, t=4s), row 2 = Read `first-officer-shared-core.md` (event 23, t=8s), row 3 = Read `claude-first-officer-runtime.md` (event 25, t=9s). Both files are read within one second of each other, at the very top of the session. `agents/first-officer.md` adds nothing — it only delegates to the skill ("invoke the `spacedock:first-officer` skill now to load it. Then begin the Startup procedure"). The skill is the single loader. + +### Spike step 3 — coupling trace (boot-resident → deferred) + +The crux question: does any boot-resident step *depend on* team/dispatch/merge knowledge before the greet? Findings, with the resolution for each: + +**C1 — shared-core Startup (the boot procedure itself) has ZERO reference into deferred content.** Startup steps 1–7 are: contract-version gate, `git rev-parse` root, `status --discover`, README read, `status --boot`, split-root halt-gate, split-root pull-on-boot. None mentions team, reconcile, dispatch, or merge. Its only forward reference is "follow the rebase-conflict halt in **State Management**" — and State Management is itself boot-resident. **This is the clean island that makes the split viable.** The boot procedure stands alone. + +**C2 — the runtime adapter "Team Creation" says "At startup (after reading the README, before dispatch)".** This is the load-bearing phrase. Read literally, "at startup" couples team creation into boot. But its own next sentence reframes it: "Invoke it before the first team-mode tool call in the session." And it already delegates entirely to the lazy skill: `Skill(skill="spacedock:using-claude-team")`. **Resolution:** retire "at startup" in favor of the truthful trigger — team creation fires at *first dispatch*, which is when `using-claude-team` is meant to load. This is a one-clause wording change, not a structural break. 
It aligns the contract with what the forensics already shows happening (the measured session never created a team because it never dispatched). + +**C3 — the reconcile sweep (Event Loop step 0) runs "(a) at boot, AFTER the split-root pull --rebase and BEFORE the first dispatch".** This is the genuine boot-adjacent step inside otherwise-deferred content. It needs a `team_name` (the A/B/C drift classes are roster-derived). **It does NOT need to run before the greet** — it runs "before the first dispatch," and the shallow-boot flow (T2) greets before any dispatch. So reconcile rides into the dispatch-deferred Event Loop module that loads at first dispatch, alongside the `team_name` it requires. There is no boot-resident step that calls reconcile; the greet happens off `status --boot --json` alone (forensics row 8 confirms `--boot --json` is already run at t=27s, before all the heavy reads). **No stub needed in the boot-resident core.** + +**C4 — standing-teammate discovery pass ("after team creation … BEFORE entering the normal dispatch event loop").** This is wholly inside the team-creation flow; it lazy-spawns at first dispatch already ("No spawn calls at boot. Spawn is deferred to the first team-mode dispatch"). It travels with the dispatch module. No boot coupling. + +**C5 — Completion-and-Gates "decide reuse-or-fresh" references the reuse-conditions and the runtime context-budget probe.** The gate-decision spine (never self-approve, present-gate, AC cross-check, the gated-stage branch) is boot-resident — a shallow boot must be able to present a gate. But the reuse-conditions block and the budget probe are only reached *after a worker completes*, which cannot happen before a dispatch. **Resolution:** split the section — the gate-presentation/AC-cross-check spine stays boot-resident; the reuse-or-fresh machinery (reuse conditions 0–4, the model-mismatch diagnostic, SendMessage advancement, supersede-shutdown) moves to the dispatch module. 
`present-gate` and `feedback-rejection-flow` are already lazy skills the boot-resident spine invokes by name — that precedent is exactly the shape. + +**C6 — Mod-Block Enforcement (shared-core) and Mod-Block at Terminal (runtime) are referenced from Merge-and-Cleanup only.** Both are merge-ceremony concepts. They travel with the merge module. The boot-resident core needs to know merge hooks *exist* (the MODS section of `status --boot` reports them), but not the enforcement mechanics — those are read at terminalization. No boot coupling. + +**Summary:** the only boot-resident step that touches deferred concepts is C2's "at startup" wording, and that is a wording-truth fix, not a structural dependency. C3 (reconcile) is boot-adjacent but fires before-first-dispatch, not before-greet, so it lives cleanly in the dispatch module. No boot-resident step genuinely needs team/dispatch/merge knowledge before the greet. + +### Spike step 4 — is the fold target non-duplicative? + +`spacedock:using-claude-team` already carries the generic team lifecycle: Deferred Team Tools (the ToolSearch hop), Team Creation (TeamCreate-first sequencing, naming, bare-mode fallback), the TeamCreate recovery procedure, the failure-recovery ladder, Degraded Mode (triggers/effects/captain-report/shutdown-sweep), Awaiting Completion, and Terminal Team Teardown. The runtime adapter's "Team Creation" section **already invokes this skill** (`Skill(skill="spacedock:using-claude-team")`) and explicitly states "the generic blocks they reference (`## Degraded Mode`, `## Awaiting Completion`, `## Terminal Team Teardown`) live in that skill, not in this file." + +So the split is already partly done: the *generic* team lifecycle is in the lazy skill; the *spacedock-specific* adapter sections (Worker Resolution, Dispatch Adapter, standing-teammate discovery/lazy-spawn, the spacedock Degraded-Mode seams, Context Budget, Event Loop/reconcile) are what still load eagerly in the runtime adapter. 
**Folding the runtime adapter's team/dispatch sections behind the same lazy load point as `using-claude-team` is non-duplicative** — they are the spacedock specializations that the generic skill leaves to the consumer. The clean mechanism: a new lazily-loaded reference (e.g. `references/claude-fo-dispatch.md`) holding the spacedock dispatch/team sections, read at the same first-dispatch moment `using-claude-team` is invoked. No content overlaps with the generic skill; the generic skill keeps the lifecycle, the deferred reference keeps the spacedock adapter. + +### Spike step 5 — VERDICT + +**Viable with one wording tweak.** The clean two-tier split is structurally sound: + +- The shared-core Startup procedure is a self-contained boot island with no forward dependency into deferred content (C1) — this is what makes the cut clean. +- The greet runs off `status --boot --json` alone, already executed early in the live session (forensics row 8), so nothing team/dispatch/merge is needed before the greet. +- Every coupling resolves without a boot-resident stub: reconcile (C3) and standing-teammates (C4) fire before-first-dispatch (not before-greet) and travel with the dispatch module; mod-block (C6) travels with the merge module; the gate spine splits cleanly from reuse-or-fresh (C5) along the present-gate/feedback-rejection precedent. +- **The one required tweak:** retire the runtime adapter's "At startup … before dispatch" framing for team creation (C2) in favor of "at first dispatch," matching the lazy `using-claude-team` invocation that is already the team's load point. One clause, not a redesign. + +No boot-path step genuinely needs team/dispatch/merge knowledge before the greet. The split is not blocked. + +## Proposed approach + +The section→module assignment is the table in spike step 1. The mechanism: + +1. 
**Slim the boot-resident core.** `first-officer-shared-core.md` keeps its boot-resident sections (Startup, Status Viewer, ID Styles, Single-Entity Mode, Working Directory, State Management, FO Write Scope, Clarification, Working Principles, Probe discipline, Issue Filing) plus the gate-presentation spine extracted from Completion-and-Gates. The runtime adapter keeps Captain Interaction's greet/guardrail, Agent Back-off, and Entity-Body Inspection. + +2. **Extract the dispatch/team module.** Move the runtime adapter's Worker Resolution, Dispatch Adapter, Context Budget, Event Loop (incl. reconcile), Degraded-Mode seams, standing-teammate discovery/lazy-spawn/declaration, and the shared-core Dispatch / Worktree-Ownership / Standing-Teammates / reuse-conditions into a lazily-loaded reference read at first dispatch. The generic team lifecycle stays in `using-claude-team` (already lazy); this new reference holds only the spacedock adapter. The first-dispatch load point is the existing `Skill(skill="spacedock:using-claude-team")` invocation — extend it to also pull the spacedock dispatch reference. **C2 fix:** the team-creation trigger reads "at first dispatch," not "at startup." + +3. **Extract the merge module.** Move Merge-and-Cleanup, Ship-Local Ceremony, Worktree-removal safety, Mod-Hook/Mod-Block Enforcement (shared-core), and Mod-Block-at-Terminal (runtime) into a lazily-loaded reference read at terminalization. The boot-resident core reaches it the same way it reaches `present-gate` / `feedback-rejection-flow`: by naming the load point at the terminal boundary. + +4. **Loader change.** `SKILL.md` line 18's `@references/first-officer-shared-core.md` inlines only the slimmed core. The runtime-adapter read (lines 23–25) loads only the boot-resident runtime sections. The dispatch and merge references are NOT read at startup — they load at their phase boundary, extending the existing lazy pattern (`present-gate`, `feedback-rejection-flow`, `using-claude-team`). 
No new mechanism. + +**Coupling resolutions (from step 3):** C1 — no change needed (clean island). C2 — wording fix to "at first dispatch." C3/C4 — reconcile + standing-teammates travel with the dispatch module (before-first-dispatch, not before-greet). C5 — split Completion-and-Gates: gate spine boot-resident, reuse-or-fresh deferred. C6 — mod-block travels with the merge module. + +This task is the structural move only. It is a content reorganization that must be **behavior-preserving** — the same instructions, reachable at the same moments, just loaded lazily. The regression scenarios (AC-2) are the proof of no behavior change; they are the load-bearing safety net for "this is a move, not a rewrite." + +## Out of scope + +- **The j9 boot-flow change (T2).** Greet-first sequencing, deferred mod-reads, deferred status-table render — the *new* shallow-boot behavior — is T2, blocked-by this task. T1 makes the modules loadable lazily; T2 changes when the FO greets. The `shallow-boot` live scenario is authored against T1's structure but its greet-first behavior is T2's claim. +- **p2 / vc binary-simplification** (`spacedock pr complete`, `reconcile --act`) — parked to 0.20.4. +- **The residual-prose audit + comm-officer polish (T3)** — the cut-list of leftover prose to trim does not exist until this split lands; T3 files along after. +- **The Codex and Pi runtime adapters.** This task splits the Claude adapter (the bulk file the forensics measured). The shared-core split applies cross-host, but the per-host dispatch/merge reference extraction for `codex-first-officer-runtime.md` (~4.9K) and `pi-first-officer-runtime.md` (~5.9K) is a follow-on if their boot cost warrants it — they are an order smaller than the Claude adapter. + +## Acceptance criteria + +Each AC names an end-state property of the finished split, verified by something outside this task body that can fail. 
No AC is proven by a string/substring/regex match over an instruction file the model reads — that is banned by this workflow (a passing match only asserts the implementer's own text is present). + +**AC-1 — With the dispatch and merge modules NOT loaded, a freshly-booted FO greets the captain, reports workflow state, and presents a human gate without self-approving, mutating, or archiving the entity.** +Verified by: a new live shared-runtime scenario `shallow-boot` in `internal/ensigncycle` (added to `sharedRuntimeScenarios()` with Claude + Codex runners per the README's add-a-scenario procedure). The runner launches the real host front door against a fixture sitting at a human gate, and the host-neutral assertion over `(before, after, observed)` confirms the FO presented the gate (durable: entity still at the gate stage, not archived, no `verdict`/`completed` set; final message carries a gate review + decision prompt) — the gate-guardrail behavior surviving with the deferred modules unloaded. Behavioral and live, not a contract grep. An offline negative case in `shared_scenarios_negative_test.go` builds the broken end-state (entity self-approved/archived) and proves the assertion goes red. + +**AC-2 — The split is behavior-preserving: the deferred modules load and function correctly when a real dispatch or merge happens.** +Verified by: the existing live scenarios `gate-guardrail`, `rejection-flow`, and `merge-hook-guardrail` in `internal/ensigncycle` still pass after the split. `gate-guardrail`/`rejection-flow` exercise the dispatch module loading at first dispatch (reuse-conditions, feedback routing); `merge-hook-guardrail` exercises the merge module loading at terminalization (mod-block enforcement). A green run of all three is the proof that lazy-loading did not drop a reachable instruction — run via `go test -tags live -run TestLiveClaudeSharedScenarios ./internal/ensigncycle`. 
+ +**AC-3 — The boot-resident core has no reference dependency on deferred-only content: every reference the boot-resident core makes resolves either within the boot-resident set or to a known lazy load point, never into the body of the dispatch/merge modules.** +Verified by: a new reference-closure structural guard in `internal/contractlint` (the allowed quarantine), extending `TestUserSkillReferenceClosureResolves`. The check parses the boot-resident files and the deferred-module manifest as real artifacts, builds the set of section anchors the deferred modules own, and fails if a boot-resident `@`/read reference resolves into a deferred-module section that is not one of the declared lazy load points (`using-claude-team`, `present-gate`, `feedback-rejection-flow`, the new dispatch/merge references). This tests a relationship between two independent values — the boot-resident reference set vs. the deferred-module section set — which can diverge (a future edit that points the boot core at a moved section makes them disagree), so it can fail; it is not a spelling check over a single file. A control test plants a boot-resident reference into a deferred section and proves the guard goes red. + +**AC-4 — The boot read shrinks: the files loaded at startup (the slimmed core + boot-resident runtime sections) no longer carry the dispatch/team or merge sections.** +Verified by: a structural-absence check in `internal/contractlint` confirming the dispatch-module section anchors (Dispatch Adapter, Worker Resolution, Event Loop, Context Budget, standing-teammate spawn) and merge-module anchors (Merge and Cleanup, Ship-Local Ceremony, Mod-Block Enforcement) are ABSENT from the boot-resident files and PRESENT in their deferred references — the same structural-absence shape as `TestRetiredPluginPrivatePathsAbsent`. The expected value (which anchors belong where) comes from the deferred-module manifest, an independent source the boot files can diverge from. 
This is a structural location check, not a behavioral claim; the behavioral win (lower loaded context) is AC-1's live scenario. The check fails if a deferred section is left behind in the boot core or a boot section is wrongly moved out. + +## Test plan + +- **AC-1 (`shallow-boot` live scenario):** the costly item. Following the README's 4-step add-a-scenario procedure — host-neutral entry in `sharedRuntimeScenarios()`, fixture + prompt in `shared_fixtures_test.go`, host-neutral assertion + offline negative in `shared_scenarios_negative_test.go`, runner entries in BOTH `claudeScenarioRunners()` and `codexScenarioRunners()`. Live, real host, durable-state assertion. Cost: one model-spend scenario added to the serial suite (~5–7 min Claude opus); the offline negative and the parity/definition guards run at zero spend. **Spot-check first:** run the parity/definition guards (`TestSharedScenarioRunnerCoverage|TestSharedRuntimeScenarioDefinitions`) before paying for the live run. +- **AC-2 (regression):** zero new authoring — run the three existing live scenarios after the split. Cost: the existing serial-suite wall-time, already budgeted in CI. +- **AC-3 + AC-4 (structural guards):** Go tests in `internal/contractlint`, no model spend, run in the offline gate job (`go test ./...`). Cost: low — they extend existing reference-closure and structural-absence patterns. Each ships with a control test (planted violation goes red) so the guard is proven able to fail, not vacuous. +- **Fixture vs live:** AC-1 is live (the runtime integration — does the FO behave with modules unloaded — is the claim). AC-2 is live (same reason). AC-3/AC-4 are structural fixtures over the shipped surface. No AC leans on a prose-grep over the contract. + +## Spike result + +**Verdict: viable with one wording tweak.** The two-tier split is structurally sound. 
Evidence: + +- **Loader claim confirmed** (step 2): `SKILL.md` reads both reference files at startup (`@references/first-officer-shared-core.md` + the runtime-adapter read), corroborated by the live forensics timeline (Skill → shared-core Read → runtime Read within one second, t=8–9s). +- **Boot island is clean** (C1): shared-core Startup steps 1–7 have zero forward reference into team/dispatch/merge content; their only forward reference (rebase-conflict halt) targets the boot-resident State Management section. +- **Greet needs no deferred knowledge:** the greet runs off `status --boot --json`, already executed at t=27s in the live session — after the reference and README reads (t=8–26s) but before the mod-file reads, the status-table renders, and the orphan-inspection reads that carried context past 100k. +- **Every coupling resolves** (step 3): C2 (one "at startup"→"at first dispatch" clause), C3/C4 (reconcile + standing-teammates fire before-first-dispatch, travel with the dispatch module), C5 (gate spine splits from reuse-or-fresh on the present-gate precedent), C6 (mod-block travels with the merge module). No boot-resident step needs team/dispatch/merge knowledge before the greet, so no boot-resident stub is forced. +- **Fold target is non-duplicative** (step 4): `using-claude-team` already carries the generic team lifecycle and the runtime adapter already invokes it; the spacedock-specific adapter sections fold behind the same first-dispatch load point without overlapping the generic skill. + +**Boot read drops ~15,700 → ~4,500 tokens (~11k cut, ~70% of contract-read cost) on a no-dispatch boot** — the exact ~16.2k the forensics flagged as read-whole-up-front, of which ~6.9k (the runtime adapter) was pure waste in the measured session. + +**Honesty on spike depth:** this is static mechanism analysis (dependency tracing + confirming the lazy hooks exist), the right depth for ideation. It does NOT prove the FO still behaves correctly with the modules unloaded — that is AC-1's live `shallow-boot` drive at implementation/validation. 
The spike establishes the split *can* be clean; the live scenario establishes that it *is*. diff --git a/docs/roadmap/0203-fo-efficiency/boot-analysis.md b/docs/roadmap/0203-fo-efficiency/boot-analysis.md new file mode 100644 index 00000000..0c01aa48 --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/boot-analysis.md @@ -0,0 +1,131 @@ +# Boot Forensics — `spacedock-v1` first-officer session + +**Session analyzed:** `334ffb94-3195-48c6-8add-215fdb772598.jsonl` (the only file; live/appending, ~769K on disk at analysis time) +**Project:** `~/.claude/projects/-Users-clkao-git-spacedock-research-spacedock-v1/` (workflow dir `docs/dev`) +**Boot window:** event 7 (first captain prompt "…Engage", t=0) → **the entire captured session is the boot/reconcile window**. No team was ever created and **no workflow worker/ensign was ever dispatched.** The only `Agent` call in the whole transcript (event 150, t≈818s) is *this forensics task itself* — a read-only analyst, not a workflow hand-off. So unlike the reference session (which fired a real ensign `Agent` at ~t+256s), this session is **100% pre-dispatch**: boot + heavy resume-state reconciliation, then a gate-question to the captain, then the captain queued the "analyze my boot cost" command. + +> Token figures below are **tokens** unless a char count is given in parens. The model's *context size* at a turn ≈ `input_tokens + cache_read_input_tokens + cache_creation_input_tokens` (from `.message.usage`). The "peak" is the max of that sum across assistant turns. +> +> **Honesty note on "~200k":** the captain reported "~200k token on boot," and the FO itself said "~200k in context" at event 148. **The jsonl never shows a 200k turn.** The measured peak is **167,420 tokens** (event 156, the FO's continuation *after* it dispatched this forensics subagent); the highest pre-dispatch turn is **160,594** (event 148). 
The "~200k" is a rounded-up perception (the running UI counter trends toward 200k and the next queued turn would push it further), not a value present in any `.message.usage`. Every number in this report is the actual jsonl value. + +## Boot totals + +| Metric | Value | +|---|---| +| Wall-clock to first dispatch | **N/A — no workflow dispatch ever happened** | +| Wall-clock to the captain-facing gate question (`AskUserQuestion`) | ~511s (~8.5 min), event 111 | +| Wall-clock to the captain's "~7min and not done" remark (queued command) | ~564s (when the gate answer landed) | +| Wall-clock to current tail (forensics `Agent` dispatch) | ~818s (~13.6 min), event 150 | +| **Peak context (pre-dispatch)** | **160,594 tokens** (event 148) | +| **Peak context (whole transcript)** | **167,420 tokens** (event 156, post-dispatch continuation) | +| Context crossed 50k | event 34 (~14s) | +| Context crossed 80k | event 72 (~121s) | +| Context crossed 100k | event 97 (~363s) | +| Context crossed 150k | event 132 (~648s) | +| Context crossed 200k | **never** (peak 167,420) | +| Baseline before any boot work | **27,256 tokens** (turn 1, event 11) | +| Output tokens spent | ~57,741 across 18 assistant API calls (~54,890 across the 17 calls up to the forensics dispatch) | + +The **27,256-token baseline** (paid before the FO read a single workflow file) decomposes roughly as: system prompt + tool schemas (~22k) + the `skill_listing` attachment (11.6k chars ≈ ~2.9k tok) + the superpowers `SessionStart` hook / `hook_additional_context` (6.3k chars ≈ ~1.6k tok). Note the deferred-tools mechanism worked — MCP/team tool schemas stayed deferred and cost nothing material at baseline. + +> **Why the jsonl is only ~769KB on disk but context hit ~167k:** almost all of the context is **cache-reads** (`cache_read_input_tokens`), not freshly stored bytes. Each turn re-references the same cached prefix; the jsonl only stores the *new* deltas (`cache_creation`) plus tool I/O. 
So a small file backs a large live context — context size ≠ transcript size. + +## Top token consumers (ranked) + +Ranked by what was **newly ingested** (each turn's `cache_creation` ≈ the prior tool result + the model's own prior output). Sizes shown as the tool-result char count and its ~token cost. + +| # | ~tokens | What it was | +|--:|--:|---| +| 1 | ~9,310 (37,249 ch) | Read `skills/first-officer/references/first-officer-shared-core.md` — the `spacedock:first-officer` shared-core reference. Single biggest ingest. | +| 2 | ~8,130 (32,512 ch) | Read `docs/dev/README.md` — the long workflow root README, read whole. | +| 3 | ~6,900 (27,602 ch) | Read `skills/first-officer/references/claude-first-officer-runtime.md` — the FO Claude runtime-adapter reference, read back-to-back with #1. | +| 4 | ~4,880 (19,520 ch) | Bash grep over 7 orphan entities for "recommend/PASSED/REJECTED/verdict/MATERIAL…" (event 89) — but the matched lines are **full multi-paragraph stage-report bodies**, so this pulled whole audit-finding paragraphs into context, not terse verdicts. Avoidable. | +| 5 | ~4,330 (17,321 ch) | **Full `spacedock status` table, render #1** (event 58) — ~40 entities with long `source` fields, human-formatted. | +| 6 | ~4,327 (17,311 ch) | **Full `spacedock status` table, render #2** (event 125) — the *same* table, re-rendered verbatim after the captain answered "Full status dump first." Duplicate of #5. | +| 7 | ~4,134 (16,539 ch) | Bash: read the verbose `8p` entity body inline (`sed -n '280,400p' brew-cask-agentsview-safehouse-deps.md`) + `gh pr view 347` (event 99). | +| 8 | ~3,267 (13,071 ch) | Read `docs/dev/_mods/pr-merge.md` — workflow mod file, read up-front (no merge was actually running). | +| 9 | ~3,202 (12,810 ch) | Read `docs/dev/_mods/comm-officer.md` — workflow mod file, read up-front. | +| 10 | ~2,416 (9,667 ch) | Read the reference report `/tmp/boot-analysis-spacedock-landing.md` (this analysis's own template, pulled into FO context). 
| +| 11 | ~919 (3,676 ch) | Bash: orphan-worktree frontmatter + stage-report-heading inspection over 9 entities (event 85). | +| — | ~2.9k + ~1.6k | `skill_listing` (11.6k ch) + superpowers `SessionStart` hook (6.3k ch) — both baseline injections (part of the 27,256). | + +Dominant block: the **two FO reference files (#1+#3 ≈ 16,200 tokens)** + the **long dev README (#2 ≈ 8,130)** + the **two mod files (#8+#9 ≈ 6,470)** = **~30,800 tokens of skill/reference/mod material read whole, up front, before any work began.** Stacked on top: the **two full status renders (#5+#6 ≈ 8,660, of which ~4,330 is a pure duplicate)** and the **orphan-inspection greps that returned full report paragraphs (#4+#7+#11 ≈ 9,930)**. + +## Slowest steps (wall-clock) — all model latency, not tool latency + +Every large gap is "tool_result received → next assistant turn starts" — the model *thinking* at 90–160k context. No tool execution was slow. (The 53s between event 111 and 112 is excluded — that's the captain's human think-time at the gate question, not model latency.) 
+ +| Gap | Where | What it was deliberating | +|--:|---|---| +| 128.6s | result idx100→turn idx109 | After reading the verbose 8p body + PR #347 state: composing the full "Boot complete / resume-state" report and the gate `AskUserQuestion` at 100k+ context | +| 100.1 / 95.1s | result idx90/92→turn idx97 | After the orphan validation-recommendation grep: classifying the parked entities and deciding to read the "one genuinely ambiguous" 8p entity | +| 71.1s | result idx86→turn idx87 | After the first orphan-worktree frontmatter sweep: "classify and gather the last facts I need" | +| 62.7 / 60.8 / 59.5s | results idx121/124/126→turn idx132 | After reading the reference report + re-rendering the full status table: deciding to pivot to the forensics task | +| 62.0s | result idx147→turn idx148 | Before composing the forensics `Agent` dispatch | +| 58.6 / 58.2s | results idx75/77→turn idx83 | After reading the two mod files: reasoning about comm-officer/pr-merge before the orphan sweep | +| 47.5 / 46.4s | results idx57/59→turn idx64 | After status render #1 + state-branch pull: building the full resume picture (24 backlog / 10 ideation / 2 impl / 7 validation) | +| 45.2s | result idx135→turn idx136 | Locating the session transcript for the forensics | + +The wall-clock is **dominated by generation latency that grows with loaded context** — the two longest think-turns (128.6s, 100.1s) both fired *above 97k context*, after the heavy reads had already accumulated. + +## Root-cause summary + +This session burned ~160k context and ~13.6 minutes of wall-clock **without ever creating a team or dispatching a single worker** — it never got past boot + reconcile. The cost is partly structural (a 27.3k baseline before boot runs) but, unlike the landing session, a large share is **avoidable / duplicated reconciliation cost**: + +1. 
**Front-loaded reference reads (~30.8k tokens).** The FO read *both* large FO references (16.2k), *plus* the long `docs/dev/README.md` (8.1k), *plus* **both** mod files `comm-officer.md` + `pr-merge.md` (6.5k) — all up front, before any work. The mod files in particular were read at boot even though **no merge was running** (pr-merge's own startup check found PR #347 still OPEN → "no action"). +2. **The full status table was rendered twice (~8.7k, ~4.3k of it pure duplicate).** Render #1 (event 58) was the FO's own reconciliation; render #2 (event 125) re-emitted the identical ~40-entity human table verbatim because the captain answered "Full status dump first." The FO never needed the *human-formatted* table for its own reasoning — it had `status --boot --json` already. +3. **Orphan-inspection greps returned full multi-paragraph report bodies (~9.9k).** The recommendation grep (event 89) matched lines like the `Material/Polish audit findings` paragraphs and the multi-sentence `REJECTED. AC-1..AC-4 all reproduce…` verdicts — entire stage-report prose, not just the verdict tokens the FO needed. Reading the verbose 8p body inline (event 99, ~4.1k) added more of the same. +4. **High-context generation latency.** The two slowest turns (128.6s, 100.1s) both ran above ~97k context. Wall-clock scales with the context the prior three points piled up. + +**Contrast with the landing session:** landing peaked at **~108k and DID dispatch a real ensign at t+256s** (~4.3 min). It paid a similar ~31.9k baseline and read its two FO refs (~16.2k), but it did **not** read mod files up front, did **not** render the human status table at all (let alone twice), and its state-inspection reads (entity `index.md`, focused git checks) stayed terse. This session paid **+~52k peak and +~9 min and produced zero dispatch** — the delta is almost entirely the duplicated status render, the paragraph-returning greps, and the up-front mod + README + dual-FO-ref reads. 
+ +> One genuinely unavoidable cost showed up at the tail: at event 156 the FO detected that **8p's status flipped `implementation`→`validation` between its two status reads with no `status --set` of its own** — i.e. a concurrent session writing the same local state checkout. That's real reconciliation work, not waste; but it's exactly the kind of heavy resume churn that should not sit in the FO's own context (see rec 6). + +## Recommendations (each shrinks context → directly cuts per-turn latency) + +1. **`j9` (Lazy-TeamCreate + shallow-boot-then-greet) is the headline fix here.** This session created no team and dispatched nothing, yet paid full deep-boot cost. A shallow boot that greets the captain off `status --boot --json` alone — deferring references, mods, and the human table until an action is actually chosen — would have answered the captain in seconds at <60k instead of 8.5 min at 126k. Everything below is a lever inside `j9`. +2. **Lazy-load / split the two FO references (~16.2k).** Same recommendation as landing, but it bites harder here. Read `first-officer-shared-core.md` only; gate `claude-first-officer-runtime.md` behind "am I creating a team this turn" — which never happened this session, so ~6.9k was pure waste. +3. **Defer mod-file reads until the mod actually fires (~6.5k saved).** `comm-officer.md` and `pr-merge.md` were read at boot but neither ran (no team spawned, PR #347 still open). Read a mod only when its hook triggers (a merge starts / a team is created), not on every boot. +4. **Never render the full human status table for the FO's own reasoning, and never twice (~8.7k saved).** Use `status --boot --json` (already run) and `status --json --fields <fields>` for internal reconciliation; render the human table to the *captain* at most once, on explicit request — don't re-emit the identical 17k-char table the FO already reasoned over. +5. 
**Scope orphan-inspection greps to headings / single recommendation lines, not full paragraphs (~10k saved).** Match only the terse verdict (e.g. `grep -oE '^(PASSED|REJECTED|APPROVED)\b.*' | head -1` per entity, or the `### Recommendation` heading line) instead of `grep -niE 'recommend|MATERIAL|…'`, which dragged whole multi-paragraph audit bodies into context. Read the verbose 8p body only if the JSON status is genuinely ambiguous. +6. **Delegate heavy resume reconciliation to a subagent so it never sits in the FO's own context.** The 9-orphan sweep, the per-entity verdict greps, the 8p body read, and the concurrent-write detection are exactly the kind of bulk parsing that should run in a throwaway subagent that returns a one-screen digest — the same pattern this very forensics task uses. That keeps the FO's working context near baseline and its per-turn latency low. + +--- + +*Read-only analysis; no files modified. Numbers parsed directly from the session jsonl (live/appending — frozen at the analysis snapshot of 161 events). The chronological tool-use timeline below was extracted fresh from the same jsonl.* + +## Chronological tool-use timeline + +Freshly extracted from `334ffb94-…jsonl`. 27 tool calls. `ctx` = context tokens (input+cache_read+cache_creation) on the assistant turn that issued the call. `idx` = line index in the jsonl. + +> **The whole table is the boot/reconcile window — there is no post-dispatch workflow continuation.** Row 26 (idx150) is the *forensics* `Agent` dispatch (this task), and row 27 (idx158) is the FO's parallel continuation after handing the forensics off. No workflow ensign was ever launched, so there is no "boot peak at dispatch" to cite — the peak is just where the heavy reconcile reads happened to land (idx148/156, ~160–167k). 
+ +| # | idx | t+s | ctx(tok) | tool | detail | +|--:|--:|--:|--:|---|---| +| 1 | 13 | 4 | 27,256 | Skill | spacedock:first-officer | +| 2 | 23 | 8 | 28,032 | Read | spacedock-v1/skills/first-officer/references/first-officer-shared-core.md | +| 3 | 25 | 9 | 28,032 | Read | spacedock-v1/skills/first-officer/references/claude-first-officer-runtime.md | +| 4 | 27 | 9 | 28,032 | Bash | echo "CLAUDECODE=$CLAUDECODE"; echo "CODEX_THREAD_ID=$CODEX_THREAD_ID" … (env probe) | +| 5 | 36 | 16 | 53,272 | Bash | echo "SPACEDOCK_BIN=$SPACEDOCK_BIN"; spacedock --version; git rev-parse … | +| 6 | 40 | 21 | 53,711 | Bash | spacedock status --discover; echo "---exit:$?" | +| 7 | 45 | 26 | 54,300 | Read | spacedock-v1/docs/dev/README.md | +| 8 | 47 | 27 | 54,300 | Bash | spacedock status --workflow-dir …/docs/dev --boot --json | +| 9 | 55 | 57 | 67,630 | Bash | cd …/docs/dev/.spacedock-state && echo "=== state branch ===" … | +| 10 | 58 | 58 | 67,630 | Bash | spacedock status --workflow-dir …/docs/dev (FULL human table, render #1) | +| 11 | 66 | 108 | 76,536 | Bash | git pull --rebase origin spacedock-state/dev 2>&1 | tail -15 | +| 12 | 67 | 109 | 76,536 | Bash | ls -la …/docs/dev/_mods/ | +| 13 | 74 | 122 | 80,489 | Read | spacedock-v1/docs/dev/_mods/comm-officer.md | +| 14 | 76 | 123 | 80,489 | Read | spacedock-v1/docs/dev/_mods/pr-merge.md | +| 15 | 85 | 187 | 91,363 | Bash | orphan sweep: 9 entities, frontmatter + stage-report headings | +| 16 | 89 | 263 | 97,734 | Bash | grep -niE "recommend|PASSED|REJECTED|verdict|MATERIAL…" over 7 entities (full paras) | +| 17 | 91 | 267 | 97,734 | Bash | cd …/spacedock-v1; local branch / origin freshness (no fetch) | +| 18 | 99 | 367 | 112,132 | Bash | sed -n '280,400p' 8p body (brew-cask-agentsview-safehouse-deps) + gh pr view 347 | +| 19 | 111 | 511 | 125,854 | AskUserQuestion | Q: Where do you want me to start landing the parked work? 
| +| 20 | 120 | 586 | 136,546 | Read | /tmp/boot-analysis-spacedock-landing.md (the reference report) | +| 21 | 122 | 587 | 136,546 | Bash | ls -lat ~/.claude/projects/*spacedock-v1*/ … (locate transcript) | +| 22 | 125 | 588 | 136,546 | Bash | spacedock status --workflow-dir …/docs/dev (FULL human table, render #2 — duplicate) | +| 23 | 134 | 651 | 150,397 | Bash | cd …/projects/-Users-…-spacedock-v1/; newest jsonl file … | +| 24 | 138 | 700 | 154,921 | Bash | cd …/projects/…; for f in $(ls -t *.jsonl) … (identify session file) | +| 25 | 146 | 730 | 158,551 | Bash | cd …/projects/…; for f in $(ls -t *.jsonl) … (confirm session file) | +| 26 | 150 | 818 | 160,594 | Agent | Boot token forensics for this session (THIS task — read-only analyst, not a workflow dispatch) | +| 27 | 158 | 863 | 167,420 | Bash | cd …/docs/dev/.spacedock-state; state checkout recency … (parallel continuation; caught the 8p concurrent-write flip) | diff --git a/docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md b/docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md new file mode 100644 index 00000000..a03eb92e --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md @@ -0,0 +1,262 @@ +# j9 Phases 2-3 — Lazy-TeamCreate + shallow-boot-then-greet + +**Milestone:** 0.20.3 (0203 FO efficiency) +**Stage:** ideation +**Backbone task:** j9 (one task, three phases). Phase 1 (the contract structural split) is shaped in `T1-ideation.md` (labeled T1 there). This document shapes **Phase 2 (lazy-TeamCreate)** and **Phase 3 (shallow-boot-then-greet)**. +**Depends on:** Phase 1 makes the dispatch/merge content lazily loadable; Phase 3's greet-first sequencing rests on that split. Phase 2 (lazy-TeamCreate) needs no split. 
+ +--- + +## Problem + +Boot forensics on a live FO session (`/tmp/boot-analysis-spacedock-v1.md`) measured **~160k peak context and ~13.6 min of wall-clock — ~8.5 min and ~126k of it before the first captain-facing question — with no team created and no worker dispatched.** A 100% pre-dispatch session paid full deep-boot cost. Two structural levers dominate that picture and are this document's targets: + +1. **The 89k team-mode prefix re-cache.** The milestone README ranks lazy-TeamCreate as the single biggest lever at **~89k cache-creation removed** — far larger than the contract reads (~16k), the status-table render (~8.7k), or the mod reads (~6.5k). Creating a Claude team re-caches the whole conversation prefix under the new team context. The measured session never created a team (it never dispatched), so on a no-dispatch boot the entire 89k is avoidable — *if* the contract stops telling the FO to create a team at startup. Today the runtime adapter's Team Creation section reads "At startup (after reading the README, before dispatch)," which couples the team into the boot path. + +2. **Everything read before the greet that the greet does not need.** The forensics timeline shows the FO, before answering the captain, reading both FO references (~16.2k), the full `docs/dev/README.md` (~7.9k), both mod files (~6.5k), and rendering the human status table (~4.3k) — then thinking at 100k+ context, where the two slowest turns (128.6s, 100.1s) fired. The greet itself runs off `status --boot --json`, already executed at t=27s (forensics row 8) — after the reference and README reads (t=8–26s) but long before the mod-file reads, the status-table renders, and the orphan-inspection reads. The cost is structural: the loader and Startup procedure front-load all of it, and generation latency scales with loaded context, so the wall-clock is dominated by thinking at a context the greet never required. 
+ +The goal (milestone success criterion): an FO reaches interactive readiness — greet + state summary + *able to present a gate* — in seconds at **<~60k** context, deferring everything not needed for an accurate greet to the moment it is needed. + +--- + +## Spike (riskiest-first) — can TeamCreate and the team-dependent startup steps all defer past the greet without breaking a correctness guarantee? + +**The riskiest unknown:** the shallow-boot bet is that the FO can greet *before* creating a team, *before* reading the mods, and *before* loading the dispatch/merge modules — yet still honor every correctness guarantee those steps carry (a merged PR advances, an orphan surfaces, a mod-block resumes, a superseded agent is cleaned, a split-root state tree is fresh). If any of those guarantees genuinely must fire before the greet, the shallow-boot sequence is wrong. This spike enumerates every boot step in execution order, classifies each as must-run-before-greet vs defer-to-first-action, and hunts for a correctness break. + +**Honesty on spike depth:** this is STATIC analysis — step enumeration plus correctness tracing over the SKILL loader, the shared-core Startup procedure, the runtime Team Creation / standing-teammate / Event-Loop sections, and the two mod files. It establishes that the deferral *can* be clean. It does NOT prove the FO actually greets correctly with the team and modules unloaded — that is the live `shallow-boot` drive in `internal/ensigncycle` (AC-1), at implementation/validation. The spike says the sequence is sound on paper; the live scenario proves it in flight. + +### Spike step 1 — every boot/startup step in execution order + +Traced from the SKILL loader through the first event-loop pass. 
Sources: `SKILL.md` (lines 7-27), `first-officer-shared-core.md` (`## Startup` steps 1-7), `claude-first-officer-runtime.md` (`## Team Creation`, `### Standing teammate discovery pass`, `### Standing teammate lazy-spawn`, `## Event Loop` step 0), the startup mod hooks in `docs/dev/_mods/comm-officer.md` and `docs/dev/_mods/pr-merge.md`. + +| # | Step | Source | What it does at boot | +|---|------|--------|----------------------| +| L1 | Skill load: inline shared core (`@references/first-officer-shared-core.md`), read runtime adapter | `SKILL.md` 18, 23-25 | Loads the contract. Phase-1 split slims this to boot-resident core only. | +| S0 | Single-entity-mode check | `SKILL.md` 7-14 | Non-interactive + named entity → bounded mode (out of scope for shallow-boot, which is the interactive greet path). | +| S1 | Contract-version gate (`spacedock --version`, parse `contract `, range check) | shared-core Startup 1 | Aborts on missing binary / out-of-range contract. | +| S2 | `git rev-parse --show-toplevel` | shared-core Startup 2 | Project root. | +| S3 | Workflow discovery (`status --discover` or explicit path) | shared-core Startup 3 | Resolves `{workflow_dir}`. | +| S4 | Read `{workflow_dir}/README.md` | shared-core Startup 4 | Mission, entity labels, stage ordering/defaults, stage properties (initial/terminal/gate/worktree/feedback-to/agent). | +| S5 | `status --boot` (FO consumes `--boot --json`) | shared-core Startup 5 | MODS, ID_STYLE, NEXT_ID, ORPHANS, PR_STATE, DISPATCHABLE, STATE_BACKEND, team_state, sandbox — one call. | +| S6 | Split-root state halt-gate | shared-core Startup 6 | If `state_backend==split-root && entity_dir_present==false`, HALT — state checkout not initialized (would render EMPTY + VALID, a silent failure). | +| S7 | Split-root pull-on-boot (`git -C pull --rebase`) | shared-core Startup 7 | Integrate peers' state once at boot; rebase-conflict → HALT. 
| +| T1 | Team Creation (`Skill(using-claude-team)` → TeamCreate) | runtime Team Creation | **The ~89k re-cache.** Currently framed "at startup … before dispatch." | +| T2 | Standing-teammate discovery pass (`dispatch list-standing`) | runtime discovery pass | Records standing-teammate mod paths. "No spawn calls at boot." | +| M1 | Startup mod hooks: pr-merge `## Hook: startup` | `pr-merge.md` 13-26 | Scan entities with non-empty `pr` + non-terminal; `gh pr view`; advance MERGED, report CLOSED, no-op OPEN. | +| M2 | Startup mod hooks: comm-officer `## Hook: startup` | `comm-officer.md` 12-27 | If team lacks `comm-officer`, spawn it (fire-and-forget). Needs a team. | +| E0 | Event Loop step 0: reconcile sweep (`dispatch reconcile --team-name {team_name}`) | runtime Event Loop 0 | "(a) at boot, AFTER the split-root pull, BEFORE the first dispatch." Roster-derived A/B/C drift + git-only D/E. | +| E1+ | Event Loop steps 1-4: PR-pending check, mod-block check, `--next` dispatch, idle | runtime Event Loop 1-4 | The normal dispatch loop. | + +**Note on the greet:** there is no explicit "greet" step in the contract today — the FO greets when it has enough state to report. The forensics show `--boot --json` (S5) already executed at t=27s, but the FO did not greet until t=511s because it ran S4 (whole README), M1, M2, and the orphan reconciliation reads first, all at rising context. The shallow-boot change is to **insert the greet immediately after the accuracy-critical boot steps and stop for input**, pushing T1, T2, M1's mod-file read and `gh pr view` advancement, M2, E0, and the deferred contract modules past it. + +### Spike step 2 — classify each step: must-run-before-greet vs defer + +The discriminator is: *does the greet's accuracy or a correctness guarantee depend on this step having run before the FO speaks to the captain?* An accurate greet = correct workflow state summary + the ability to present a gate. A guarantee that the deferred timing still catches on first-action is safe to defer. 
+ +| Step | Verdict | Reason | +|------|---------|--------| +| L1 (boot-resident core load) | **before-greet** | The greet/gate/status-viewer instructions live here. Phase-1-slimmed. | +| S1 (contract gate) | **before-greet** | A version mismatch must abort before any state read; greeting against a wrong-contract binary is unsafe. | +| S2 (git root) | **before-greet** | Every subsequent path resolves from it. | +| S3 (workflow discovery) | **before-greet** | The greet names the workflow; can't greet without `{workflow_dir}`. | +| S4 (README read) | **before-greet for the greet-relevant fields; the full ~7.9k read DEFERS** | The greet needs entity-label, stage names/ordering, gate flags — a small slice. The full body (proof-policy prose, add-a-scenario procedure, CI setup, PR-body template, task template) is dispatch/validation material the greet never uses. See spike step 5. | +| S5 (`status --boot --json`) | **before-greet** | This IS the greet's data source. ORPHANS, PR_STATE, DISPATCHABLE, STATE_BACKEND, team_state all feed the state summary. | +| S6 (split-root halt-gate) | **before-greet** | A silent-empty state checkout would make the greet REPORT A FALSE "no work" state. The halt is an accuracy guarantee for the greet itself — it must fire before the FO summarizes state. | +| S7 (split-root pull-on-boot) | **before-greet** | State freshness: the greet must report peers' committed state, not a stale local tree. A rebase-conflict halt also pre-empts a dispatch against an unmerged tree. The greet's accuracy depends on this. | +| T1 (TeamCreate) | **defer-to-first-dispatch** | The ~89k re-cache. No team is needed to greet, report state, or present a gate (gates render as captain-facing text, not team messages). Needed only when the FO dispatches a worker. | +| T2 (standing-teammate discovery) | **defer-to-first-dispatch** | Records mod paths for later spawn; "no spawn at boot." Not greet-relevant. Travels with the dispatch module. 
| +| M1 (pr-merge startup hook) | **before-greet — but as a STATE READ, not via the mod file** | This carries a real guarantee: a PR merged while the FO was away must be advanced, and the greet should report it. BUT `status --boot --json` already carries `pr_state` (PR-pending entities + current merge state). The accuracy obligation is "the greet reports merged PRs," satisfied by reading `pr_state` from the boot JSON — which is before-greet anyway (S5). Reading the **pr-merge mod FILE** (~3.3k) and running its `gh pr view` advancement can ride the first event-loop pass. See the correctness hunt (step 3, break #1). | +| M2 (comm-officer startup hook = spawn) | **defer-to-first-dispatch** | Spawning the prose-polisher needs a team (T1). comm-officer polishes deliberate drafts (PR bodies, gate summaries) — none of which exist at the greet. The greet is a live captain reply, explicitly OUT of comm-officer's scope (`comm-officer.md` 41-42). Reading the mod file and spawning both defer to first dispatch alongside T1. | +| E0 (reconcile sweep) | **defer-to-first-dispatch** | Phase-1 finding C3: reconcile fires "before the first dispatch," NOT before-greet, and needs a `team_name` (A/B/C are roster-derived). It travels with the dispatch module that loads at first dispatch. See the correctness hunt (step 3, breaks #2/#3/#4). | +| E1+ (dispatch loop) | **defer-to-first-action** | The loop runs when there is work to dispatch; the shallow boot greets and stops for input first. | + +### Spike step 3 — correctness hunt: does deferring past the greet drop any guarantee? + +For each guarantee carried by a deferred step, the question is: does the deferred timing (first-action / first-dispatch) still catch it, or MUST it run pre-greet? + +**Break #1 — a merged PR not advanced (pr-merge startup hook M1).** The guarantee: a PR that merged while the FO was offline must advance its entity to terminal and archive it. If the FO greets without running M1, does the merged PR get lost? 
+- **Caught on first-action, with one accuracy caveat.** The pr-merge hook's advancement logic is duplicated in the Event Loop: step 1 ("Check PR-pending entities … advance merged PRs") runs on the first loop pass, AND pr-merge declares an `## Hook: idle` that re-checks (`pr-merge.md` 28-30, "defense in depth"). So the *advancement* is caught the moment the FO acts. The caveat is the *greet's accuracy*: if the FO greets reporting "PR #347 pending" when it actually merged, the greet is stale. **Resolution:** the greet reads `pr_state` from `status --boot --json` (S5, before-greet), which reports each PR-pending entity's current merge state. The boot probe itself queries merge state, so the greet can say "PR #347 (now MERGED — will advance)" accurately WITHOUT reading the mod file or running the mod's `gh pr view`. The mod-file read and the advancement action defer to first event-loop pass. **No guarantee dropped; the greet stays accurate off the boot JSON.** (Verify during implementation that `--boot --json`'s `pr_state.entries[].state` reflects live merge state, not just the stored `pr:` field — the boot probe parity test `internal/status/boot_probe_parity_test.go` is the place this is pinned.) +- **Genuine constraint:** this only holds if `pr_state` in the boot JSON carries live merge state. If it carries only the stored `pr:` value, the greet cannot report a freshly-merged PR accurately and M1's `gh pr view` would need to run pre-greet. This is the one item to confirm at implementation; the spike flags it as the single accuracy dependency to pin. + +**Break #2 — an orphan not surfaced (reconcile sweep E0).** The guarantee: a lingering/superseded agent, an un-advanced PR, a stale branch, or stale local main is detected and acted on. 
+- **Caught: ORPHANS already surface at the greet via `status --boot --json`.** The boot JSON's `orphans` array (worktree fields cross-referenced against filesystem + git state) is computed by the binary, needs no team, and is before-greet (S5). The greet reports anomalies from it (Startup step 5: "Report anomalies; do not auto-redispatch"). The reconcile sweep's roster-derived classes (A lingering, B superseded, C un-advanced PR) are a SUPERSET that needs a `team_name`, and the contract already says it fires "before the first dispatch," not before-greet. The git-only classes (D stale branch, E stale local main) are session-independent but are pre-dispatch hygiene, not greet-accuracy. **No guarantee dropped: the greet surfaces filesystem/git orphans off the boot JSON; the roster-derived reconcile rides the first-dispatch path where its `team_name` exists.** This is exactly Phase-1's C3 finding ("the reconcile sweep travels with the dispatch module — fires before-first-dispatch, not before-greet"). + +**Break #3 — a mod-block not resumed.** The guarantee: an entity left `mod-block`-pending across a session boundary resumes its pending merge action. +- **Caught on first-action.** Event Loop step 2 ("Check mod-blocked entities … resume its pending action") runs on the first loop pass; the runtime Mod-Block-at-Terminal section ("On session resume, scan entities with non-empty `mod-block` and resume") is merge-module content. A mod-blocked entity is by definition NOT dispatchable (the loop refuses new work for it), so resumption belongs in the loop, not the greet. The greet reports the pending state from the boot JSON's `mods`/`pr_state` view. **No guarantee dropped: resumption is a first-action obligation, and the merge-module read defers to terminalization where it's needed.** + +**Break #4 — a superseded agent not cleaned (supersede-shutdown / terminal teardown).** The guarantee: a stale cohort from a prior dispatch is shut down. 
+- **Vacuous at a shallow boot.** A freshly-booted FO that has not created a team has no live roster to clean. Supersede-shutdown fires "on fresh dispatch from a -cycleN increment" (a dispatch action) and terminal teardown fires at the terminal boundary (a merge action). Reconcile Class A/B (the resume-time backstop for a missed teardown/supersede) needs a team and rides first-dispatch (break #2). **No guarantee dropped: there is nothing to clean before the first dispatch creates a team.** + +**Break #5 — split-root state freshness / silent-empty (S6, S7).** These are the only steps the hunt says **MUST run pre-greet** — and they already do (classified before-greet in step 2). S6's halt prevents the greet from reporting a false "no work" state off an uninitialized checkout; S7's pull prevents the greet from reporting a stale tree. Neither needs a team. They stay in the before-greet set. **This is the step that genuinely cannot defer** — but it's a shared-core boot-resident step, not a deferred one, so it is no obstacle to the shallow-boot design; it is part of it. + +**Summary of the hunt:** no guarantee carried by a deferred step (TeamCreate, reconcile, standing-teammate spawn, the mod-file reads, the dispatch/merge modules) is dropped by deferring past the greet. Every one is either (a) re-asserted as a state READ in `status --boot --json` so the greet stays accurate (merged-PR state, orphans, mod-block/pending), or (b) an ACTION that the first event-loop pass / first dispatch catches before it matters (PR advancement, reconcile A/B/C, mod-block resume, supersede/teardown). The single accuracy dependency to pin at implementation is that `pr_state` in the boot JSON reflects live merge state (break #1). The only genuinely-cannot-defer steps are the split-root halt-gate and pull-on-boot (S6/S7) — and those are already before-greet shared-core steps. 
+ +### Spike step 4 — lazy-TeamCreate mechanics + +**Where the contract currently mandates team creation at startup:** + +`claude-first-officer-runtime.md` `## Team Creation`, line 7 (verbatim): + +> "At startup (after reading the README, before dispatch), invoke the generic Claude-team-harness discipline: `Skill(skill="spacedock:using-claude-team")`" + +and line 11 (the truthful trigger already present in the next clause): + +> "Invoke it before the first team-mode tool call in the session." + +Phase-1's C2 tweak (from `T1-ideation.md`) already identifies this: retire "at startup" in favor of the first-dispatch trigger. The two clauses contradict — "at startup" vs "before the first team-mode tool call" — and the forensics confirm the truthful one (the measured session never created a team because it never dispatched). + +**Minimal wording change to make it genuinely first-dispatch-triggered:** + +- Replace line 7's "At startup (after reading the README, before dispatch), invoke …" with: **"Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke the generic Claude-team-harness discipline:"**. Drop the "at startup" clause entirely; keep line 11's "before the first team-mode tool call" as the now-consistent trigger. +- **Companion timing changes (already aligned by Phase 1, confirmed here):** + - The **standing-teammate discovery pass** (line 19: "After team creation succeeds … and BEFORE entering the normal dispatch event loop") moves with team creation to first-dispatch. It already says "No spawn calls at boot"; the discovery `list-standing` call (cheap) can either ride first-dispatch with the rest of the dispatch module or stay a one-line boot probe — recommend riding first-dispatch so the boot read drops the whole section. + - The **reconcile sweep** (Event Loop step 0): already "(a) at boot, AFTER the split-root pull --rebase and BEFORE the first dispatch." 
Its "at boot" timing is the before-FIRST-DISPATCH boot moment, not before-greet. With shallow-boot the FO greets before any dispatch, so reconcile fires when the first dispatch arrives (or at the first idle/explicit-action pass), where its `team_name` exists. **No wording change needed beyond making clear "boot reconcile" means before-first-dispatch, which the contract already says — but its *placement* in the deferred dispatch module (Phase 1) is what realizes the deferral.** + - The **comm-officer / standing-teammate spawn** (lazy-spawn, line 28: "Before the first `Agent()` call that uses a `team_name`, spawn all declared standing teammates") is ALREADY first-dispatch-triggered in wording. The only change is that the comm-officer mod FILE read (~3.3k) defers with it instead of being read at boot. + +The lazy-TeamCreate change is therefore **one clause** in the runtime adapter (line 7), plus the Phase-1 placement of the team/dispatch sections behind the first-dispatch load point so the FO never reads them — or creates a team — at boot. + +### Spike step 5 — README-read question (Startup step 4) + +**Is the full ~7.9k README read greet-blocking?** No — only a small slice is. + +`docs/dev/README.md` is **31,456 chars (~7.9k tokens)**. Its frontmatter (lines 1-27) carries everything the greet needs: `entity-label`/`entity-label-plural` (the FO speaks the workflow's declared noun, per Working Principles), `id-style`, `state`, and the full `stages` block — stage names, ordering, and per-stage `initial`/`terminal`/`gate`/`worktree`/`fresh`/`feedback-to` flags. **That frontmatter is ~700 chars (~175 tokens) of the 7.9k.** The remaining ~7.7k is prose the greet never touches: the field-reference table, the long per-stage Good/Bad/proof-policy narratives (the ~6k anti-prose-grep and detached-audit prose), Workflow State, Runtime Live CI / shared-scenario add procedure, the PR-body template, the task template, Testing Resources, Commit Discipline. 
All of that is dispatch-time, validation-time, or merge-time material. + +**Does `status --boot --json` already carry the greet-relevant slice?** Partly, and not the parts that matter. Confirmed by reading `internal/status/json_commands.go` (`bootJSON`): the boot envelope carries `command, mods, id_style, next_id, [min_prefix], orphans, pr_state, dispatchable, team_state, state_backend, definition_dir, entity_dir, entity_dir_present, sandbox`. The `dispatchable` array carries each ready entity's `current`/`next` stage NAMES, but the boot JSON does **NOT** carry: the entity-label, the full stage ordering/taxonomy, or the per-stage gate/terminal flags. Those are parsed from the README frontmatter by `internal/status/stages.go` (`mappingValue(doc.Content[0], "stages")`), never emitted by any status `--json` command (confirmed: no `"stages"`/`"labels"`/`"gates"` key in `json_commands.go`). So the greet's need for the entity-label and the gate/stage taxonomy is **not** met by the current boot JSON — it is met only by reading the README frontmatter. + +**Recommendation (two viable shapes; recommend the first):** + +1. **Read the README frontmatter only at Startup step 4, defer the body.** The FO already parses YAML frontmatter elsewhere; reading just the `---`-delimited head (lines 1-27, ~700 chars) gives the greet the entity-label, stage names/ordering, and gate/terminal flags. The body (the per-stage prose, proof policy, templates, CI docs) defers — it is read when its phase begins: stage Good/Bad and proof policy at dispatch/gate adjudication, the PR-body template at merge, the add-a-scenario procedure at validation. This is a clean, behavior-preserving slim: the same `## stage` subsections are copied verbatim into dispatch messages (Dispatch step 8, "the full stage definition") at first dispatch, so the body is genuinely not needed before then. 
**~7.7k cut from every boot.** Implementation note: Startup step 4's wording changes from "Read `{workflow_dir}/README.md` for mission, entity labels, stage ordering …" to "Read the README frontmatter for entity labels, stage ordering and per-stage flags; defer the body (per-stage prose, proof policy, templates) to the phase that needs it." + +2. **Extend `status --boot --json` to carry the greet slice (heavier, defers to a later task).** Add a `labels` + `stages` projection to the boot JSON so the greet needs zero README read. This removes the README read from boot entirely but is a binary change with its own test surface (boot JSON schema, golden fixtures, the FO's key-order parse) — larger than Phase 3 warrants. **Recommend deferring this to p2/vc or a follow-on; Phase 3 takes shape 1 (frontmatter-only read).** + +**Verdict on the README read:** the full ~7.9k read is NOT greet-blocking; only the ~175-token frontmatter is. Phase 3 slims Startup step 4 to a frontmatter read and defers the body. The boot JSON cannot today substitute for the frontmatter (it omits labels + stage taxonomy), so the frontmatter read stays in the before-greet set; only the body defers. + +### Spike step 6 — VERDICT and the greet-blocking step set + +**Verdict: VIABLE with the adjustments named below.** The shallow-boot + lazy-TeamCreate sequence is structurally sound. No correctness guarantee is dropped by deferring TeamCreate, the reconcile sweep, standing-teammate spawn, the mod-file reads, and the dispatch/merge modules past the greet (spike step 3). Every guarantee is either re-asserted as a state READ in `status --boot --json` (so the greet stays accurate) or caught as an ACTION on the first event-loop pass / first dispatch. The lazy-TeamCreate change is one clause (spike step 4). The README read slims to its frontmatter (spike step 5). 
+ +**Adjustments required (none is a redesign):** +- **A1 — lazy-TeamCreate wording:** runtime adapter line 7, "at startup … before dispatch" → "before the first team-mode dispatch." (Phase-1 C2; restated as Phase 2's core change.) +- **A2 — README frontmatter-only read:** Startup step 4 reads the frontmatter, defers the body. +- **A3 — defer the mod-file reads:** pr-merge and comm-officer mod files are read when their hook fires (pr-merge advancement on first event-loop pass; comm-officer spawn at first dispatch), not at boot. The greet reports PR/orphan/mod state off the boot JSON. +- **A4 — insert an explicit greet-and-stop after the before-greet set.** The contract gains a "greet and stop for input" step after S7, ahead of T1/T2/M1-action/M2/E0. +- **A5 — pin the boot-JSON `pr_state` live-merge-state dependency** (spike step 3, break #1) — confirm at implementation that the greet can report a freshly-merged PR off the boot JSON without running the mod. + +**The greet-blocking step set (the new shallow-boot sequence) — the deliverable:** + +``` +contract-gate (S1) + → git root (S2) + → workflow discovery (S3) + → README FRONTMATTER read (S4-slim: entity-label, stage names/ordering, gate/terminal flags) + → status --boot --json (S5: state summary source — orphans, pr_state w/ live merge state, dispatchable, team_state, state_backend) + → split-root halt-gate (S6: prevents a false-empty greet) + → split-root pull-on-boot (S7: state freshness; rebase-conflict HALT) + → GREET the captain (state summary off the boot JSON; able to present a gate) and STOP for input +``` + +**Everything deferred past the greet:** TeamCreate (T1, the ~89k re-cache), standing-teammate discovery + spawn (T2 + lazy-spawn), the comm-officer mod read+spawn (M2), the pr-merge mod read + `gh pr view` advancement action (M1-action), the reconcile sweep (E0), the dispatch/merge contract modules (Phase 1), the README body, and the human status-table render (rendered to the captain only on 
explicit request, once). + +**The one step that genuinely cannot defer:** the split-root halt-gate + pull-on-boot (S6/S7) — but it is already a before-greet shared-core step, so it is part of the shallow-boot sequence, not an obstacle to it. + +**Honesty on depth:** this VERDICT rests on static step-enumeration and correctness tracing. The live `shallow-boot` drive in `internal/ensigncycle` — observing the FO greet correctly with NO team created and the dispatch/merge modules unloaded — is the implementation/validation proof (AC-1), not this spike's. The spike establishes the sequence *can* be correct; the live scenario establishes that it *is*. + +--- + +## Proposed approach + +### The new shallow-boot sequence + +Reshape the Startup procedure and the runtime Team Creation trigger so the FO reaches the greet through exactly the before-greet step set (spike step 6), then stops for input. Concretely: + +1. **Lazy-TeamCreate (Phase 2).** Change the runtime adapter's Team Creation trigger from "at startup … before dispatch" to "before the first team-mode dispatch" (A1). The FO creates no team at boot; the ~89k re-cache happens only when the first worker is dispatched. The standing-teammate discovery/spawn (already "no spawn at boot" / "before the first `Agent()` with a `team_name`") and the reconcile sweep (already "before the first dispatch") travel with the dispatch module so the FO never reads them — or creates a team — at boot. + +2. **Greet off `status --boot --json` (Phase 3).** After S1-S7 (the before-greet set), the FO greets the captain with a state summary built from the boot JSON (orphans, pr_state with live merge state, dispatchable, team_state, state_backend) and the README frontmatter (entity-label, stage taxonomy, gate flags), then stops for input (A4). It can present a gate from this state without a team (gates are captain-facing text). + +3. 
**README frontmatter-only read (Phase 3).** Startup step 4 reads the README frontmatter for the greet-relevant fields and defers the ~7.7k body to the phase that needs it (A2). + +4. **Defer mod-file reads (Phase 3).** The comm-officer and pr-merge mod FILES are read when their hooks fire — comm-officer at first dispatch (spawn needs a team), pr-merge advancement on the first event-loop pass (A3). The greet reports PR/mod/orphan state off the boot JSON, never the mod files. + +5. **Defer the human status-table render (Phase 3).** The FO never renders the human-formatted table for its own reasoning (it has the boot JSON); it renders it to the captain at most once, on explicit request, per the Status Viewer's existing captain-facing-display rules. + +### Greet-blocking classification table + +| Step | Classification | Reason | +|------|----------------|--------| +| contract-gate (S1) | **before-greet** | A version mismatch must abort before any state read. | +| git root (S2) | **before-greet** | All paths resolve from it. | +| workflow discovery (S3) | **before-greet** | The greet names the workflow. | +| README **frontmatter** (S4-slim) | **before-greet** | Entity-label, stage names/ordering, gate/terminal flags — the greet's vocabulary and the gate taxonomy. | +| README **body** | defer | Per-stage prose, proof policy, templates, CI docs — dispatch/validation/merge material. | +| `status --boot --json` (S5) | **before-greet** | The state-summary data source (orphans, pr_state, dispatchable, team_state, state_backend). | +| split-root halt-gate (S6) | **before-greet** | Prevents a false-empty greet off an uninitialized checkout. | +| split-root pull-on-boot (S7) | **before-greet** | State freshness; rebase-conflict HALT pre-empts a stale-tree greet/dispatch. | +| **GREET + stop for input** | **the boundary** | State summary + able to present a gate. | +| TeamCreate (T1) | defer (first dispatch) | The ~89k re-cache. No team needed to greet/report/present-a-gate. 
| +| standing-teammate discovery + spawn (T2) | defer (first dispatch) | Spawn needs a team; not greet-relevant. | +| comm-officer mod read + spawn (M2) | defer (first dispatch) | Polishes deliberate drafts, not the live greet; spawn needs a team. | +| pr-merge mod read + `gh pr view` advance (M1-action) | defer (first event-loop pass) | Advancement caught by Event-Loop step 1 + the pr-merge idle hook; the greet reports merge state off the boot JSON. | +| reconcile sweep (E0) | defer (first dispatch) | Needs a `team_name`; fires before-first-dispatch, not before-greet (Phase-1 C3). | +| dispatch/merge contract modules | defer (Phase 1 load points) | First dispatch / terminalization. | +| human status-table render | defer | Captain-facing, on explicit request, once. | + +### Lazy-TeamCreate mechanics (restated for implementation) + +Single clause: runtime adapter `## Team Creation` line 7. Current: "At startup (after reading the README, before dispatch), invoke …". New: "Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke …". Drop "at startup"; the existing line-11 clause ("before the first team-mode tool call in the session") becomes the consistent trigger. Companion: discovery/spawn and reconcile travel with the dispatch module (Phase 1 placement), so the boot read drops them and the FO never creates a team at boot. No binary change. + +### README-read decision + +Take spike-step-5 shape 1: Startup step 4 reads the README **frontmatter only** (entity-label, stage taxonomy, gate/terminal flags); defer the body to the phase that consumes it. The boot JSON cannot substitute (it omits labels + stage taxonomy — confirmed in `json_commands.go`), so the frontmatter read stays before-greet; only the body defers. Extending the boot JSON to carry labels+stages (shape 2) is a heavier binary change deferred to a follow-on, not Phase 3. 
+ +--- + +## Out of scope + +- **Phase 1 — the contract structural split** (extract dispatch/merge modules behind lazy load points, slim the boot-resident core). Shaped in `T1-ideation.md`. Phase 3's deferred-module unload rests on it, but the split itself is Phase 1's deliverable, not this body's. +- **p2 / vc** — `spacedock pr complete` + `reconcile --act` (the binary-simplification line). Parked to 0.20.4. +- **Extending `status --boot --json` to carry labels + stage taxonomy** (README-read shape 2). A heavier binary change; a follow-on, not Phase 3. +- **T3 — residual-prose audit + comm-officer polish.** Files along post-Phase-1; the cut-list does not exist until the split lands. +- **The Codex and Pi runtime adapters' lazy-TeamCreate.** Phase 2 changes the Claude adapter (the bulk file the forensics measured). The Codex/Pi team-creation timing is a follow-on if their boot cost warrants it. + +--- + +## Acceptance criteria + +Each AC names an end-state property of finished Phases 2-3, verified by something outside this task body that can fail. No AC is proven by a string/substring/regex match over an instruction file the model reads — that is banned by this workflow (a passing match only asserts the implementer's own text is present). The behavioral ACs are live drives; the structural AC tests a relationship between independent values. + +**AC-1 — A freshly-booted FO greets the captain and reports accurate workflow state with NO team created and the dispatch/merge modules NOT loaded.** +Verified by: a new live shared-runtime scenario `shallow-boot` in `internal/ensigncycle` (added to `sharedRuntimeScenarios()` with Claude + Codex runners per the README's 4-step add-a-scenario procedure). 
The runner launches the real host front door against a fixture with at least one dispatchable entity, and the host-neutral assertion over `(before, after, observed)` confirms: (a) the FO produced a greet with a state summary in its final message; (b) durable state shows NO team artifact created and NO worker dispatched (no entity advanced past its boot stage, no worktree created); (c) the FO stopped for input rather than auto-dispatching. The team-not-created observation is the lazy-TeamCreate proof; the greet-without-dispatch is the shallow-boot proof. Behavioral and live, not a contract grep. An offline negative in `shared_scenarios_negative_test.go` builds the broken end-state (a team artifact present / an entity dispatched at boot) and proves the assertion goes red. + +**AC-2 — The shallow boot greets at materially lower loaded context than the deep boot, with the team-mode re-cache absent.** +Verified by: the `shallow-boot` live scenario's captured host artifacts (stream jsonl / session transcript) show the FO reached its greet without a `TeamCreate` tool call in the pre-greet window, and the pre-greet context never incurred the team-mode prefix re-cache. The check parses the live transcript for the presence/absence and ordering of the `TeamCreate` (or team-tool) call relative to the greet message — a behavioral observation over the real run's tool-call sequence (the team call is absent before the greet), NOT a grep over the contract. The negative control: a deep-boot run (or a fixture forcing eager team creation) shows the `TeamCreate` call before the greet, proving the assertion distinguishes the two. 
+ +**AC-3 — The deferred startup steps still run correctly on first action: a merged PR advances and the reconcile sweep fires before the first dispatch.** +Verified by: the existing live scenarios `merge-hook-guardrail` and `rejection-flow`/`gate-guardrail` in `internal/ensigncycle` still pass after Phases 2-3 — they exercise the merge module loading at terminalization (mod-block enforcement, merged-PR advancement) and the dispatch module loading at first dispatch (reconcile sweep, reuse/feedback routing). A green run of all existing live scenarios is the proof that deferring the startup steps past the greet did not drop their first-action obligations. Run via `go test -tags live -run TestLiveClaudeSharedScenarios ./internal/ensigncycle`. (Regression, zero new authoring beyond AC-1's scenario.) + +**AC-4 — The greet reads only the README frontmatter, not the full body, for its greet-relevant fields.** +Verified by: a structural check in `internal/contractlint` (the allowed quarantine) confirming the boot-resident Startup step-4 instruction targets the README frontmatter slice (entity-label, stage taxonomy, gate flags) and that the README-body-only anchors (the proof-policy prose, the add-a-scenario procedure, the PR-body template, the task template) are reachable only from the deferred phase modules, not from the boot-resident core. The expected value (which README sections are greet-relevant vs phase-deferred) comes from the deferred-module manifest, an independent source the boot files can diverge from — so the check can fail if a future edit points the boot core at the README body. A control test plants a boot-resident reference into a body-only section and proves the guard goes red. (Structural, not a prose-grep; pairs with AC-1, which is the behavioral proof the greet is accurate off the slim read.) + +--- + +## Test plan + +- **AC-1 (`shallow-boot` live scenario):** the costly item. 
Following the README's 4-step add-a-scenario procedure — host-neutral entry in `sharedRuntimeScenarios()`, fixture + prompt in `shared_fixtures_test.go`, host-neutral assertion (greet present + no-team + no-dispatch + stopped-for-input) and offline negative in `shared_scenarios_negative_test.go`, runner entries in BOTH `claudeScenarioRunners()` and `codexScenarioRunners()`. Live, real host, durable-state + final-message assertion. Cost: one model-spend scenario added to the serial suite (~5-7 min Claude opus). **Spot-check first:** run the parity/definition guards (`TestSharedScenarioRunnerCoverage|TestSharedRuntimeScenarioDefinitions`) at zero spend before paying for the live run. +- **AC-2 (no team-mode re-cache before greet):** rides AC-1's live run — adds a transcript assertion over the tool-call sequence (no `TeamCreate` before the greet message). Cost: zero additional model spend; it reads AC-1's captured artifacts. The negative control (a deep-boot / eager-team fixture showing the call before the greet) proves the assertion distinguishes the two. +- **AC-3 (regression):** zero new authoring — run the existing live scenarios (`gate-guardrail`, `rejection-flow`, `merge-hook-guardrail`, `feedback-3-cycle-escalation`) after Phases 2-3. Cost: the existing serial-suite wall-time, already budgeted in CI. Green proves the deferred startup steps still fire correctly on first action. +- **AC-4 (structural guard):** a Go test in `internal/contractlint`, no model spend, runs in the offline gate job (`go test ./...`). Extends the existing reference-closure pattern. Ships with a control test (planted violation goes red) so the guard is proven able to fail, not vacuous. +- **Implementation-time spike to pin (spike step 3, break #1):** before authoring AC-1, confirm `status --boot --json`'s `pr_state.entries[].state` carries live merge state (so the greet can report a freshly-merged PR without reading the mod). 
The cheap check is an `internal/status/boot_probe_parity_test.go`-adjacent fixture; pin it before paying for the live scenario. If it does NOT carry live merge state, the pr-merge `gh pr view` would need to stay before-greet for accuracy — flag and re-scope. +- **Fixture vs live:** AC-1, AC-2, AC-3 are live (the runtime integration — does the FO greet correctly with the team and modules deferred — is the claim). AC-4 is a structural fixture over the shipped surface. No AC leans on a prose-grep over the contract. + +--- + +## Spike result + +**Verdict: VIABLE with the named adjustments (A1-A5).** The shallow-boot + lazy-TeamCreate sequence is structurally sound. Evidence: + +- **The boot steps enumerate cleanly into before-greet vs defer** (spike steps 1-2). The before-greet set is exactly: contract-gate, git root, workflow discovery, README frontmatter, `status --boot --json`, split-root halt-gate, split-root pull-on-boot — then greet and stop. Everything else (TeamCreate, standing-teammate discovery/spawn, the mod reads, the reconcile sweep, the dispatch/merge modules, the README body, the human table) defers. +- **No correctness guarantee is dropped by deferring past the greet** (spike step 3). Merged-PR advancement, orphan surfacing, mod-block resume, and supersede/teardown are each either re-asserted as a state READ in `status --boot --json` (so the greet stays accurate) or caught as an ACTION on the first event-loop pass / first dispatch. The single accuracy dependency to pin is that the boot JSON's `pr_state` reflects live merge state (break #1). +- **The only genuinely-cannot-defer step is the split-root halt-gate + pull-on-boot** (break #5) — and it is already a before-greet shared-core step, so it is part of the shallow-boot sequence, not an obstacle. +- **Lazy-TeamCreate is one clause** (spike step 4): runtime adapter line 7, "at startup … before dispatch" → "before the first team-mode dispatch." 
The companion timings (discovery/spawn, reconcile) are already first-dispatch / before-first-dispatch in wording; Phase 1's placement realizes the deferral. No binary change. +- **The full README read is not greet-blocking** (spike step 5): only the ~175-token frontmatter is (entity-label + stage taxonomy + gate flags). The boot JSON omits labels and stage taxonomy (confirmed in `internal/status/json_commands.go` — the boot envelope carries no `"stages"`/`"labels"`/`"gates"` key), so the frontmatter read stays before-greet and the ~7.7k body defers. + +**Honesty on spike depth:** this is STATIC analysis — step enumeration plus correctness tracing over the SKILL loader, the shared-core Startup procedure, the runtime Team Creation / standing-teammate / Event-Loop sections, and the two mod files, with one binary-fact check of the boot JSON schema. It establishes the sequence *can* be correct. The live `shallow-boot` drive in `internal/ensigncycle` — observing the FO greet accurately with NO team created and the dispatch/merge modules unloaded — is the implementation/validation proof (AC-1), not this spike's. The spike establishes the sequence is sound; the live scenario establishes that it holds in flight. 
From 972c21979b4960ff4774bd5f5b4ccec83edbece2 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 10:06:30 -0700 Subject: [PATCH 02/18] =?UTF-8?q?roadmap(0203):=20fold=20captain=20contrac?= =?UTF-8?q?t=20audit=20=E2=80=94=20drop=20agents/first-officer.md=20ref=20?= =?UTF-8?q?+=20add=20ethos=20operating-principles?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/roadmap/0203-fo-efficiency/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/roadmap/0203-fo-efficiency/README.md b/docs/roadmap/0203-fo-efficiency/README.md index 172a366c..f255d74c 100644 --- a/docs/roadmap/0203-fo-efficiency/README.md +++ b/docs/roadmap/0203-fo-efficiency/README.md @@ -59,6 +59,20 @@ contract-gate → discovery → `status --boot --json` → **greet and stop for Boot-report habits (scope greps to headings; delegate bulk reconciliation reads to a subagent) tighten existing "Probe and Ideation Discipline" prose and ride along in Phase 1 / T3. +## Contract content fixes (captain audit, 2026-06-13) + +Two fixes fold into Phase 1's contract cleanup, beyond the structural split: + +1. **Drop the unnecessary `agents/first-officer.md` cross-reference** from `first-officer-shared-core.md` (line 3, "Keep aligned with…") — not load-bearing. +2. **Add a top Operating-principles (ethos) section** the shipped skill lacks today — its absence lets Codex drift from the `agents/first-officer.md` ethos. Combine the existing `## Working Principles` under it. Verbatim: + + > You are dispatcher and responsible for making sure the work is done by the crew. What awesome looks like for the crew: + > - Begin with the end, be clear about the value. + > - Do the hardest things first, de-risk when it is cheap. + > - Communicate and act concisely, choose the simplest approach, JFDI. 
+ +This is boot-resident **guidance content** (behavior-shaping, not a testable AC): proof is the existing live scenarios still passing + review — not a "drift reduced" metric. The principles also govern *how* the contract is simplified (lead with value, hardest-first, simplest/concise). + ## Out of scope (parked, not 0.20.3) - **p2 / vc** — `spacedock pr complete` + `reconcile --act`: the binary-simplification line (higher ROI, heavier lift) → 0.20.4. From 13afe3f71bf1c3f731bd7355c73970268721e32a Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 10:12:35 -0700 Subject: [PATCH 03/18] roadmap(0203): spec moved into the j9 entity; sprint dir is now a thin index (drop staging spike docs) Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/roadmap/0203-fo-efficiency/README.md | 4 +- .../roadmap/0203-fo-efficiency/T1-ideation.md | 184 ------------ .../j9-phases-2-3-ideation.md | 262 ------------------ 3 files changed, 2 insertions(+), 448 deletions(-) delete mode 100644 docs/roadmap/0203-fo-efficiency/T1-ideation.md delete mode 100644 docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md diff --git a/docs/roadmap/0203-fo-efficiency/README.md b/docs/roadmap/0203-fo-efficiency/README.md index f255d74c..ae6c03a3 100644 --- a/docs/roadmap/0203-fo-efficiency/README.md +++ b/docs/roadmap/0203-fo-efficiency/README.md @@ -1,7 +1,7 @@ # 0203 — FO Efficiency: shallow boot + lazy contract **Milestone:** 0.20.3 -**Status:** shaping (design approved 2026-06-13; T1 ideation in flight) +**Status:** ideation complete, at the ideation gate (2026-06-13). The full spec lives in the j9 entity (`docs/dev/.spacedock-state/lazy-teamcreate-shallow-boot/`); this doc is the sprint index. **Theme:** make the first officer cheap to boot and run. ## Why @@ -51,7 +51,7 @@ contract-gate → discovery → `status --boot --json` → **greet and stop for **`j9` is the backbone — one task, three phases.** Reshape the existing backlog entity; do not fragment it. 
The contract split runs first because it is the behavior-preserving enabler (and the contract-audit ask); the 89k lazy-TeamCreate is the headline lever. **j9 — Lazy-TeamCreate + shallow-boot-then-greet:** -- **Phase 1 — contract structural split** (enabler + the "audit and cleanup the fo contract" ask): extract boot-resident vs deferred into a lazy dispatch ref + a lazy merge ref; slim the boot-resident core + the skill loader. Behavior-preserving. *Proof: existing `gate-guardrail`/`rejection-flow`/`merge-hook-guardrail` live scenarios still pass + a `internal/contractlint` reference-closure guard.* Shaped in `T1-ideation.md` (labeled T1 there — it is j9 Phase 1; spike verdict VIABLE, ~70% boot-read cut). +- **Phase 1 — contract structural split** (enabler + the "audit and cleanup the fo contract" ask): extract boot-resident vs deferred into a lazy dispatch ref + a lazy merge ref; slim the boot-resident core + the skill loader. Behavior-preserving. *Proof: existing `gate-guardrail`/`rejection-flow`/`merge-hook-guardrail` live scenarios still pass + an `internal/contractlint` reference-closure guard.* Spike verdict VIABLE, ~70% boot-read cut (full spec in the j9 entity). - **Phase 2 — lazy-TeamCreate**: defer the `TeamCreate` call (the ~89k cache-creation) off the boot/greet path to first-dispatch-need. Needs no split. - **Phase 3 — shallow-boot-then-greet**: greet off `status --boot --json`; defer mod-reads, the human status-table render, and the (now-split) deferred contract modules. Folds C3 (mod-defer) + C4 (status-render discipline). 
*Proof: new live `shallow-boot` scenario.* diff --git a/docs/roadmap/0203-fo-efficiency/T1-ideation.md b/docs/roadmap/0203-fo-efficiency/T1-ideation.md deleted file mode 100644 index 64cd3688..00000000 --- a/docs/roadmap/0203-fo-efficiency/T1-ideation.md +++ /dev/null @@ -1,184 +0,0 @@ -# T1 — FO contract structural split + lazy-load - -**Milestone:** 0.20.3 (0203 FO efficiency) -**Stage:** ideation -**Blocks:** T2 (shallow-boot-then-greet); T3 files along after. - -Restructure the first-officer contract so a boot reads only what it needs to greet, report state, and present a gate. Extract the dispatch/team material and the merge material into already-lazy load points, leaving a slim boot-resident core. This is the structural split that T2's boot-flow reshape and T3's prose audit both depend on — neither cut-list exists until the split lands. - -## Problem - -Boot forensics on a live FO session (`/tmp/boot-analysis-spacedock-v1.md`) measured the cost of reaching an interactive greet **with no team created and no worker dispatched** — a 100% pre-dispatch session: - -- Peak context **160,594 tokens** (event 148); the session crossed 100k at event 97 (~363s) and never fell back. -- Wall-clock to the captain-facing gate question **~511s (~8.5 min)**; to the tail ~13.6 min. The wall-clock is dominated by generation latency that grows with loaded context — the two slowest think-turns (128.6s, 100.1s) both fired *above 97k context*. - -The single biggest avoidable structural waste in that picture is the two FO reference files, **read whole, back-to-back, at events 23 and 25 (t≈8–9s), immediately after the skill loaded, before any work began:** - -| Forensics rank | Tokens | File | -|---|---|---| -| #1 | ~9,310 (37,249 ch) | `first-officer-shared-core.md` — single biggest ingest | -| #3 | ~6,900 (27,602 ch) | `claude-first-officer-runtime.md` — read back-to-back with #1 | - -That is **~16,200 tokens** of contract read on every boot. 
The forensics call out the runtime adapter specifically: "gate `claude-first-officer-runtime.md` behind 'am I creating a team this turn' — which never happened this session, so ~6.9k was pure waste." The runtime adapter is ~70% team/dispatch material that a boot which never dispatches never uses, yet it loads first. - -The cost is structural, not a bug: the loader reads the entire contract up front. Generation latency scales with loaded context, so trimming the boot read trims the wall-clock directly. - -## Spike (riskiest-first) — is the two-tier split structurally sound? - -**The riskiest unknown:** does the boot-resident path have a hard dependency on deferred content that would break if that content is not loaded at boot? This is static mechanism analysis — dependency tracing over the contract sections plus confirming the lazy hooks already exist. It is the right depth for ideation. The *live* FO-boot drive that proves the FO still greets and presents a gate correctly with the modules unloaded is the implementation/validation proof (the `shallow-boot` scenario in AC-1), not this spike. - -### Spike step 1 — every top-level section mapped to a tier - -Token estimates are char/4 over the section body. The two files total **~9,014 + ~6,700 ≈ 15,700 tokens**, matching the forensics' ~16.2k. 
- -**`first-officer-shared-core.md` (17 sections, ~9,014 tok)** - -| Section | Lines | ~tok | Tier | -|---|--:|--:|---| -| Startup | 24 | 963 | **boot-resident** | -| Status Viewer | 39 | 655 | **boot-resident** | -| ID Styles | 10 | 256 | **boot-resident** | -| Single-Entity Mode | 12 | 188 | **boot-resident** | -| Working Directory | 4 | 44 | **boot-resident** | -| Dispatch | 24 | 746 | **dispatch-deferred** | -| Completion and Gates | 44 | 1340 | **split** — gate-decision spine boot-resident; reuse-conditions dispatch-deferred (see step 5) | -| Merge and Cleanup (+ Ship-Local, Worktree-removal) | 49 | 1785 | **merge-deferred** | -| State Management | 6 | 66 | **boot-resident** (rebase-conflict halt referenced by Startup) | -| Worktree Ownership (+ Split-Root Worktree Contract) | 30 | 750 | **dispatch-deferred** | -| FO Write Scope | 15 | 430 | **boot-resident** | -| Mod Hook Convention (+ Mod-Block Enforcement) | 21 | 390 | **merge-deferred** (mod-block is a merge-ceremony concept) | -| Standing Teammates | 9 | 402 | **dispatch-deferred** (folds behind `using-claude-team`) | -| Clarification and Communication | 6 | 115 | **boot-resident** | -| Working Principles | 13 | 524 | **boot-resident** | -| Probe and Ideation Discipline | 7 | 308 | **boot-resident** | -| Issue Filing | 4 | 19 | **boot-resident** | - -**`claude-first-officer-runtime.md` (11 sections, ~6,700 tok)** - -| Section | Lines | ~tok | Tier | -|---|--:|--:|---| -| Team Creation (+ standing-teammate discovery/lazy-spawn/declaration) | 44 | 1622 | **dispatch-deferred** → `using-claude-team` | -| Worker Resolution | 10 | 153 | **dispatch-deferred** | -| Dispatch Adapter (+ break-glass) | 58 | 1941 | **dispatch-deferred** | -| Degraded Mode (spacedock seams) | 7 | 159 | **dispatch-deferred** → `using-claude-team` | -| Context Budget and Dead Ensign Handling | 23 | 586 | **dispatch-deferred** | -| Captain Interaction | 16 | 468 | **split** — gate-guardrail + greet boot-resident; team-mode chat hint + 
single-entity gate-resolution dispatch-deferred | -| Feedback Rejection Flow (bare mode) | 6 | 86 | **dispatch-deferred** (the routing skill `feedback-rejection-flow` is already lazy) | -| Event Loop (incl. reconcile sweep step 0) | 24 | 988 | **dispatch-deferred** | -| Mod-Block Enforcement at Terminal Transitions | 23 | 503 | **merge-deferred** | -| Agent Back-off | 6 | 84 | **boot-resident** (cheap; captain-interaction adjacent) | -| Entity-Body Inspection | 4 | 82 | **boot-resident** (points at shared-core Probe discipline) | - -**Tier totals (approximate):** - -- **Boot-resident:** shared-core ~3,800 tok (Startup, Status Viewer, ID Styles, Single-Entity Mode, Working Directory, State Management, FO Write Scope, Clarification, Working Principles, Probe discipline, Issue Filing, plus the gate-decision spine of Completion-and-Gates) + runtime ~700 tok (Captain Interaction greet/guardrail, Agent Back-off, Entity-Body Inspection) ≈ **~4,500 tok.** -- **Dispatch-deferred:** ~6,000 tok (the bulk of the runtime adapter + shared-core Dispatch/Worktree-Ownership/Standing-Teammates/reuse-conditions). -- **Merge-deferred:** ~2,700 tok (Merge-and-Cleanup, Ship-Local, Mod-Hook/Mod-Block, runtime Mod-Block-at-Terminal). - -**Boot read drops from ~15,700 tok to ~4,500 tok — roughly an 11k-token cut on every boot, ~70% of the contract-read cost.** (Net win is slightly less than the gross because the deferred modules re-load when a real dispatch/merge happens — but on a no-dispatch boot, the session the forensics measured, the full ~11k is saved.) - -### Spike step 2 — loader claim confirmed - -`skills/first-officer/SKILL.md` instructs reading **both** reference files at startup: - -- Line 18: `@references/first-officer-shared-core.md` — the `@`-directive inlines the shared core into the skill body at load. -- Lines 23–25: "Load the runtime adapter for your platform: … read `references/claude-first-officer-runtime.md`" (Claude branch). 
-- Line 27: "Then begin the Startup procedure from the shared core." - -The live forensics timeline corroborates this exactly: row 1 = `Skill spacedock:first-officer` (event 13, t=4s), row 2 = Read `first-officer-shared-core.md` (event 23, t=8s), row 3 = Read `claude-first-officer-runtime.md` (event 25, t=9s). Both files are read within one second of each other, at the very top of the session. `agents/first-officer.md` adds nothing — it only delegates to the skill ("invoke the `spacedock:first-officer` skill now to load it. Then begin the Startup procedure"). The skill is the single loader. - -### Spike step 3 — coupling trace (boot-resident → deferred) - -The crux question: does any boot-resident step *depend on* team/dispatch/merge knowledge before the greet? Findings, with the resolution for each: - -**C1 — shared-core Startup (the boot procedure itself) has ZERO reference into deferred content.** Startup steps 1–7 are: contract-version gate, `git rev-parse` root, `status --discover`, README read, `status --boot`, split-root halt-gate, split-root pull-on-boot. None mentions team, reconcile, dispatch, or merge. Its only forward reference is "follow the rebase-conflict halt in **State Management**" — and State Management is itself boot-resident. **This is the clean island that makes the split viable.** The boot procedure stands alone. - -**C2 — the runtime adapter "Team Creation" says "At startup (after reading the README, before dispatch)".** This is the load-bearing phrase. Read literally, "at startup" couples team creation into boot. But its own next sentence reframes it: "Invoke it before the first team-mode tool call in the session." And it already delegates entirely to the lazy skill: `Skill(skill="spacedock:using-claude-team")`. **Resolution:** retire "at startup" in favor of the truthful trigger — team creation fires at *first dispatch*, which is when `using-claude-team` is meant to load. This is a one-clause wording change, not a structural break. 
It aligns the contract with what the forensics already shows happening (the measured session never created a team because it never dispatched). - -**C3 — the reconcile sweep (Event Loop step 0) runs "(a) at boot, AFTER the split-root pull --rebase and BEFORE the first dispatch".** This is the genuine boot-adjacent step inside otherwise-deferred content. It needs a `team_name` (the A/B/C drift classes are roster-derived). **It does NOT need to run before the greet** — it runs "before the first dispatch," and the shallow-boot flow (T2) greets before any dispatch. So reconcile rides into the dispatch-deferred Event Loop module that loads at first dispatch, alongside the `team_name` it requires. There is no boot-resident step that calls reconcile; the greet happens off `status --boot --json` alone (forensics row 8 confirms `--boot --json` is already run at t=27s, before all the heavy reads). **No stub needed in the boot-resident core.** - -**C4 — standing-teammate discovery pass ("after team creation … BEFORE entering the normal dispatch event loop").** This is wholly inside the team-creation flow; it lazy-spawns at first dispatch already ("No spawn calls at boot. Spawn is deferred to the first team-mode dispatch"). It travels with the dispatch module. No boot coupling. - -**C5 — Completion-and-Gates "decide reuse-or-fresh" references the reuse-conditions and the runtime context-budget probe.** The gate-decision spine (never self-approve, present-gate, AC cross-check, the gated-stage branch) is boot-resident — a shallow boot must be able to present a gate. But the reuse-conditions block and the budget probe are only reached *after a worker completes*, which cannot happen before a dispatch. **Resolution:** split the section — the gate-presentation/AC-cross-check spine stays boot-resident; the reuse-or-fresh machinery (reuse conditions 0–4, the model-mismatch diagnostic, SendMessage advancement, supersede-shutdown) moves to the dispatch module. 
`present-gate` and `feedback-rejection-flow` are already lazy skills the boot-resident spine invokes by name — that precedent is exactly the shape. - -**C6 — Mod-Block Enforcement (shared-core) and Mod-Block at Terminal (runtime) are referenced from Merge-and-Cleanup only.** Both are merge-ceremony concepts. They travel with the merge module. The boot-resident core needs to know merge hooks *exist* (the MODS section of `status --boot` reports them), but not the enforcement mechanics — those are read at terminalization. No boot coupling. - -**Summary:** the only boot-resident step that touches deferred concepts is C2's "at startup" wording, and that is a wording-truth fix, not a structural dependency. C3 (reconcile) is boot-adjacent but fires before-first-dispatch, not before-greet, so it lives cleanly in the dispatch module. No boot-resident step genuinely needs team/dispatch/merge knowledge before the greet. - -### Spike step 4 — is the fold target non-duplicative? - -`spacedock:using-claude-team` already carries the generic team lifecycle: Deferred Team Tools (the ToolSearch hop), Team Creation (TeamCreate-first sequencing, naming, bare-mode fallback), the TeamCreate recovery procedure, the failure-recovery ladder, Degraded Mode (triggers/effects/captain-report/shutdown-sweep), Awaiting Completion, and Terminal Team Teardown. The runtime adapter's "Team Creation" section **already invokes this skill** (`Skill(skill="spacedock:using-claude-team")`) and explicitly states "the generic blocks they reference (`## Degraded Mode`, `## Awaiting Completion`, `## Terminal Team Teardown`) live in that skill, not in this file." - -So the split is already partly done: the *generic* team lifecycle is in the lazy skill; the *spacedock-specific* adapter sections (Worker Resolution, Dispatch Adapter, standing-teammate discovery/lazy-spawn, the spacedock Degraded-Mode seams, Context Budget, Event Loop/reconcile) are what still load eagerly in the runtime adapter. 
**Folding the runtime adapter's team/dispatch sections behind the same lazy load point as `using-claude-team` is non-duplicative** — they are the spacedock specializations that the generic skill leaves to the consumer. The clean mechanism: a new lazily-loaded reference (e.g. `references/claude-fo-dispatch.md`) holding the spacedock dispatch/team sections, read at the same first-dispatch moment `using-claude-team` is invoked. No content overlaps with the generic skill; the generic skill keeps the lifecycle, the deferred reference keeps the spacedock adapter. - -### Spike step 5 — VERDICT - -**Viable with one wording tweak.** The clean two-tier split is structurally sound: - -- The shared-core Startup procedure is a self-contained boot island with no forward dependency into deferred content (C1) — this is what makes the cut clean. -- The greet runs off `status --boot --json` alone, already executed early in the live session (forensics row 8), so nothing team/dispatch/merge is needed before the greet. -- Every coupling resolves without a boot-resident stub: reconcile (C3) and standing-teammates (C4) fire before-first-dispatch (not before-greet) and travel with the dispatch module; mod-block (C6) travels with the merge module; the gate spine splits cleanly from reuse-or-fresh (C5) along the present-gate/feedback-rejection precedent. -- **The one required tweak:** retire the runtime adapter's "At startup … before dispatch" framing for team creation (C2) in favor of "at first dispatch," matching the lazy `using-claude-team` invocation that is already the team's load point. One clause, not a redesign. - -No boot-path step genuinely needs team/dispatch/merge knowledge before the greet. The split is not blocked. - -## Proposed approach - -The section→module assignment is the table in spike step 1. The mechanism: - -1. 
**Slim the boot-resident core.** `first-officer-shared-core.md` keeps its boot-resident sections (Startup, Status Viewer, ID Styles, Single-Entity Mode, Working Directory, State Management, FO Write Scope, Clarification, Working Principles, Probe discipline, Issue Filing) plus the gate-presentation spine extracted from Completion-and-Gates. The runtime adapter keeps Captain Interaction's greet/guardrail, Agent Back-off, and Entity-Body Inspection. - -2. **Extract the dispatch/team module.** Move the runtime adapter's Worker Resolution, Dispatch Adapter, Context Budget, Event Loop (incl. reconcile), Degraded-Mode seams, standing-teammate discovery/lazy-spawn/declaration, and the shared-core Dispatch / Worktree-Ownership / Standing-Teammates / reuse-conditions into a lazily-loaded reference read at first dispatch. The generic team lifecycle stays in `using-claude-team` (already lazy); this new reference holds only the spacedock adapter. The first-dispatch load point is the existing `Skill(skill="spacedock:using-claude-team")` invocation — extend it to also pull the spacedock dispatch reference. **C2 fix:** the team-creation trigger reads "at first dispatch," not "at startup." - -3. **Extract the merge module.** Move Merge-and-Cleanup, Ship-Local Ceremony, Worktree-removal safety, Mod-Hook/Mod-Block Enforcement (shared-core), and Mod-Block-at-Terminal (runtime) into a lazily-loaded reference read at terminalization. The boot-resident core reaches it the same way it reaches `present-gate` / `feedback-rejection-flow`: by naming the load point at the terminal boundary. - -4. **Loader change.** `SKILL.md` line 18's `@references/first-officer-shared-core.md` inlines only the slimmed core. The runtime-adapter read (lines 23–25) loads only the boot-resident runtime sections. The dispatch and merge references are NOT read at startup — they load at their phase boundary, extending the existing lazy pattern (`present-gate`, `feedback-rejection-flow`, `using-claude-team`). 
No new mechanism. - -**Coupling resolutions (from step 3):** C1 — no change needed (clean island). C2 — wording fix to "at first dispatch." C3/C4 — reconcile + standing-teammates travel with the dispatch module (before-first-dispatch, not before-greet). C5 — split Completion-and-Gates: gate spine boot-resident, reuse-or-fresh deferred. C6 — mod-block travels with the merge module. - -This task is the structural move only. It is a content reorganization that must be **behavior-preserving** — the same instructions, reachable at the same moments, just loaded lazily. The regression scenarios (AC-2) are the proof of no behavior change; they are the load-bearing safety net for "this is a move, not a rewrite." - -## Out of scope - -- **The j9 boot-flow change (T2).** Greet-first sequencing, deferred mod-reads, deferred status-table render — the *new* shallow-boot behavior — is T2, blocked-by this task. T1 makes the modules loadable lazily; T2 changes when the FO greets. The `shallow-boot` live scenario is authored against T1's structure but its greet-first behavior is T2's claim. -- **p2 / vc binary-simplification** (`spacedock pr complete`, `reconcile --act`) — parked to 0.20.4. -- **The residual-prose audit + comm-officer polish (T3)** — the cut-list of leftover prose to trim does not exist until this split lands; T3 files along after. -- **The Codex and Pi runtime adapters.** This task splits the Claude adapter (the bulk file the forensics measured). The shared-core split applies cross-host, but the per-host dispatch/merge reference extraction for `codex-first-officer-runtime.md` (~4.9K) and `pi-first-officer-runtime.md` (~5.9K) is a follow-on if their boot cost warrants it — they are an order smaller than the Claude adapter. - -## Acceptance criteria - -Each AC names an end-state property of the finished split, verified by something outside this task body that can fail. 
No AC is proven by a string/substring/regex match over an instruction file the model reads — that is banned by this workflow (a passing match only asserts the implementer's own text is present). - -**AC-1 — With the dispatch and merge modules NOT loaded, a freshly-booted FO greets the captain, reports workflow state, and presents a human gate without self-approving, mutating, or archiving the entity.** -Verified by: a new live shared-runtime scenario `shallow-boot` in `internal/ensigncycle` (added to `sharedRuntimeScenarios()` with Claude + Codex runners per the README's add-a-scenario procedure). The runner launches the real host front door against a fixture sitting at a human gate, and the host-neutral assertion over `(before, after, observed)` confirms the FO presented the gate (durable: entity still at the gate stage, not archived, no `verdict`/`completed` set; final message carries a gate review + decision prompt) — the gate-guardrail behavior surviving with the deferred modules unloaded. Behavioral and live, not a contract grep. An offline negative case in `shared_scenarios_negative_test.go` builds the broken end-state (entity self-approved/archived) and proves the assertion goes red. - -**AC-2 — The split is behavior-preserving: the deferred modules load and function correctly when a real dispatch or merge happens.** -Verified by: the existing live scenarios `gate-guardrail`, `rejection-flow`, and `merge-hook-guardrail` in `internal/ensigncycle` still pass after the split. `gate-guardrail`/`rejection-flow` exercise the dispatch module loading at first dispatch (reuse-conditions, feedback routing); `merge-hook-guardrail` exercises the merge module loading at terminalization (mod-block enforcement). A green run of all three is the proof that lazy-loading did not drop a reachable instruction — run via `go test -tags live -run TestLiveClaudeSharedScenarios ./internal/ensigncycle`. 
- -**AC-3 — The boot-resident core has no reference dependency on deferred-only content: every reference the boot-resident core makes resolves either within the boot-resident set or to a known lazy load point, never into the body of the dispatch/merge modules.** -Verified by: a new reference-closure structural guard in `internal/contractlint` (the allowed quarantine), extending `TestUserSkillReferenceClosureResolves`. The check parses the boot-resident files and the deferred-module manifest as real artifacts, builds the set of section anchors the deferred modules own, and fails if a boot-resident `@`/read reference resolves into a deferred-module section that is not one of the declared lazy load points (`using-claude-team`, `present-gate`, `feedback-rejection-flow`, the new dispatch/merge references). This tests a relationship between two independent values — the boot-resident reference set vs. the deferred-module section set — which can diverge (a future edit that points the boot core at a moved section makes them disagree), so it can fail; it is not a spelling check over a single file. A control test plants a boot-resident reference into a deferred section and proves the guard goes red. - -**AC-4 — The boot read shrinks: the files loaded at startup (the slimmed core + boot-resident runtime sections) no longer carry the dispatch/team or merge sections.** -Verified by: a structural-absence check in `internal/contractlint` confirming the dispatch-module section anchors (Dispatch Adapter, Worker Resolution, Event Loop, Context Budget, standing-teammate spawn) and merge-module anchors (Merge and Cleanup, Ship-Local Ceremony, Mod-Block Enforcement) are ABSENT from the boot-resident files and PRESENT in their deferred references — the same structural-absence shape as `TestRetiredPluginPrivatePathsAbsent`. The expected value (which anchors belong where) comes from the deferred-module manifest, an independent source the boot files can diverge from. 
This is a structural location check, not a behavioral claim; the behavioral win (lower loaded context) is AC-1's live scenario. The check fails if a deferred section is left behind in the boot core or a boot section is wrongly moved out. - -## Test plan - -- **AC-1 (`shallow-boot` live scenario):** the costly item. Following the README's 4-step add-a-scenario procedure — host-neutral entry in `sharedRuntimeScenarios()`, fixture + prompt in `shared_fixtures_test.go`, host-neutral assertion + offline negative in `shared_scenarios_negative_test.go`, runner entries in BOTH `claudeScenarioRunners()` and `codexScenarioRunners()`. Live, real host, durable-state assertion. Cost: one model-spend scenario added to the serial suite (~5–7 min Claude opus); the offline negative and the parity/definition guards run at zero spend. **Spot-check first:** run the parity/definition guards (`TestSharedScenarioRunnerCoverage|TestSharedRuntimeScenarioDefinitions`) before paying for the live run. -- **AC-2 (regression):** zero new authoring — run the three existing live scenarios after the split. Cost: the existing serial-suite wall-time, already budgeted in CI. -- **AC-3 + AC-4 (structural guards):** Go tests in `internal/contractlint`, no model spend, run in the offline gate job (`go test ./...`). Cost: low — they extend existing reference-closure and structural-absence patterns. Each ships with a control test (planted violation goes red) so the guard is proven able to fail, not vacuous. -- **Fixture vs live:** AC-1 is live (the runtime integration — does the FO behave with modules unloaded — is the claim). AC-2 is live (same reason). AC-3/AC-4 are structural fixtures over the shipped surface. No AC leans on a prose-grep over the contract. - -## Spike result - -**Verdict: viable with one wording tweak.** The two-tier split is structurally sound. 
Evidence: - -- **Loader claim confirmed** (step 2): `SKILL.md` reads both reference files at startup (`@references/first-officer-shared-core.md` + the runtime-adapter read), corroborated by the live forensics timeline (Skill → shared-core Read → runtime Read within one second, t=8–9s). -- **Boot island is clean** (C1): shared-core Startup steps 1–7 have zero forward reference into team/dispatch/merge content; their only forward reference (rebase-conflict halt) targets the boot-resident State Management section. -- **Greet needs no deferred knowledge:** the greet runs off `status --boot --json`, already executed at t=27s in the live session before any heavy read. -- **Every coupling resolves** (step 3): C2 (one "at startup"→"at first dispatch" clause), C3/C4 (reconcile + standing-teammates fire before-first-dispatch, travel with the dispatch module), C5 (gate spine splits from reuse-or-fresh on the present-gate precedent), C6 (mod-block travels with the merge module). No boot-resident step needs team/dispatch/merge knowledge before the greet, so no boot-resident stub is forced. -- **Fold target is non-duplicative** (step 4): `using-claude-team` already carries the generic team lifecycle and the runtime adapter already invokes it; the spacedock-specific adapter sections fold behind the same first-dispatch load point without overlapping the generic skill. - -**Boot read drops ~15,700 → ~4,500 tokens (~11k cut, ~70% of contract-read cost) on a no-dispatch boot** — the exact ~16.2k the forensics flagged as read-whole-up-front, of which ~6.9k (the runtime adapter) was pure waste in the measured session. - -**Honesty on spike depth:** this is static mechanism analysis (dependency tracing + confirming the lazy hooks exist), the right depth for ideation. It does NOT prove the FO still behaves correctly with the modules unloaded — that is AC-1's live `shallow-boot` drive at implementation/validation. 
The spike establishes the split *can* be clean; the live scenario establishes that it *is*. diff --git a/docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md b/docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md deleted file mode 100644 index a03eb92e..00000000 --- a/docs/roadmap/0203-fo-efficiency/j9-phases-2-3-ideation.md +++ /dev/null @@ -1,262 +0,0 @@ -# j9 Phases 2-3 — Lazy-TeamCreate + shallow-boot-then-greet - -**Milestone:** 0.20.3 (0203 FO efficiency) -**Stage:** ideation -**Backbone task:** j9 (one task, three phases). Phase 1 (the contract structural split) is shaped in `T1-ideation.md` (labeled T1 there). This document shapes **Phase 2 (lazy-TeamCreate)** and **Phase 3 (shallow-boot-then-greet)**. -**Depends on:** Phase 1 makes the dispatch/merge content lazily loadable; Phase 3's greet-first sequencing rests on that split. Phase 2 (lazy-TeamCreate) needs no split. - ---- - -## Problem - -Boot forensics on a live FO session (`/tmp/boot-analysis-spacedock-v1.md`) measured **~160k peak context and ~13.6 min wall-clock to reach an interactive greet — with no team created and no worker dispatched.** A 100% pre-dispatch session paid full deep-boot cost. Two structural levers dominate that picture and are this document's targets: - -1. **The 89k team-mode prefix re-cache.** The milestone README ranks lazy-TeamCreate as the single biggest lever at **~89k cache-creation removed** — far larger than the contract reads (~16k), the status-table render (~8.7k), or the mod reads (~6.5k). Creating a Claude team re-caches the whole conversation prefix under the new team context. The measured session never created a team (it never dispatched), so on a no-dispatch boot the entire 89k is avoidable — *if* the contract stops telling the FO to create a team at startup. Today the runtime adapter's Team Creation section reads "At startup (after reading the README, before dispatch)," which couples the team into the boot path. - -2. 
**Everything read before the greet that the greet does not need.** The forensics timeline shows the FO, before answering the captain, reading both FO references (~16.2k), the full `docs/dev/README.md` (~7.9k), both mod files (~6.5k), and rendering the human status table (~4.3k) — then thinking at 100k+ context, where the two slowest turns (128.6s, 100.1s) fired. The greet itself runs off `status --boot --json`, already executed at t=27s (forensics row 8), long before any of those heavy reads. The cost is structural: the loader and Startup procedure front-load all of it, and generation latency scales with loaded context, so the wall-clock is dominated by thinking at a context the greet never required. - -The goal (milestone success criterion): an FO reaches interactive readiness — greet + state summary + *able to present a gate* — in seconds at **<~60k** context, deferring everything not needed for an accurate greet to the moment it is needed. - ---- - -## Spike (riskiest-first) — can TeamCreate and the team-dependent startup steps all defer past the greet without breaking a correctness guarantee? - -**The riskiest unknown:** the shallow-boot bet is that the FO can greet *before* creating a team, *before* reading the mods, and *before* loading the dispatch/merge modules — yet still honor every correctness guarantee those steps carry (a merged PR advances, an orphan surfaces, a mod-block resumes, a superseded agent is cleaned, a split-root state tree is fresh). If any of those guarantees genuinely must fire before the greet, the shallow-boot sequence is wrong. This spike enumerates every boot step in execution order, classifies each as must-run-before-greet vs defer-to-first-action, and hunts for a correctness break. - -**Honesty on spike depth:** this is STATIC analysis — step enumeration plus correctness tracing over the SKILL loader, the shared-core Startup procedure, the runtime Team Creation / standing-teammate / Event-Loop sections, and the two mod files. 
It establishes that the deferral *can* be clean. It does NOT prove the FO actually greets correctly with the team and modules unloaded — that is the live `shallow-boot` drive in `internal/ensigncycle` (AC-1), at implementation/validation. The spike says the sequence is sound on paper; the live scenario proves it in flight. - -### Spike step 1 — every boot/startup step in execution order - -Traced from the SKILL loader through the first event-loop pass. Sources: `SKILL.md` (lines 7-27), `first-officer-shared-core.md` (`## Startup` steps 1-7), `claude-first-officer-runtime.md` (`## Team Creation`, `### Standing teammate discovery pass`, `### Standing teammate lazy-spawn`, `## Event Loop` step 0), the startup mod hooks in `docs/dev/_mods/comm-officer.md` and `docs/dev/_mods/pr-merge.md`. - -| # | Step | Source | What it does at boot | -|---|------|--------|----------------------| -| L1 | Skill load: inline shared core (`@references/first-officer-shared-core.md`), read runtime adapter | `SKILL.md` 18, 23-25 | Loads the contract. Phase-1 split slims this to boot-resident core only. | -| S0 | Single-entity-mode check | `SKILL.md` 7-14 | Non-interactive + named entity → bounded mode (out of scope for shallow-boot, which is the interactive greet path). | -| S1 | Contract-version gate (`spacedock --version`, parse `contract `, range check) | shared-core Startup 1 | Aborts on missing binary / out-of-range contract. | -| S2 | `git rev-parse --show-toplevel` | shared-core Startup 2 | Project root. | -| S3 | Workflow discovery (`status --discover` or explicit path) | shared-core Startup 3 | Resolves `{workflow_dir}`. | -| S4 | Read `{workflow_dir}/README.md` | shared-core Startup 4 | Mission, entity labels, stage ordering/defaults, stage properties (initial/terminal/gate/worktree/feedback-to/agent). 
| -| S5 | `status --boot` (FO consumes `--boot --json`) | shared-core Startup 5 | MODS, ID_STYLE, NEXT_ID, ORPHANS, PR_STATE, DISPATCHABLE, STATE_BACKEND, team_state, sandbox — one call. | -| S6 | Split-root state halt-gate | shared-core Startup 6 | If `state_backend==split-root && entity_dir_present==false`, HALT — state checkout not initialized (would render EMPTY + VALID, a silent failure). | -| S7 | Split-root pull-on-boot (`git -C {entity_dir} pull --rebase`) | shared-core Startup 7 | Integrate peers' state once at boot; rebase-conflict → HALT. | -| T1 | Team Creation (`Skill(using-claude-team)` → TeamCreate) | runtime Team Creation | **The ~89k re-cache.** Currently framed "at startup … before dispatch." | -| T2 | Standing-teammate discovery pass (`dispatch list-standing`) | runtime discovery pass | Records standing-teammate mod paths. "No spawn calls at boot." | -| M1 | Startup mod hooks: pr-merge `## Hook: startup` | `pr-merge.md` 13-26 | Scan entities with non-empty `pr` + non-terminal; `gh pr view`; advance MERGED, report CLOSED, no-op OPEN. | -| M2 | Startup mod hooks: comm-officer `## Hook: startup` | `comm-officer.md` 12-27 | If team lacks `comm-officer`, spawn it (fire-and-forget). Needs a team. | -| E0 | Event Loop step 0: reconcile sweep (`dispatch reconcile --team-name {team_name}`) | runtime Event Loop 0 | "(a) at boot, AFTER the split-root pull, BEFORE the first dispatch." Roster-derived A/B/C drift + git-only D/E. | -| E1+ | Event Loop steps 1-4: PR-pending check, mod-block check, `--next` dispatch, idle | runtime Event Loop 1-4 | The normal dispatch loop. | - -**Note on the greet:** there is no explicit "greet" step in the contract today — the FO greets when it has enough state to report. The forensics show `--boot --json` (S5) already executed at t=27s, but the FO did not greet until t=511s because it ran S4 (whole README), M1, M2, and the orphan reconciliation reads first, all at rising context.
The shallow-boot change is to **insert the greet immediately after the accuracy-critical boot steps and stop for input**, pushing T1, T2, M1-action (the pr-merge mod read + `gh pr view` advancement), M2, E0, and the deferred contract modules past it. - -### Spike step 2 — classify each step: must-run-before-greet vs defer - -The discriminator is: *does the greet's accuracy or a correctness guarantee depend on this step having run before the FO speaks to the captain?* An accurate greet = correct workflow state summary + the ability to present a gate. A guarantee that the deferred timing still catches on first-action is safe to defer. - -| Step | Verdict | Reason | -|------|---------|--------| -| L1 (boot-resident core load) | **before-greet** | The greet/gate/status-viewer instructions live here. Phase-1-slimmed. | -| S1 (contract gate) | **before-greet** | A version mismatch must abort before any state read; greeting against a wrong-contract binary is unsafe. | -| S2 (git root) | **before-greet** | Every subsequent path resolves from it. | -| S3 (workflow discovery) | **before-greet** | The greet names the workflow; can't greet without `{workflow_dir}`. | -| S4 (README read) | **before-greet for the greet-relevant fields; the full ~7.9k read DEFERS** | The greet needs entity-label, stage names/ordering, gate flags — a small slice. The full body (proof-policy prose, add-a-scenario procedure, CI setup, PR-body template, task template) is dispatch/validation material the greet never uses. See spike step 5. | -| S5 (`status --boot --json`) | **before-greet** | This IS the greet's data source. ORPHANS, PR_STATE, DISPATCHABLE, STATE_BACKEND, team_state all feed the state summary. | -| S6 (split-root halt-gate) | **before-greet** | A silent-empty state checkout would make the greet REPORT A FALSE "no work" state. The halt is an accuracy guarantee for the greet itself — it must fire before the FO summarizes state.
| -| S7 (split-root pull-on-boot) | **before-greet** | State freshness: the greet must report peers' committed state, not a stale local tree. A rebase-conflict halt also pre-empts a dispatch against an unmerged tree. The greet's accuracy depends on this. | -| T1 (TeamCreate) | **defer-to-first-dispatch** | The ~89k re-cache. No team is needed to greet, report state, or present a gate (gates render as captain-facing text, not team messages). Needed only when the FO dispatches a worker. | -| T2 (standing-teammate discovery) | **defer-to-first-dispatch** | Records mod paths for later spawn; "no spawn at boot." Not greet-relevant. Travels with the dispatch module. | -| M1 (pr-merge startup hook) | **before-greet — but as a STATE READ, not via the mod file** | This carries a real guarantee: a PR merged while the FO was away must be advanced, and the greet should report it. BUT `status --boot --json` already carries `pr_state` (PR-pending entities + current merge state). The accuracy obligation is "the greet reports merged PRs," satisfied by reading `pr_state` from the boot JSON — which is before-greet anyway (S5). Reading the **pr-merge mod FILE** (~3.3k) and running its `gh pr view` advancement can ride the first event-loop pass. See the correctness hunt (step 3, break #1). | -| M2 (comm-officer startup hook = spawn) | **defer-to-first-dispatch** | Spawning the prose-polisher needs a team (T1). comm-officer polishes deliberate drafts (PR bodies, gate summaries) — none of which exist at the greet. The greet is a live captain reply, explicitly OUT of comm-officer's scope (`comm-officer.md` 41-42). Reading the mod file and spawning both defer to first dispatch alongside T1. | -| E0 (reconcile sweep) | **defer-to-first-dispatch** | Phase-1 finding C3: reconcile fires "before the first dispatch," NOT before-greet, and needs a `team_name` (A/B/C are roster-derived). It travels with the dispatch module that loads at first dispatch. 
See the correctness hunt (step 3, breaks #2/#3/#4). | -| E1+ (dispatch loop) | **defer-to-first-action** | The loop runs when there is work to dispatch; the shallow boot greets and stops for input first. | - -### Spike step 3 — correctness hunt: does deferring past the greet drop any guarantee? - -For each guarantee carried by a deferred step, the question is: does the deferred timing (first-action / first-dispatch) still catch it, or MUST it run pre-greet? - -**Break #1 — a merged PR not advanced (pr-merge startup hook M1).** The guarantee: a PR that merged while the FO was offline must advance its entity to terminal and archive it. If the FO greets without running M1, does the merged PR get lost? -- **Caught on first-action, with one accuracy caveat.** The pr-merge hook's advancement logic is duplicated in the Event Loop: step 1 ("Check PR-pending entities … advance merged PRs") runs on the first loop pass, AND pr-merge declares an `## Hook: idle` that re-checks (`pr-merge.md` 28-30, "defense in depth"). So the *advancement* is caught the moment the FO acts. The caveat is the *greet's accuracy*: if the FO greets reporting "PR #347 pending" when it actually merged, the greet is stale. **Resolution:** the greet reads `pr_state` from `status --boot --json` (S5, before-greet), which reports each PR-pending entity's current merge state. The boot probe itself queries merge state, so the greet can say "PR #347 (now MERGED — will advance)" accurately WITHOUT reading the mod file or running the mod's `gh pr view`. The mod-file read and the advancement action defer to first event-loop pass. **No guarantee dropped; the greet stays accurate off the boot JSON.** (Verify during implementation that `--boot --json`'s `pr_state.entries[].state` reflects live merge state, not just the stored `pr:` field — the boot probe parity test `internal/status/boot_probe_parity_test.go` is the place this is pinned.) 
-- **Genuine constraint:** this only holds if `pr_state` in the boot JSON carries live merge state. If it carries only the stored `pr:` value, the greet cannot report a freshly-merged PR accurately and M1's `gh pr view` would need to run pre-greet. This is the one item to confirm at implementation; the spike flags it as the single accuracy dependency to pin. - -**Break #2 — an orphan not surfaced (reconcile sweep E0).** The guarantee: a lingering/superseded agent, an un-advanced PR, a stale branch, or stale local main is detected and acted on. -- **Caught: ORPHANS already surface at the greet via `status --boot --json`.** The boot JSON's `orphans` array (worktree fields cross-referenced against filesystem + git state) is computed by the binary, needs no team, and is before-greet (S5). The greet reports anomalies from it (Startup step 5: "Report anomalies; do not auto-redispatch"). The reconcile sweep's roster-derived classes (A lingering, B superseded, C un-advanced PR) are a SUPERSET that needs a `team_name`, and the contract already says it fires "before the first dispatch," not before-greet. The git-only classes (D stale branch, E stale local main) are session-independent but are pre-dispatch hygiene, not greet-accuracy. **No guarantee dropped: the greet surfaces filesystem/git orphans off the boot JSON; the roster-derived reconcile rides the first-dispatch path where its `team_name` exists.** This is exactly Phase-1's C3 finding ("the reconcile sweep travels with the dispatch module — fires before-first-dispatch, not before-greet"). - -**Break #3 — a mod-block not resumed.** The guarantee: an entity left `mod-block`-pending across a session boundary resumes its pending merge action. -- **Caught on first-action.** Event Loop step 2 ("Check mod-blocked entities … resume its pending action") runs on the first loop pass; the runtime Mod-Block-at-Terminal section ("On session resume, scan entities with non-empty `mod-block` and resume") is merge-module content. 
A mod-blocked entity is by definition NOT dispatchable (the loop refuses new work for it), so resumption belongs in the loop, not the greet. The greet reports the pending state from the boot JSON's `mods`/`pr_state` view. **No guarantee dropped: resumption is a first-action obligation, and the merge-module read defers to terminalization where it's needed.** - -**Break #4 — a superseded agent not cleaned (supersede-shutdown / terminal teardown).** The guarantee: a stale cohort from a prior dispatch is shut down. -- **Vacuous at a shallow boot.** A freshly-booted FO that has not created a team has no live roster to clean. Supersede-shutdown fires "on fresh dispatch from a -cycleN increment" (a dispatch action) and terminal teardown fires at the terminal boundary (a merge action). Reconcile Class A/B (the resume-time backstop for a missed teardown/supersede) needs a team and rides first-dispatch (break #2). **No guarantee dropped: there is nothing to clean before the first dispatch creates a team.** - -**Break #5 — split-root state freshness / silent-empty (S6, S7).** These are the only steps the hunt says **MUST run pre-greet** — and they already do (classified before-greet in step 2). S6's halt prevents the greet from reporting a false "no work" state off an uninitialized checkout; S7's pull prevents the greet from reporting a stale tree. Neither needs a team. They stay in the before-greet set. **These are the steps that genuinely cannot defer** — but they are shared-core boot-resident steps, not deferred ones, so they are no obstacle to the shallow-boot design; they are part of it. - -**Summary of the hunt:** no guarantee carried by a deferred step (TeamCreate, reconcile, standing-teammate spawn, the mod-file reads, the dispatch/merge modules) is dropped by deferring past the greet.
Every one is either (a) re-asserted as a state READ in `status --boot --json` so the greet stays accurate (merged-PR state, orphans, mod-block/pending), or (b) an ACTION that the first event-loop pass / first dispatch catches before it matters (PR advancement, reconcile A/B/C, mod-block resume, supersede/teardown). The single accuracy dependency to pin at implementation is that `pr_state` in the boot JSON reflects live merge state (break #1). The only genuinely-cannot-defer steps are the split-root halt-gate and pull-on-boot (S6/S7) — and those are already before-greet shared-core steps. - -### Spike step 4 — lazy-TeamCreate mechanics - -**Where the contract currently mandates team creation at startup:** - -`claude-first-officer-runtime.md` `## Team Creation`, line 7 (verbatim): - -> "At startup (after reading the README, before dispatch), invoke the generic Claude-team-harness discipline: `Skill(skill="spacedock:using-claude-team")`" - -and line 11 (the truthful trigger already present in the next clause): - -> "Invoke it before the first team-mode tool call in the session." - -Phase-1's C2 tweak (from `T1-ideation.md`) already identifies this: retire "at startup" in favor of the first-dispatch trigger. The two clauses contradict — "at startup" vs "before the first team-mode tool call" — and the forensics confirm the truthful one (the measured session never created a team because it never dispatched). - -**Minimal wording change to make it genuinely first-dispatch-triggered:** - -- Replace line 7's "At startup (after reading the README, before dispatch), invoke …" with: **"Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke the generic Claude-team-harness discipline:"**. Drop the "at startup" clause entirely; keep line 11's "before the first team-mode tool call" as the now-consistent trigger. 
-- **Companion timing changes (already aligned by Phase 1, confirmed here):** - - The **standing-teammate discovery pass** (line 19: "After team creation succeeds … and BEFORE entering the normal dispatch event loop") moves with team creation to first-dispatch. It already says "No spawn calls at boot"; the discovery `list-standing` call (cheap) can either ride first-dispatch with the rest of the dispatch module or stay a one-line boot probe — recommend riding first-dispatch so the boot read drops the whole section. - - The **reconcile sweep** (Event Loop step 0): already "(a) at boot, AFTER the split-root pull --rebase and BEFORE the first dispatch." Its "at boot" timing is the before-FIRST-DISPATCH boot moment, not before-greet. With shallow-boot the FO greets before any dispatch, so reconcile fires when the first dispatch arrives (or at the first idle/explicit-action pass), where its `team_name` exists. **No wording change needed beyond making clear "boot reconcile" means before-first-dispatch, which the contract already says — but its *placement* in the deferred dispatch module (Phase 1) is what realizes the deferral.** - - The **comm-officer / standing-teammate spawn** (lazy-spawn, line 28: "Before the first `Agent()` call that uses a `team_name`, spawn all declared standing teammates") is ALREADY first-dispatch-triggered in wording. The only change is that the comm-officer mod FILE read (~3.3k) defers with it instead of being read at boot. - -The lazy-TeamCreate change is therefore **one clause** in the runtime adapter (line 7), plus the Phase-1 placement of the team/dispatch sections behind the first-dispatch load point so the FO never reads them — or creates a team — at boot. - -### Spike step 5 — README-read question (Startup step 4) - -**Is the full ~7.9k README read greet-blocking?** No — only a small slice is. - -`docs/dev/README.md` is **31,456 chars (~7.9k tokens)**. 
Its frontmatter (lines 1-27) carries everything the greet needs: `entity-label`/`entity-label-plural` (the FO speaks the workflow's declared noun, per Working Principles), `id-style`, `state`, and the full `stages` block — stage names, ordering, and per-stage `initial`/`terminal`/`gate`/`worktree`/`fresh`/`feedback-to` flags. **That frontmatter is ~700 chars (~175 tokens) of the 7.9k.** The remaining ~7.7k is prose the greet never touches: the field-reference table, the long per-stage Good/Bad/proof-policy narratives (the ~6k anti-prose-grep and detached-audit prose), Workflow State, Runtime Live CI / shared-scenario add procedure, the PR-body template, the task template, Testing Resources, Commit Discipline. All of that is dispatch-time, validation-time, or merge-time material. - -**Does `status --boot --json` already carry the greet-relevant slice?** Partly, and not the parts that matter. Confirmed by reading `internal/status/json_commands.go` (`bootJSON`): the boot envelope carries `command, mods, id_style, next_id, [min_prefix], orphans, pr_state, dispatchable, team_state, state_backend, definition_dir, entity_dir, entity_dir_present, sandbox`. The `dispatchable` array carries each ready entity's `current`/`next` stage NAMES, but the boot JSON does **NOT** carry: the entity-label, the full stage ordering/taxonomy, or the per-stage gate/terminal flags. Those are parsed from the README frontmatter by `internal/status/stages.go` (`mappingValue(doc.Content[0], "stages")`), never emitted by any status `--json` command (confirmed: no `"stages"`/`"labels"`/`"gates"` key in `json_commands.go`). So the greet's need for the entity-label and the gate/stage taxonomy is **not** met by the current boot JSON — it is met only by reading the README frontmatter. - -**Recommendation (two viable shapes; recommend the first):** - -1. 
**Read the README frontmatter only at Startup step 4, defer the body.** The FO already parses YAML frontmatter elsewhere; reading just the `---`-delimited head (lines 1-27, ~700 chars) gives the greet the entity-label, stage names/ordering, and gate/terminal flags. The body (the per-stage prose, proof policy, templates, CI docs) defers — it is read when its phase begins: stage Good/Bad and proof policy at dispatch/gate adjudication, the PR-body template at merge, the add-a-scenario procedure at validation. This is a clean, behavior-preserving slim: the same `## stage` subsections are copied verbatim into dispatch messages (Dispatch step 8, "the full stage definition") at first dispatch, so the body is genuinely not needed before then. **~7.7k cut from every boot.** Implementation note: Startup step 4's wording changes from "Read `{workflow_dir}/README.md` for mission, entity labels, stage ordering …" to "Read the README frontmatter for entity labels, stage ordering and per-stage flags; defer the body (per-stage prose, proof policy, templates) to the phase that needs it." - -2. **Extend `status --boot --json` to carry the greet slice (heavier, defers to a later task).** Add a `labels` + `stages` projection to the boot JSON so the greet needs zero README read. This removes the README read from boot entirely but is a binary change with its own test surface (boot JSON schema, golden fixtures, the FO's key-order parse) — larger than Phase 3 warrants. **Recommend deferring this to p2/vc or a follow-on; Phase 3 takes shape 1 (frontmatter-only read).** - -**Verdict on the README read:** the full ~7.9k read is NOT greet-blocking; only the ~175-token frontmatter is. Phase 3 slims Startup step 4 to a frontmatter read and defers the body. The boot JSON cannot today substitute for the frontmatter (it omits labels + stage taxonomy), so the frontmatter read stays in the before-greet set; only the body defers. 
- -### Spike step 6 — VERDICT and the greet-blocking step set - -**Verdict: VIABLE with the adjustments named below.** The shallow-boot + lazy-TeamCreate sequence is structurally sound. No correctness guarantee is dropped by deferring TeamCreate, the reconcile sweep, standing-teammate spawn, the mod-file reads, and the dispatch/merge modules past the greet (spike step 3). Every guarantee is either re-asserted as a state READ in `status --boot --json` (so the greet stays accurate) or caught as an ACTION on the first event-loop pass / first dispatch. The lazy-TeamCreate change is one clause (spike step 4). The README read slims to its frontmatter (spike step 5). - -**Adjustments required (none is a redesign):** -- **A1 — lazy-TeamCreate wording:** runtime adapter line 7, "at startup … before dispatch" → "before the first team-mode dispatch." (Phase-1 C2; restated as Phase 2's core change.) -- **A2 — README frontmatter-only read:** Startup step 4 reads the frontmatter, defers the body. -- **A3 — defer the mod-file reads:** pr-merge and comm-officer mod files are read when their hook fires (pr-merge advancement on first event-loop pass; comm-officer spawn at first dispatch), not at boot. The greet reports PR/orphan/mod state off the boot JSON. -- **A4 — insert an explicit greet-and-stop after the before-greet set.** The contract gains a "greet and stop for input" step after S7, ahead of T1/T2/M1-action/M2/E0. -- **A5 — pin the boot-JSON `pr_state` live-merge-state dependency** (spike step 3, break #1) — confirm at implementation that the greet can report a freshly-merged PR off the boot JSON without running the mod. 
- -**The greet-blocking step set (the new shallow-boot sequence) — the deliverable:** - -``` -contract-gate (S1) - → git root (S2) - → workflow discovery (S3) - → README FRONTMATTER read (S4-slim: entity-label, stage names/ordering, gate/terminal flags) - → status --boot --json (S5: state summary source — orphans, pr_state w/ live merge state, dispatchable, team_state, state_backend) - → split-root halt-gate (S6: prevents a false-empty greet) - → split-root pull-on-boot (S7: state freshness; rebase-conflict HALT) - → GREET the captain (state summary off the boot JSON; able to present a gate) and STOP for input -``` - -**Everything deferred past the greet:** TeamCreate (T1, the ~89k re-cache), standing-teammate discovery + spawn (T2 + lazy-spawn), the comm-officer mod read+spawn (M2), the pr-merge mod read + `gh pr view` advancement action (M1-action), the reconcile sweep (E0), the dispatch/merge contract modules (Phase 1), the README body, and the human status-table render (rendered to the captain only on explicit request, once). - -**The one step that genuinely cannot defer:** the split-root halt-gate + pull-on-boot (S6/S7) — but it is already a before-greet shared-core step, so it is part of the shallow-boot sequence, not an obstacle to it. - -**Honesty on depth:** this VERDICT rests on static step-enumeration and correctness tracing. The live `shallow-boot` drive in `internal/ensigncycle` — observing the FO greet correctly with NO team created and the dispatch/merge modules unloaded — is the implementation/validation proof (AC-1), not this spike's. The spike establishes the sequence *can* be correct; the live scenario establishes that it *is*. - ---- - -## Proposed approach - -### The new shallow-boot sequence - -Reshape the Startup procedure and the runtime Team Creation trigger so the FO reaches the greet through exactly the before-greet step set (spike step 6), then stops for input. Concretely: - -1. 
**Lazy-TeamCreate (Phase 2).** Change the runtime adapter's Team Creation trigger from "at startup … before dispatch" to "before the first team-mode dispatch" (A1). The FO creates no team at boot; the ~89k re-cache happens only when the first worker is dispatched. The standing-teammate discovery/spawn (already "no spawn at boot" / "before the first `Agent()` with a `team_name`") and the reconcile sweep (already "before the first dispatch") travel with the dispatch module so the FO never reads them — or creates a team — at boot. - -2. **Greet off `status --boot --json` (Phase 3).** After S1-S7 (the before-greet set), the FO greets the captain with a state summary built from the boot JSON (orphans, pr_state with live merge state, dispatchable, team_state, state_backend) and the README frontmatter (entity-label, stage taxonomy, gate flags), then stops for input (A4). It can present a gate from this state without a team (gates are captain-facing text). - -3. **README frontmatter-only read (Phase 3).** Startup step 4 reads the README frontmatter for the greet-relevant fields and defers the ~7.7k body to the phase that needs it (A2). - -4. **Defer mod-file reads (Phase 3).** The comm-officer and pr-merge mod FILES are read when their hooks fire — comm-officer at first dispatch (spawn needs a team), pr-merge advancement on the first event-loop pass (A3). The greet reports PR/mod/orphan state off the boot JSON, never the mod files. - -5. **Defer the human status-table render (Phase 3).** The FO never renders the human-formatted table for its own reasoning (it has the boot JSON); it renders it to the captain at most once, on explicit request, per the Status Viewer's existing captain-facing-display rules. - -### Greet-blocking classification table - -| Step | Classification | Reason | -|------|----------------|--------| -| contract-gate (S1) | **before-greet** | A version mismatch must abort before any state read. 
| -| git root (S2) | **before-greet** | All paths resolve from it. | -| workflow discovery (S3) | **before-greet** | The greet names the workflow. | -| README **frontmatter** (S4-slim) | **before-greet** | Entity-label, stage names/ordering, gate/terminal flags — the greet's vocabulary and the gate taxonomy. | -| README **body** | defer | Per-stage prose, proof policy, templates, CI docs — dispatch/validation/merge material. | -| `status --boot --json` (S5) | **before-greet** | The state-summary data source (orphans, pr_state, dispatchable, team_state, state_backend). | -| split-root halt-gate (S6) | **before-greet** | Prevents a false-empty greet off an uninitialized checkout. | -| split-root pull-on-boot (S7) | **before-greet** | State freshness; rebase-conflict HALT pre-empts a stale-tree greet/dispatch. | -| **GREET + stop for input** | **the boundary** | State summary + able to present a gate. | -| TeamCreate (T1) | defer (first dispatch) | The ~89k re-cache. No team needed to greet/report/present-a-gate. | -| standing-teammate discovery + spawn (T2) | defer (first dispatch) | Spawn needs a team; not greet-relevant. | -| comm-officer mod read + spawn (M2) | defer (first dispatch) | Polishes deliberate drafts, not the live greet; spawn needs a team. | -| pr-merge mod read + `gh pr view` advance (M1-action) | defer (first event-loop pass) | Advancement caught by Event-Loop step 1 + the pr-merge idle hook; the greet reports merge state off the boot JSON. | -| reconcile sweep (E0) | defer (first dispatch) | Needs a `team_name`; fires before-first-dispatch, not before-greet (Phase-1 C3). | -| dispatch/merge contract modules | defer (Phase 1 load points) | First dispatch / terminalization. | -| human status-table render | defer | Captain-facing, on explicit request, once. | - -### Lazy-TeamCreate mechanics (restated for implementation) - -Single clause: runtime adapter `## Team Creation` line 7. 
Current: "At startup (after reading the README, before dispatch), invoke …". New: "Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke …". Drop "at startup"; the existing line-11 clause ("before the first team-mode tool call in the session") becomes the consistent trigger. Companion: discovery/spawn and reconcile travel with the dispatch module (Phase 1 placement), so the boot read drops them and the FO never creates a team at boot. No binary change. - -### README-read decision - -Take spike-step-5 shape 1: Startup step 4 reads the README **frontmatter only** (entity-label, stage taxonomy, gate/terminal flags); defer the body to the phase that consumes it. The boot JSON cannot substitute (it omits labels + stage taxonomy — confirmed in `json_commands.go`), so the frontmatter read stays before-greet; only the body defers. Extending the boot JSON to carry labels+stages (shape 2) is a heavier binary change deferred to a follow-on, not Phase 3. - ---- - -## Out of scope - -- **Phase 1 — the contract structural split** (extract dispatch/merge modules behind lazy load points, slim the boot-resident core). Shaped in `T1-ideation.md`. Phase 3's deferred-module unload rests on it, but the split itself is Phase 1's deliverable, not this body's. -- **p2 / vc** — `spacedock pr complete` + `reconcile --act` (the binary-simplification line). Parked to 0.20.4. -- **Extending `status --boot --json` to carry labels + stage taxonomy** (README-read shape 2). A heavier binary change; a follow-on, not Phase 3. -- **T3 — residual-prose audit + comm-officer polish.** Files along post-Phase-1; the cut-list does not exist until the split lands. -- **The Codex and Pi runtime adapters' lazy-TeamCreate.** Phase 2 changes the Claude adapter (the bulk file the forensics measured). The Codex/Pi team-creation timing is a follow-on if their boot cost warrants it. 
- ---- - -## Acceptance criteria - -Each AC names an end-state property of finished Phases 2-3, verified by something outside this task body that can fail. No AC is proven by a string/substring/regex match over an instruction file the model reads — that is banned by this workflow (a passing match only asserts the implementer's own text is present). The behavioral ACs are live drives; the structural AC tests a relationship between independent values. - -**AC-1 — A freshly-booted FO greets the captain and reports accurate workflow state with NO team created and the dispatch/merge modules NOT loaded.** -Verified by: a new live shared-runtime scenario `shallow-boot` in `internal/ensigncycle` (added to `sharedRuntimeScenarios()` with Claude + Codex runners per the README's 4-step add-a-scenario procedure). The runner launches the real host front door against a fixture with at least one dispatchable entity, and the host-neutral assertion over `(before, after, observed)` confirms: (a) the FO produced a greet with a state summary in its final message; (b) durable state shows NO team artifact created and NO worker dispatched (no entity advanced past its boot stage, no worktree created); (c) the FO stopped for input rather than auto-dispatching. The team-not-created observation is the lazy-TeamCreate proof; the greet-without-dispatch is the shallow-boot proof. Behavioral and live, not a contract grep. An offline negative in `shared_scenarios_negative_test.go` builds the broken end-state (a team artifact present / an entity dispatched at boot) and proves the assertion goes red. - -**AC-2 — The shallow boot greets at materially lower loaded context than the deep boot, with the team-mode re-cache absent.** -Verified by: the `shallow-boot` live scenario's captured host artifacts (stream jsonl / session transcript) show the FO reached its greet without a `TeamCreate` tool call in the pre-greet window, and the pre-greet context never incurred the team-mode prefix re-cache. 
The check parses the live transcript for the presence/absence and ordering of the `TeamCreate` (or team-tool) call relative to the greet message — a behavioral observation over the real run's tool-call sequence (the team call is absent before the greet), NOT a grep over the contract. The negative control: a deep-boot run (or a fixture forcing eager team creation) shows the `TeamCreate` call before the greet, proving the assertion distinguishes the two. - -**AC-3 — The deferred startup steps still run correctly on first action: a merged PR advances and the reconcile sweep fires before the first dispatch.** -Verified by: the existing live scenarios `merge-hook-guardrail` and `rejection-flow`/`gate-guardrail` in `internal/ensigncycle` still pass after Phases 2-3 — they exercise the merge module loading at terminalization (mod-block enforcement, merged-PR advancement) and the dispatch module loading at first dispatch (reconcile sweep, reuse/feedback routing). A green run of all existing live scenarios is the proof that deferring the startup steps past the greet did not drop their first-action obligations. Run via `go test -tags live -run TestLiveClaudeSharedScenarios ./internal/ensigncycle`. (Regression, zero new authoring beyond AC-1's scenario.) - -**AC-4 — The greet reads only the README frontmatter, not the full body, for its greet-relevant fields.** -Verified by: a structural check in `internal/contractlint` (the allowed quarantine) confirming the boot-resident Startup step-4 instruction targets the README frontmatter slice (entity-label, stage taxonomy, gate flags) and that the README-body-only anchors (the proof-policy prose, the add-a-scenario procedure, the PR-body template, the task template) are reachable only from the deferred phase modules, not from the boot-resident core. 
The expected value (which README sections are greet-relevant vs phase-deferred) comes from the deferred-module manifest, an independent source the boot files can diverge from — so the check can fail if a future edit points the boot core at the README body. A control test plants a boot-resident reference into a body-only section and proves the guard goes red. (Structural, not a prose-grep; pairs with AC-1, which is the behavioral proof the greet is accurate off the slim read.) - ---- - -## Test plan - -- **AC-1 (`shallow-boot` live scenario):** the costly item. Following the README's 4-step add-a-scenario procedure — host-neutral entry in `sharedRuntimeScenarios()`, fixture + prompt in `shared_fixtures_test.go`, host-neutral assertion (greet present + no-team + no-dispatch + stopped-for-input) and offline negative in `shared_scenarios_negative_test.go`, runner entries in BOTH `claudeScenarioRunners()` and `codexScenarioRunners()`. Live, real host, durable-state + final-message assertion. Cost: one model-spend scenario added to the serial suite (~5-7 min Claude opus). **Spot-check first:** run the parity/definition guards (`TestSharedScenarioRunnerCoverage|TestSharedRuntimeScenarioDefinitions`) at zero spend before paying for the live run. -- **AC-2 (no team-mode re-cache before greet):** rides AC-1's live run — adds a transcript assertion over the tool-call sequence (no `TeamCreate` before the greet message). Cost: zero additional model spend; it reads AC-1's captured artifacts. The negative control (a deep-boot / eager-team fixture showing the call before the greet) proves the assertion distinguishes the two. -- **AC-3 (regression):** zero new authoring — run the existing live scenarios (`gate-guardrail`, `rejection-flow`, `merge-hook-guardrail`, `feedback-3-cycle-escalation`) after Phases 2-3. Cost: the existing serial-suite wall-time, already budgeted in CI. Green proves the deferred startup steps still fire correctly on first action. 
-- **AC-4 (structural guard):** a Go test in `internal/contractlint`, no model spend, runs in the offline gate job (`go test ./...`). Extends the existing reference-closure pattern. Ships with a control test (planted violation goes red) so the guard is proven able to fail, not vacuous. -- **Implementation-time spike to pin (spike step 3, break #1):** before authoring AC-1, confirm `status --boot --json`'s `pr_state.entries[].state` carries live merge state (so the greet can report a freshly-merged PR without reading the mod). The cheap check is a `internal/status/boot_probe_parity_test.go`-adjacent fixture; pin it before paying for the live scenario. If it does NOT carry live merge state, the pr-merge `gh pr view` would need to stay before-greet for accuracy — flag and re-scope. -- **Fixture vs live:** AC-1, AC-2, AC-3 are live (the runtime integration — does the FO greet correctly with the team and modules deferred — is the claim). AC-4 is a structural fixture over the shipped surface. No AC leans on a prose-grep over the contract. - ---- - -## Spike result - -**Verdict: VIABLE with the named adjustments (A1-A5).** The shallow-boot + lazy-TeamCreate sequence is structurally sound. Evidence: - -- **The boot steps enumerate cleanly into before-greet vs defer** (spike steps 1-2). The before-greet set is exactly: contract-gate, git root, workflow discovery, README frontmatter, `status --boot --json`, split-root halt-gate, split-root pull-on-boot — then greet and stop. Everything else (TeamCreate, standing-teammate discovery/spawn, the mod reads, the reconcile sweep, the dispatch/merge modules, the README body, the human table) defers. -- **No correctness guarantee is dropped by deferring past the greet** (spike step 3). Merged-PR advancement, orphan surfacing, mod-block resume, and supersede/teardown are each either re-asserted as a state READ in `status --boot --json` (so the greet stays accurate) or caught as an ACTION on the first event-loop pass / first dispatch. 
The single accuracy dependency to pin is that the boot JSON's `pr_state` reflects live merge state (break #1). -- **The only genuinely-cannot-defer step is the split-root halt-gate + pull-on-boot** (break #5) — and it is already a before-greet shared-core step, so it is part of the shallow-boot sequence, not an obstacle. -- **Lazy-TeamCreate is one clause** (spike step 4): runtime adapter line 7, "at startup … before dispatch" → "before the first team-mode dispatch." The companion timings (discovery/spawn, reconcile) are already first-dispatch / before-first-dispatch in wording; Phase 1's placement realizes the deferral. No binary change. -- **The full README read is not greet-blocking** (spike step 5): only the ~175-token frontmatter is (entity-label + stage taxonomy + gate flags). The boot JSON omits labels and stage taxonomy (confirmed in `internal/status/json_commands.go` — the boot envelope carries no `"stages"`/`"labels"`/`"gates"` key), so the frontmatter read stays before-greet and the ~7.7k body defers. - -**Honesty on spike depth:** this is STATIC analysis — step enumeration plus correctness tracing over the SKILL loader, the shared-core Startup procedure, the runtime Team Creation / standing-teammate / Event-Loop sections, and the two mod files, with one binary-fact check of the boot JSON schema. It establishes the sequence *can* be correct. The live `shallow-boot` drive in `internal/ensigncycle` — observing the FO greet accurately with NO team created and the dispatch/merge modules unloaded — is the implementation/validation proof (AC-1), not this spike's. The spike establishes the sequence is sound; the live scenario establishes that it holds in flight. 
From 483c3c6d46fe5e4cc5ceee20715cc27f6f71f9a5 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 10:29:03 -0700 Subject: [PATCH 04/18] roadmap(0203): intake github#344 (context-budget spurious warnings) to the sprint index Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/roadmap/0203-fo-efficiency/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/roadmap/0203-fo-efficiency/README.md b/docs/roadmap/0203-fo-efficiency/README.md index ae6c03a3..aea772f5 100644 --- a/docs/roadmap/0203-fo-efficiency/README.md +++ b/docs/roadmap/0203-fo-efficiency/README.md @@ -73,6 +73,10 @@ Two fixes fold into Phase 1's contract cleanup, beyond the structural split: This is boot-resident **guidance content** (behavior-shaping, not a testable AC): proof is the existing live scenarios still passing + review — not a "drift reduced" metric. The principles also govern *how* the contract is simplified (lead with value, hardest-first, simplest/concise). +## Intake (added to the sprint) + +- **`context-budget-spurious-warnings` — github#344** (captain intake 2026-06-13). `spacedock dispatch context-budget` emits spurious `config_drift_warning` (the `[1m]`-suffixed captain-session model string never matches the canonical jsonl id) and `mixed_models_warning` (harness-injected `<synthetic>` jsonl entries pollute the model census) on *healthy* reused team members — eroding trust in the reuse-condition-0 budget signal. FO dispatch-path quality. Filed as a backlog seed in `docs/dev`; ideation fills approach + ACs (proven by Go tests over the probe's output, not prose). + ## Out of scope (parked, not 0.20.3) - **p2 / vc** — `spacedock pr complete` + `reconcile --act`: the binary-simplification line (higher ROI, heavier lift) → 0.20.4. 
From be3cf39509fae81e0592952a1484a3cb18301d42 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 10:38:17 -0700 Subject: [PATCH 05/18] roadmap(0203): T3 filed as backlog seed fo-contract-prose-audit (blocked on Phase-1) Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/roadmap/0203-fo-efficiency/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/roadmap/0203-fo-efficiency/README.md b/docs/roadmap/0203-fo-efficiency/README.md index aea772f5..a4ef0cf3 100644 --- a/docs/roadmap/0203-fo-efficiency/README.md +++ b/docs/roadmap/0203-fo-efficiency/README.md @@ -55,7 +55,7 @@ contract-gate → discovery → `status --boot --json` → **greet and stop for - **Phase 2 — lazy-TeamCreate**: defer the `TeamCreate` call (the ~89k cache-creation) off the boot/greet path to first-dispatch-need. Needs no split. - **Phase 3 — shallow-boot-then-greet**: greet off `status --boot --json`; defer mod-reads, the human status-table render, and the (now-split) deferred contract modules. Folds C3 (mod-defer) + C4 (status-render discipline). *Proof: new live `shallow-boot` scenario.* -**T3 — residual-prose audit + comm-officer polish** (file along, post-Phase-1; the cut-list does not exist until the split lands). +**T3 — residual-prose audit + comm-officer polish** — filed as backlog seed `fo-contract-prose-audit`, BLOCKED on j9 Phase-1; ideation dispatches post-split (the cut-list does not exist until the split lands), and collapses to a roadmap decision if the split left nothing to cut. Boot-report habits (scope greps to headings; delegate bulk reconciliation reads to a subagent) tighten existing "Probe and Ideation Discipline" prose and ride along in Phase 1 / T3. 
From 0af3652ed6d97b5b2baba0450fdae0dc2ae1ee44 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 12:31:21 -0700 Subject: [PATCH 06/18] =?UTF-8?q?roadmap(0203):=20sprint=20artifacts=20?= =?UTF-8?q?=E2=80=94=20index.md=20+=20staff-review.md=20+=20dispatch-sprin?= =?UTF-8?q?t-execution.md=20(README=E2=86=92index)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/roadmap/0203-fo-efficiency/README.md | 88 ------------------- .../dispatch-sprint-execution.md | 42 +++++++++ docs/roadmap/0203-fo-efficiency/index.md | 45 ++++++++++ .../0203-fo-efficiency/staff-review.md | 32 +++++++ 4 files changed, 119 insertions(+), 88 deletions(-) delete mode 100644 docs/roadmap/0203-fo-efficiency/README.md create mode 100644 docs/roadmap/0203-fo-efficiency/dispatch-sprint-execution.md create mode 100644 docs/roadmap/0203-fo-efficiency/index.md create mode 100644 docs/roadmap/0203-fo-efficiency/staff-review.md diff --git a/docs/roadmap/0203-fo-efficiency/README.md b/docs/roadmap/0203-fo-efficiency/README.md deleted file mode 100644 index a4ef0cf3..00000000 --- a/docs/roadmap/0203-fo-efficiency/README.md +++ /dev/null @@ -1,88 +0,0 @@ -# 0203 — FO Efficiency: shallow boot + lazy contract - -**Milestone:** 0.20.3 -**Status:** ideation complete, at the ideation gate (2026-06-13). The full spec lives in the j9 entity (`docs/dev/.spacedock-state/lazy-teamcreate-shallow-boot/`); this doc is the sprint index. -**Theme:** make the first officer cheap to boot and run. 
- -## Why - -Boot forensics on a live FO session (`/tmp/boot-analysis-spacedock-v1.md`) measured **~160k peak context and ~13.6 min** to reach an interactive greet — **with no team created and no worker dispatched.** The cost is structural, not a bug: the FO reads its entire contract (both reference files ~16k), the workflow README, and both mod files up front, then renders the full status table — most of it unused on a boot that never dispatches. Generation latency scales with loaded context, so the wall-clock is dominated by thinking at 100k+. - -## Goal (success criterion) - -An FO reaches interactive readiness — greet + state summary + *able to present a gate* — in seconds at **< ~60k** context, versus today's minutes at 126k+. Proven by a live FO-boot drive that observes correct behavior with the deferred modules unloaded — never by a grep over the restructured contract. - -## Cost levers (ranked — why j9 is the backbone) - -| Lever | ~boot cost removed | Needs the split? | -|-------|-------------------:|------------------| -| Lazy-TeamCreate (defer the team-mode prefix re-cache) | **~89k cache-creation** | no | -| Defer contract reads at greet | ~16k | yes (minimal) | -| Defer the human status-table render | ~8.7k | no | -| Defer mod-file reads | ~6.5k | no | - -The 89k lazy-TeamCreate dwarfs the rest and sits on the critical path *before the dispatch gate* — so **`j9` (lazy-TeamCreate + shallow-boot) is the backbone.** The contract split is the enabling refactor for the one lever that needs it, and doubles as the contract-cleanup ask. - -## The cut — reorganize the contract by *when* it is needed - -**Boot-resident** (read on every FO start): contract-gate/startup, discovery, `status --boot --json`, status viewer, ID styles, single-entity mode, write scope, captain interaction, event loop, gate-presentation entry, clarification, working principles. Exactly enough to greet, report state, and present a gate. 
- -**Deferred — loaded only when its phase begins:** -- **Dispatch/team module** (first dispatch): team creation, dispatch adapter, worker resolution, reuse conditions, standing teammates, degraded mode, context budget. ≈70% of `claude-first-officer-runtime.md` plus shared-core's reuse/standing-teammate sections. The bulk — and the biggest pure boot waste today. -- **Merge module** (terminal boundary): merge-and-cleanup, ship-local, teardown, mod-block enforcement. -- **Already lazy** (the precedent we extend): `present-gate`, `feedback-rejection-flow`, `using-claude-team` skills. - -## Mechanism - -The `spacedock:first-officer` skill reads only the boot-resident core at startup. The team/dispatch content folds behind the existing lazily-loaded `using-claude-team` skill (already invoked at first dispatch); the merge content becomes a lazily-loaded reference invoked at terminalization. No new pattern — we extend the one already in the codebase. - -## Boot flow (the j9 shallow-boot) - -contract-gate → discovery → `status --boot --json` → **greet and stop for input.** Team creation, mod-file reads, the dispatch/merge modules, and the human status table all defer to the moment they are needed. - -## Test plan (honors the proof policy) - -- **New live shared-runtime scenario `shallow-boot`:** the FO boots, greets, and presents a gate with the dispatch/merge modules *not* loaded — verified through the live `internal/ensigncycle` harness on durable behavior. The win is correct behavior at lower loaded context; behavioral and live, not a contract grep. -- **Regression:** existing `gate-guardrail` / `rejection-flow` / `merge-hook-guardrail` scenarios still pass — the deferred modules load correctly when a real dispatch/merge happens. -- **Structural guard** (in the allowed `internal/contractlint` quarantine): a reference-closure check that the boot-resident core has no dependency on deferred-only content. Structural, not prose-grep. 
- -## Tasks - -**`j9` is the backbone — one task, three phases.** Reshape the existing backlog entity; do not fragment it. The contract split runs first because it is the behavior-preserving enabler (and the contract-audit ask); the 89k lazy-TeamCreate is the headline lever. - -**j9 — Lazy-TeamCreate + shallow-boot-then-greet:** -- **Phase 1 — contract structural split** (enabler + the "audit and cleanup the fo contract" ask): extract boot-resident vs deferred into a lazy dispatch ref + a lazy merge ref; slim the boot-resident core + the skill loader. Behavior-preserving. *Proof: existing `gate-guardrail`/`rejection-flow`/`merge-hook-guardrail` live scenarios still pass + a `internal/contractlint` reference-closure guard.* Spike verdict VIABLE, ~70% boot-read cut (full spec in the j9 entity). -- **Phase 2 — lazy-TeamCreate**: defer the `TeamCreate` call (the ~89k cache-creation) off the boot/greet path to first-dispatch-need. Needs no split. -- **Phase 3 — shallow-boot-then-greet**: greet off `status --boot --json`; defer mod-reads, the human status-table render, and the (now-split) deferred contract modules. Folds C3 (mod-defer) + C4 (status-render discipline). *Proof: new live `shallow-boot` scenario.* - -**T3 — residual-prose audit + comm-officer polish** — filed as backlog seed `fo-contract-prose-audit`, BLOCKED on j9 Phase-1; ideation dispatches post-split (the cut-list does not exist until the split lands), and collapses to a roadmap decision if the split left nothing to cut. - -Boot-report habits (scope greps to headings; delegate bulk reconciliation reads to a subagent) tighten existing "Probe and Ideation Discipline" prose and ride along in Phase 1 / T3. - -## Contract content fixes (captain audit, 2026-06-13) - -Two fixes fold into Phase 1's contract cleanup, beyond the structural split: - -1. **Drop the unnecessary `agents/first-officer.md` cross-reference** from `first-officer-shared-core.md` (line 3, "Keep aligned with…") — not load-bearing. -2. 
**Add a top Operating-principles (ethos) section** the shipped skill lacks today — its absence lets Codex drift from the `agents/first-officer.md` ethos. Combine the existing `## Working Principles` under it. Verbatim: - - > You are dispatcher and responsible for making sure the work is done by the crew. What awesome looks like for the crew: - > - Begin with the end, be clear about the value. - > - Do the hardest things first, de-risk when it is cheap. - > - Communicate and act concisely, choose the simplest approach, JFDI. - -This is boot-resident **guidance content** (behavior-shaping, not a testable AC): proof is the existing live scenarios still passing + review — not a "drift reduced" metric. The principles also govern *how* the contract is simplified (lead with value, hardest-first, simplest/concise). - -## Intake (added to the sprint) - -- **`context-budget-spurious-warnings` — github#344** (captain intake 2026-06-13). `spacedock dispatch context-budget` emits spurious `config_drift_warning` (the `[1m]`-suffixed captain-session model string never matches the canonical jsonl id) and `mixed_models_warning` (harness-injected `<synthetic>` jsonl entries pollute the model census) on *healthy* reused team members — eroding trust in the reuse-condition-0 budget signal. FO dispatch-path quality. Filed as a backlog seed in `docs/dev`; ideation fills approach + ACs (proven by Go tests over the probe's output, not prose). - -## Out of scope (parked, not 0.20.3) - -- **p2 / vc** — `spacedock pr complete` + `reconcile --act`: the binary-simplification line (higher ROI, heavier lift) → 0.20.4. -- **xp** — cross-session FO↔Commander comms: the coordination infra that makes multi-FO safe. Its own track. -- **ey** — proof-policy port to shipped scaffolding: adjacent, separate. - -## Operational landing - -This doc lives in the main repo (collision-free). 
Filing T1 and reshaping `j9` into `docs/dev/.spacedock-state` are coordinated with the active Commander session via path-scoped commits (disjoint entities → safe under the multi-writer protocol). diff --git a/docs/roadmap/0203-fo-efficiency/dispatch-sprint-execution.md b/docs/roadmap/0203-fo-efficiency/dispatch-sprint-execution.md new file mode 100644 index 00000000..fdfe406f --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/dispatch-sprint-execution.md @@ -0,0 +1,42 @@ +# 0203 (0.20.3) — FO efficiency — Commander dispatch (cold-boot) + +## Boot + +Sprint = the entities matching `sprint: 0203-fo-efficiency` (query, not a list): **j9** (lazy-teamcreate-shallow-boot), **#344** (context-budget-spurious-warnings), **T3** (fo-contract-prose-audit). Boot the first officer (`spacedock claude`), `status --boot`, and read each entity body for its gate-approved design + ACs. Readiness: `staff-review.md` (verdict READY). Goal/DoD: `index.md`. Evidence: `boot-analysis.md`. + +## Deliverable & DoD + +**0.20.3** = the FO-efficiency restructure + the context-budget probe fix. Done when, merged to `next` (then `main` at the cut) — see `index.md` Definition of Done. Headline: a live drive **measures** boot reaching < ~60k with the ~89k team-mode re-cache absent before the greet. + +## Drive order — ⚠️ coordination + +1. **j9 first** (the backbone; T3 keys off it). Per the operating principle — *begin with the end; do the hardest first, de-risk when it's cheap* — within j9 land the cheap, biggest lever first: **lazy-TeamCreate + shallow-greet + the AC-6 measured-saving drive**, proving the <60k/89k saving *before* the full contract split. Then **Phase-1** (the split + the offline-gate-assertion retarget). If the cheap measure already clears <60k, the split is the contract-cleanup / residual-~16k play, not load-bearing for the saving — decide then whether it stays in 0.20.3. +2. **T3 after j9 Phase-1 lands** — the slimmed refs must exist before the audit. 
Step-0 survey decides the collapse fork (cut vs recorded decision). +3. **#344** — already validated (`46224f5f`); merge with the batch. No ordering constraint (zero overlap). + +## Per-member build notes + +### j9 — lazy-teamcreate-shallow-boot · shipped-scaffolding surface · ⚠️ HIGH-STAKES +The 3-phase restructure (full spec in the entity body, AC-1..AC-6). It rewrites the very contract the FO + ensigns run under — test the new contract in isolation (the live `shallow-boot` scenario + the `contractlint` closure test) before merge. **Retarget the two offline-gate assertions** (`TestNoUnexpectedModHookOrPRMergeIntroduced` allowlist; `TestGradeMarkerMatchesContract` source) to the post-split layout as an explicit subtask, and keep `go test ./...` green (AC-5). Clean the 3 Polish residuals from `staff-review.md` (stale AC-count lines; AC-1(b) attribution; AC-6 89k-soft-spot note). + +### #344 — context-budget-spurious-warnings · dispatch-path · DONE (held pre-merge) +Implemented + validated on `spacedock-ensign/context-budget-spurious-warnings` @ `46224f5f` (the `<synthetic>` census skip + the `[1m]`-suffix window promotion). 5 ACs green; golden parity zero-churn; detached audit confirmed the over-suppression guards load-bearing. Just merge with the batch. + +### T3 — fo-contract-prose-audit · contract-cleanup · BLOCKED on j9 Phase-1 +The 4-step audit method (survey → mechanical cut → comm-officer polish) against the slimmed refs. Step-0 survey is the collapse fork: non-empty inventory → code change; empty/trivial → a recorded roadmap decision (AC-4). Steer the survey to KEEP the budget-probe reuse-condition-0 prose (a deliberate cross-host abstraction split, not collapsible duplication). + +## Detached adversarial audit (before merge) + +High-stakes surface: **j9** (shipped contract/scaffolding). Run a read-only detached audit on a throwaway checkout of the merge result before merging j9 — refute that the live scenarios + `contractlint` guards would catch a broken edit. 
**#344** already had its detached audit. **T3** is behavior-preserving (live scenarios) — routine. + +## Pre-cut antipattern audit (⚠️ before the v0.20.3 tag) + +All merged, tag not yet fired → an INDEPENDENT staff-eng reviewer over the assembled sprint. **Critically: confirm AC-6 actually measured the <60k/89k saving in the live run** — the sprint's whole point. Verify main-PR CI gating. Ship-blockers fixed pre-cut; non-blockers seed the next sprint. + +## Cut + +Fire `v0.20.3` once the three are merged and the pre-cut audit is clean. + +## Out of scope (deferred) + +p2/vc (0.20.4 binary-simplification line); xp (cross-session FO↔Commander comms — the coordination gap this sprint hit live); ey (proof-policy port to shipped scaffolding). diff --git a/docs/roadmap/0203-fo-efficiency/index.md b/docs/roadmap/0203-fo-efficiency/index.md new file mode 100644 index 00000000..5049b885 --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/index.md @@ -0,0 +1,45 @@ +# 0203 — FO efficiency (0.20.3) + +**Sprint:** the entities matching `sprint: 0203-fo-efficiency` (a query, not a hard-coded list) — `j9` (lazy-teamcreate-shallow-boot), `#344` (context-budget-spurious-warnings), `T3` (fo-contract-prose-audit). +**Theme:** make the first officer cheap to boot and run. + +## Goal (success criterion) + +An FO reaches interactive readiness — greet + state summary + *able to present a gate* — in seconds at **< ~60k** context, versus today's minutes at 126k+. Proven by a live FO-boot drive that **measures** the saving (j9 AC-6), never a grep over the restructured contract. + +## Why + +Boot forensics (`boot-analysis.md`) measured ~160k peak context and ~13.6 min to greet — with no team created and no worker dispatched. Structural, not a bug. + +## Cost levers (ranked) + +| Lever | ~boot cost removed | Needs the split? 
| +|-------|-------------------:|------------------| +| Lazy-TeamCreate (defer the team-mode prefix re-cache) | **~89k** | no | +| Defer contract reads at greet | ~16k | yes (minimal) | +| Defer the human status-table render | ~8.7k | no | +| Defer mod-file reads | ~6.5k | no | + +## Definition of Done + +0.20.3 ships when, merged to `next` (then `main` at the cut): +- **j9** — the FO contract is split into a boot-resident core + deferred dispatch/merge references; `TeamCreate` deferred off the boot/greet path; shallow-boot-then-greet off `status --boot --json`. AC-1..AC-6 green, including the live shallow-boot scenario, the offline gate staying green post-split, the `contractlint` closure test, and the **measured-saving drive** (greet context < ~60k, no pre-greet ~89k spike). +- **#344** — the context-budget probe emits no spurious `config_drift`/`mixed_models` warnings on healthy members and reads the correct window. (Implemented + validated on `spacedock-ensign/context-budget-spurious-warnings` @ `46224f5f`; held pre-merge — ships with the batch.) +- **T3** — the slimmed FO refs are audited + comm-officer-polished, behavior-preserving (live scenarios green) and measurably smaller — or a recorded roadmap decision if the split left nothing to cut. +- `v0.20.3` cut after the pre-cut antipattern audit is clean. + +## Tasks + +- **j9** (backbone) — contract split → lazy-TeamCreate → shallow-boot-then-greet. The full spec is the entity body. +- **#344** — context-budget spurious-warnings fix (validated, held pre-merge). +- **T3** — residual-prose audit + polish (blocked on j9 Phase-1; collapses to a decision if nothing to cut). + +## Out of scope + +p2/vc (0.20.4 binary-simplification line); xp (cross-session FO↔Commander comms — the coordination gap this sprint surfaced); ey (proof-policy port to shipped scaffolding). 
+ +## Artifacts + +- `staff-review.md` — preflight readiness gap analysis (verdict: READY) +- `dispatch-sprint-execution.md` — cold-boot Commander dispatch package +- `boot-analysis.md` — the boot forensics (evidence base) diff --git a/docs/roadmap/0203-fo-efficiency/staff-review.md b/docs/roadmap/0203-fo-efficiency/staff-review.md new file mode 100644 index 00000000..948135b8 --- /dev/null +++ b/docs/roadmap/0203-fo-efficiency/staff-review.md @@ -0,0 +1,32 @@ +# 0203 FO-efficiency — preflight staff review + +**Verdict: READY** (after one j9 rework cycle). Sprint-wide preflight over the pooled sprint — per-item + cross-cutting. Shaping-FO session, 2026-06-13. + +## Per-item readiness + +- **j9** (lazy-teamcreate-shallow-boot) — **READY.** The contract restructure (boot-resident/deferred split → lazy-TeamCreate → shallow-boot-then-greet). Reviewed in depth: a 4-lens design panel + adversarial re-verify (6 + 3 findings closed), then this sprint preflight surfaced 3 batch-blockers — all fixed in cycle 4 and re-verified. AC set AC-1..AC-6, all external-proven, no prose-grep. +- **#344** (context-budget-spurious-warnings) — **READY** (validated, held pre-merge). Narrow Go fix; 5 ACs green including golden parity zero-churn; a detached adversarial audit confirmed the over-suppression guards load-bearing. Zero file overlap / behavioral interaction with j9/T3. Merges with the batch. +- **T3** (fo-contract-prose-audit) — **READY.** A 4-step audit method against j9's *planned* modules; ACs external (live-scenario preservation + `wc` size-floor + collapse-decision record); correctly sequenced behind j9 Phase-1; collapses to a roadmap decision if the split leaves nothing to cut. + +## Cross-cutting coherence + +- **Zero file overlap.** #344 touches `internal/claudeteam` Go; j9/T3 touch the FO markdown refs + `contractlint`/`ensigncycle` tests. No merge conflict is possible. 
+- **#344 ↔ j9** meet only at the stable `reuse_ok` surface — the contract references the budget probe as a black box, and neither warning key appears anywhere in the refs. Clean. +- **T3 → j9 Phase-1** dependency is sound + non-circular: T3 designs its method (not a cut-list) against j9's planned modules, with an explicit collapse fork. + +## Material findings — all CLOSED in j9 cycle 4 + +- **M1** — the Phase-1 split breaks two hard-coded offline-gate tests (`TestNoUnexpectedModHookOrPRMergeIntroduced` via the `## Hook:` allowlist; `TestGradeMarkerMatchesContract` via the `TERMINAL_TEARDOWN_BOUNDED` marker). → j9 owns an explicit retarget subtask + **AC-5** (offline gate exits 0 post-split). +- **M2** — AC-4's structural proof was unbuildable (cited a test that reads only `SKILL.md`). → rewritten as a real new `contractlint` `os.Stat`-oracle test. +- **M3** — the sprint's headline goal (<~60k / 89k saving) had no AC. → **AC-6** measured-saving live drive (greet-turn context < ceiling + no pre-greet ~89k spike, off `claude-stream.jsonl`). + +## Residuals — Polish, fold into implementation (non-blocking) + +- j9: three stale historical lines say "AC-1..AC-4" (live set is AC-1..AC-6) — mark superseded. +- j9: AC-1(b) credits the team `config.json` path to a "comm-officer hook" that doesn't ship — loose attribution; the path itself is real. +- j9 AC-6 soft spot: the ~89k is asserted (no team was created in the forensics run); the negative control rides an eager-team fixture — the present/absent cache-creation-spike signal stays falsifiable. +- T3: AC-4's collapse-decision-record cites `README.md`, now the conventional `index.md` — repoint the path at T3 implementation. + +## Provenance + +Boot forensics: `boot-analysis.md`. Per-item ideation: the entity bodies under `docs/dev/.spacedock-state/`. Preflight (4-lens) + j9 re-verify: shaping-FO session, 2026-06-13. 
From 95e86b83e812ea1ebe28de923b8668cc176ec40c Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 12:58:03 -0700 Subject: [PATCH 07/18] feat(j9): AC-6 per-turn boot-window measurement parser + offline oracle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add journeymetrics.ParseClaudeTurns (per-assistant-turn usage, deduped by message id, with tool_use names) so a caller can measure a single turn's context window — the field ParseClaudeJSONL's whole-run sum cannot report. Add the ensigncycle AC-6 oracle assertShallowBootMeasured: identify the greet turn (last non-dispatch turn) and assert the greet-turn context is below the ~60k ceiling and no pre-greet turn shows the ~89k team-mode prefix re-cache spike. De-risked offline against committed real-shape stream fixtures (a greet-and-stop positive, an eager-team negative) before the live drive relies on it, with independent signal isolation. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ensigncycle/shallow_boot_measure_test.go | 93 +++++++++++++++++++ .../shallow_boot_measure_unit_test.go | 73 +++++++++++++++ .../testdata/eager-team-boot.stream.jsonl | 5 + .../testdata/shallow-boot-greet.stream.jsonl | 5 + internal/journeymetrics/claude.go | 72 ++++++++++++++ internal/journeymetrics/claude_test.go | 33 +++++++ 6 files changed, 281 insertions(+) create mode 100644 internal/ensigncycle/shallow_boot_measure_test.go create mode 100644 internal/ensigncycle/shallow_boot_measure_unit_test.go create mode 100644 internal/ensigncycle/testdata/eager-team-boot.stream.jsonl create mode 100644 internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl diff --git a/internal/ensigncycle/shallow_boot_measure_test.go b/internal/ensigncycle/shallow_boot_measure_test.go new file mode 100644 index 00000000..b353fc87 --- /dev/null +++ b/internal/ensigncycle/shallow_boot_measure_test.go @@ -0,0 +1,93 @@ +package ensigncycle + +import ( + "fmt" + + 
"github.com/spacedock-dev/spacedock/internal/journeymetrics" +) + +// AC-6 boot-window measurement thresholds. The ceiling is the milestone's ~60k +// greet-turn context ceiling; the spike threshold is set below the ~89k team-mode +// prefix re-cache (TeamCreate re-caching the whole conversation prefix to the 1h +// cache) so any cache_creation on that order is caught, while the small +// per-turn cache_creation a healthy boot writes (a few thousand tokens) is not. +const ( + greetContextCeiling = 60000 + teamRecacheSpikeFloor = 60000 +) + +// dispatchToolNames are the tool_use names that mark a worker dispatch / team +// creation — the boundary AC-6 uses to bound the pre-greet window. A turn that +// names one of these is NOT a greet turn. +var dispatchToolNames = map[string]bool{ + "Agent": true, + "Task": true, + "TeamCreate": true, +} + +// greetTurnIndex returns the index of the greet turn in turns: the LAST assistant +// turn that emits no dispatch tool_use (the FO greets via text output and stops, +// it does not dispatch in the same turn). It returns -1 when every turn dispatches +// (no greet was produced). A shallow boot has no dispatch at all, so the greet turn +// is simply the final turn; an eager-team boot fires TeamCreate before the greet, +// so the greet turn is still the final non-dispatch turn and the TeamCreate turn +// stays inside the pre-greet window where the spike check can see it. 
+func greetTurnIndex(turns []journeymetrics.ClaudeTurn) int { + idx := -1 + for i, t := range turns { + dispatches := false + for _, name := range t.ToolNames { + if dispatchToolNames[name] { + dispatches = true + break + } + } + if !dispatches { + idx = i + } + } + return idx +} + +// assertShallowBootMeasured is the AC-6 measured-saving oracle over a captured +// claude-stream.jsonl: it parses the stream per turn, identifies the greet turn and +// the pre-greet window (turns up to and including the greet turn), and asserts +// +// (1) the greet-turn context (input + cache_read + cache_creation) is below the +// ~60k ceiling, and +// (2) no pre-greet turn shows a cache_creation spike on the order of the ~89k +// team-mode prefix re-cache. +// +// It grades the host's emitted usage numbers — an independent source the contract +// cannot fake — never a prose match. A regression that re-introduced an eager team +// create or a heavy boot read pushes the greet context over the ceiling or surfaces +// the spike, failing this oracle. +func assertShallowBootMeasured(stream string) error { + turns, err := journeymetrics.ParseClaudeTurns([]byte(stream)) + if err != nil { + return fmt.Errorf("parse stream for boot-window measurement: %w", err) + } + return assertShallowBootMeasuredTurns(turns) +} + +// assertShallowBootMeasuredTurns is the turn-level half of the AC-6 oracle, split +// out so the offline unit cases can drive the ceiling and spike checks directly +// without a stream fixture. 
+func assertShallowBootMeasuredTurns(turns []journeymetrics.ClaudeTurn) error { + if len(turns) == 0 { + return fmt.Errorf("stream carried no assistant turns — nothing to measure") + } + greet := greetTurnIndex(turns) + if greet < 0 { + return fmt.Errorf("every assistant turn dispatched — no greet turn produced") + } + if ctx := turns[greet].Context(); ctx >= greetContextCeiling { + return fmt.Errorf("greet-turn context %d is not below the ~%dk ceiling — a heavy boot read or eager team create regressed the saving", ctx, greetContextCeiling/1000) + } + for i := 0; i <= greet; i++ { + if cc := turns[i].Usage.CacheCreation; cc >= teamRecacheSpikeFloor { + return fmt.Errorf("pre-greet turn %d shows a cache_creation spike of %d (>= %d) — the ~89k team-mode prefix re-cache fired before the greet", i, cc, teamRecacheSpikeFloor) + } + } + return nil +} diff --git a/internal/ensigncycle/shallow_boot_measure_unit_test.go b/internal/ensigncycle/shallow_boot_measure_unit_test.go new file mode 100644 index 00000000..4018d92a --- /dev/null +++ b/internal/ensigncycle/shallow_boot_measure_unit_test.go @@ -0,0 +1,73 @@ +package ensigncycle + +import ( + "os" + "path/filepath" + "testing" + + "github.com/spacedock-dev/spacedock/internal/journeymetrics" +) + +// readMeasureFixture reads a committed claude-stream.jsonl fixture from testdata. +// These are trimmed copies of real stream-json (the journeymetrics precedent — a +// committed testdata/ jsonl, never a ~/.claude/projects per-machine artifact), so +// the AC-6 oracle has an offline positive and negative branch to validate against +// before the live shallow-boot run relies on it. 
+func readMeasureFixture(t *testing.T, name string) string { + t.Helper() + data, err := os.ReadFile(filepath.Join("testdata", name)) + if err != nil { + t.Fatal(err) + } + return string(data) +} + +// TestAssertShallowBootMeasuredOffline is the AC-6 de-risk: it validates the +// measured-saving oracle against committed real-shape streams BEFORE the live +// shallow-boot run spends a model on it. The positive fixture (a greet-and-stop +// boot, no TeamCreate, greet context under the ceiling) passes; the negative +// fixture (an eager-team boot with the ~89k cache_creation spike before the greet +// and a greet context over the ceiling) fails — proving the measurement +// distinguishes the realized saving from its absence, the AC-6 negative control. +func TestAssertShallowBootMeasuredOffline(t *testing.T) { + if err := assertShallowBootMeasured(readMeasureFixture(t, "shallow-boot-greet.stream.jsonl")); err != nil { + t.Fatalf("shallow-boot positive fixture must pass the measured-saving oracle: %v", err) + } + if err := assertShallowBootMeasured(readMeasureFixture(t, "eager-team-boot.stream.jsonl")); err == nil { + t.Fatal("eager-team negative fixture must FAIL the measured-saving oracle (89k spike before greet + greet context over ceiling) — else the measurement does not distinguish the realized saving from its absence") + } +} + +// TestShallowBootMeasureSignalsAreIndependent isolates the two AC-6 signals so +// neither can be silently dropped: a stream that fails ONLY the ceiling check (a +// heavy greet, no spike) and a stream that fails ONLY the spike check (a pre-greet +// 89k cache_creation, but a light greet) must each go red. +func TestShallowBootMeasureSignalsAreIndependent(t *testing.T) { + // Only the ceiling fails: a single text greet turn whose context exceeds the + // ceiling, with no cache_creation spike anywhere. 
+ heavyGreet := []journeymetrics.ClaudeTurn{ + {ID: "greet", Usage: journeymetrics.TokenTotals{Input: 100, CacheRead: greetContextCeiling, CacheCreation: 0}}, + } + if err := assertShallowBootMeasuredTurns(heavyGreet); err == nil { + t.Fatal("a greet turn whose context exceeds the ceiling (no spike) must fail on the ceiling check") + } + + // Only the spike fails: a pre-greet dispatch turn carrying the ~89k spike, then + // a light text greet under the ceiling. + spikeThenLightGreet := []journeymetrics.ClaudeTurn{ + {ID: "team", Usage: journeymetrics.TokenTotals{Input: 8, CacheCreation: 89000, CacheRead: 16000}, ToolNames: []string{"TeamCreate"}}, + {ID: "greet", Usage: journeymetrics.TokenTotals{Input: 100, CacheRead: 5000, CacheCreation: 0}}, + } + if err := assertShallowBootMeasuredTurns(spikeThenLightGreet); err == nil { + t.Fatal("a pre-greet ~89k cache_creation spike (with a light greet) must fail on the spike check") + } + + // Both clean: a light greet, no pre-greet spike — the realized-saving end-state. 
+ clean := []journeymetrics.ClaudeTurn{ + {ID: "boot", Usage: journeymetrics.TokenTotals{Input: 6, CacheCreation: 900, CacheRead: 18000}, ToolNames: []string{"Bash"}}, + {ID: "greet", Usage: journeymetrics.TokenTotals{Input: 120, CacheRead: 42000, CacheCreation: 400}}, + } + if err := assertShallowBootMeasuredTurns(clean); err != nil { + t.Fatalf("a clean shallow boot (light greet, no spike) must pass: %v", err) + } +} diff --git a/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl b/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl new file mode 100644 index 00000000..649ee159 --- /dev/null +++ b/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl @@ -0,0 +1,5 @@ +{"type":"system","subtype":"init","model":"claude-opus-4-8"} +{"type":"assistant","message":{"id":"msg_boot1","model":"claude-opus-4-8","usage":{"input_tokens":4,"output_tokens":12,"cache_creation_input_tokens":1200,"cache_read_input_tokens":8000},"content":[{"type":"tool_use","id":"toolu_ver","name":"Bash","input":{"command":"spacedock --version"}}]}} +{"type":"assistant","message":{"id":"msg_team","model":"claude-opus-4-8","usage":{"input_tokens":8,"output_tokens":30,"cache_creation_input_tokens":89000,"cache_read_input_tokens":16000},"content":[{"type":"tool_use","id":"toolu_team","name":"TeamCreate","input":{"team_name":"eager"}}]}} +{"type":"assistant","message":{"id":"msg_greet","model":"claude-opus-4-8","usage":{"input_tokens":300,"output_tokens":400,"cache_creation_input_tokens":1000,"cache_read_input_tokens":105000},"content":[{"type":"text","text":"Workflow overview: ... Gate review: ... Decision: approve or reject?"}]}} +{"type":"result","subtype":"success","usage":{"input_tokens":320,"output_tokens":472,"cache_creation_input_tokens":91200,"cache_read_input_tokens":129000},"total_cost_usd":0.31,"result":"Workflow overview: ... 
Decision: approve or reject?"} diff --git a/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl b/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl new file mode 100644 index 00000000..7b715ba0 --- /dev/null +++ b/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl @@ -0,0 +1,5 @@ +{"type":"system","subtype":"init","model":"claude-opus-4-8"} +{"type":"assistant","message":{"id":"msg_boot1","model":"claude-opus-4-8","usage":{"input_tokens":4,"output_tokens":12,"cache_creation_input_tokens":1200,"cache_read_input_tokens":8000},"content":[{"type":"tool_use","id":"toolu_ver","name":"Bash","input":{"command":"spacedock --version"}}]}} +{"type":"assistant","message":{"id":"msg_boot2","model":"claude-opus-4-8","usage":{"input_tokens":6,"output_tokens":40,"cache_creation_input_tokens":900,"cache_read_input_tokens":18000},"content":[{"type":"tool_use","id":"toolu_boot","name":"Bash","input":{"command":"spacedock status --boot --json"}}]}} +{"type":"assistant","message":{"id":"msg_greet","model":"claude-opus-4-8","usage":{"input_tokens":120,"output_tokens":300,"cache_creation_input_tokens":400,"cache_read_input_tokens":42000},"content":[{"type":"text","text":"Workflow overview: 1 task at the review gate. Gate review: ... Decision: approve or reject?"}]}} +{"type":"result","subtype":"success","usage":{"input_tokens":130,"output_tokens":352,"cache_creation_input_tokens":2500,"cache_read_input_tokens":68000},"total_cost_usd":0.04,"result":"Workflow overview: 1 task at the review gate. Gate review: ... 
Decision: approve or reject?"} diff --git a/internal/journeymetrics/claude.go b/internal/journeymetrics/claude.go index b05ccce7..decd45b9 100644 --- a/internal/journeymetrics/claude.go +++ b/internal/journeymetrics/claude.go @@ -114,6 +114,78 @@ func ParseClaudeJSONL(data []byte) (ClaudeParseResult, error) { }, nil } +// ClaudeTurn is one deduped assistant turn from a stream-json transcript: its +// per-message token usage and the names of the tool_use blocks it emitted. Unlike +// ParseClaudeJSONL (which SUMS usage across the whole run and prefers the terminal +// result usage), this preserves each turn's usage so a caller can measure a single +// turn's context window — e.g. the greet turn's boot-window context. +type ClaudeTurn struct { + ID string + Usage TokenTotals + // ToolNames is the names of the tool_use blocks in this turn, in order. A + // turn that dispatches a worker carries an "Agent" (or "TeamCreate") name, so a + // caller can split the transcript at the first dispatch turn. + ToolNames []string +} + +// Context returns this turn's context-window size as the boot analysis defines it: +// input + cache_read + cache_creation (output is generation, not context). +func (t ClaudeTurn) Context() int { + return t.Usage.Input + t.Usage.CacheRead + t.Usage.CacheCreation +} + +// ParseClaudeTurns walks the stream-json transcript per assistant turn, deduping +// repeated rows by message ID the same way ParseClaudeJSONL does, and returns one +// ClaudeTurn per distinct assistant message in stream order. It reuses the +// rawTokenUsage field extraction; it does NOT sum or prefer the terminal result +// usage, so each turn's context window is recoverable. Non-JSON lines (folded +// stderr) are skipped, matching ParseClaudeJSONL. 
+func ParseClaudeTurns(data []byte) ([]ClaudeTurn, error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) + + seen := map[string]bool{} + var turns []ClaudeTurn + lineNo := 0 + for scanner.Scan() { + lineNo++ + line := strings.TrimSpace(scanner.Text()) + if line == "" || !strings.HasPrefix(line, "{") { + continue + } + var row map[string]json.RawMessage + if err := json.Unmarshal([]byte(line), &row); err != nil { + return nil, fmt.Errorf("line %d: %w", lineNo, err) + } + if rawString(row["type"]) != "assistant" { + continue + } + msg, err := parseClaudeAssistant(row["message"]) + if err != nil { + return nil, fmt.Errorf("line %d assistant: %w", lineNo, err) + } + id := msg.ID + if id == "" { + id = fmt.Sprintf("line-%d", lineNo) + } + if seen[id] { + continue + } + seen[id] = true + var names []string + for _, block := range msg.Content { + if block.Type == "tool_use" { + names = append(names, block.Name) + } + } + turns = append(turns, ClaudeTurn{ID: id, Usage: msg.Usage, ToolNames: names}) + } + if err := scanner.Err(); err != nil { + return nil, err + } + return turns, nil +} + type claudeAssistant struct { ID string Model string diff --git a/internal/journeymetrics/claude_test.go b/internal/journeymetrics/claude_test.go index e64bd303..218bdcb0 100644 --- a/internal/journeymetrics/claude_test.go +++ b/internal/journeymetrics/claude_test.go @@ -94,6 +94,39 @@ func TestParseClaudeJSONLSkipsNonJSONStderrLines(t *testing.T) { } } +func TestParseClaudeTurnsPreservesPerTurnContextAndDedupes(t *testing.T) { + data := readTestdata(t, "claude_terminal_split.stream.jsonl") + + turns, err := ParseClaudeTurns(data) + if err != nil { + t.Fatalf("ParseClaudeTurns: %v", err) + } + // Two distinct assistant messages (msg_1 appears twice — deduped, msg_2 once); + // the terminal result row is not a turn. 
+ if len(turns) != 2 { + t.Fatalf("turns = %d, want 2 (deduped msg_1 + msg_2, no result row)", len(turns)) + } + // msg_1 carried a Bash tool_use with its own per-turn usage (NOT the run sum). + if turns[0].ID != "msg_1" { + t.Errorf("turn[0].ID = %q, want msg_1", turns[0].ID) + } + if turns[0].Usage != (TokenTotals{Input: 100, Output: 10, CacheCreation: 5, CacheRead: 20, Total: 135}) { + t.Errorf("turn[0].Usage = %+v, want the single turn's usage (not the run sum)", turns[0].Usage) + } + if turns[0].Context() != 125 { // input 100 + cache_read 20 + cache_creation 5 + t.Errorf("turn[0].Context() = %d, want 125 (input+cache_read+cache_creation)", turns[0].Context()) + } + if len(turns[0].ToolNames) != 1 || turns[0].ToolNames[0] != "Bash" { + t.Errorf("turn[0].ToolNames = %v, want [Bash]", turns[0].ToolNames) + } + if turns[1].ID != "msg_2" { + t.Errorf("turn[1].ID = %q, want msg_2", turns[1].ID) + } + if len(turns[1].ToolNames) != 0 { + t.Errorf("turn[1].ToolNames = %v, want none (text-only turn)", turns[1].ToolNames) + } +} + func readTestdata(t *testing.T, name string) []byte { t.Helper() data, err := os.ReadFile(filepath.Join("testdata", name)) From b8a97a2737b693e62c965eec2eec1686a7f2a9f3 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 12:59:32 -0700 Subject: [PATCH 08/18] test(j9): AC-4 boot-resident deferred-load-point reference closure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add TestBootResidentDeferredLoadPointsResolve in the contractlint quarantine: os.ReadFile the two boot-resident FO contract bodies (slimmed shared core + Claude runtime adapter), extract every deferred load-point they name (sibling references/*.md read-paths, lazy spacedock: invocations → skills//SKILL.md, concrete _mods/*.md → canonical mods/), and os.Stat each. The filesystem is the independent oracle, so a body naming a moved or deleted target fails — not a prose-grep. 
Empty-walk guard + a dangling-target control prove it can fail. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../boot_resident_closure_test.go | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 internal/contractlint/boot_resident_closure_test.go diff --git a/internal/contractlint/boot_resident_closure_test.go b/internal/contractlint/boot_resident_closure_test.go new file mode 100644 index 00000000..74427a2e --- /dev/null +++ b/internal/contractlint/boot_resident_closure_test.go @@ -0,0 +1,157 @@ +// ABOUTME: AC-4 reference-closure over the boot-resident FO contract bodies — every +// ABOUTME: deferred load-point they name resolves to a real file on disk (os.Stat oracle). +package contractlint + +import ( + "os" + "path/filepath" + "regexp" + "strings" + "testing" +) + +// bootResidentBodies are the two contract bodies the FO loader inlines/reads at +// boot: the slimmed shared core and the Claude runtime adapter. AC-4 walks these +// (NOT a SKILL.md, which the existing TestUserSkillReferenceClosureResolves reads) +// because the loader reads the bodies directly, and only the bodies name the +// deferred load-points the boot core defers to (a sibling reference path, a bare +// skill invocation, a canonical mod file). +var bootResidentBodies = []string{ + filepath.Join("skills", "first-officer", "references", "first-officer-shared-core.md"), + filepath.Join("skills", "first-officer", "references", "claude-first-officer-runtime.md"), +} + +// bodyReferenceRe matches a sibling reference read-path named in a contract body +// (the dispatch/merge references the split defers to), the same path shape the +// SKILL.md closure check uses, applied to body text. +var bodyReferenceRe = regexp.MustCompile(`references/[A-Za-z0-9_./-]+\.md`) + +// bodySkillRe matches a lazy skill invocation `spacedock:<name>` the boot core +// names as a deferred load point.
The first-dispatch / terminal / gate skills +// (using-claude-team, present-gate, feedback-rejection-flow) each resolve to their +// skills/<name>/SKILL.md. +var bodySkillRe = regexp.MustCompile(`spacedock:([a-z0-9-]+)`) + +// bodyModRe matches a CONCRETE _mods reference (e.g. `_mods/pr-merge.md`) the boot +// core names as a deferred mod-file load point. Brace-templated placeholders +// (`_mods/{mod_name}.md`) are NOT concrete load-points and are excluded by the +// non-brace character class. +var bodyModRe = regexp.MustCompile(`_mods/([a-z0-9][a-z0-9_.-]*\.md)`) + +// lazyLoadSkills are the skill names a boot-resident body may name as deferred +// load points. The ensign skill is the dispatched-worker contract, not a boot-core +// load point, so it is excluded; the FO-self reference would be a self-load. +var lazyLoadSkills = map[string]bool{ + "using-claude-team": true, + "present-gate": true, + "feedback-rejection-flow": true, +} + +// deferredLoadPoint is one extracted load-point: the on-disk path the body names +// and the literal token that named it (for a useful failure message). +type deferredLoadPoint struct { + path string // repo-relative resolved path + named string // the literal token in the body +} + +// extractDeferredLoadPoints parses one boot-resident body's text and returns every +// deferred load-point it names, resolved to a repo-relative on-disk path: sibling +// reference read-paths (resolved under the FO skill dir), lazy skill invocations +// (resolved to skills/<name>/SKILL.md), and concrete _mods files (resolved against +// the canonical mods/ tree the repo ships). It does NOT assert presence/absence of +// any prose — it only collects the paths the body NAMES, for the os.Stat oracle to +// resolve.
+func extractDeferredLoadPoints(body string) []deferredLoadPoint { + foSkillDir := filepath.Join("skills", "first-officer") + var out []deferredLoadPoint + seen := map[string]bool{} + add := func(p deferredLoadPoint) { + if seen[p.path] { + return + } + seen[p.path] = true + out = append(out, p) + } + for _, m := range bodyReferenceRe.FindAllString(body, -1) { + if strings.Contains(m, "{") { + continue + } + add(deferredLoadPoint{path: filepath.Join(foSkillDir, m), named: m}) + } + for _, m := range bodySkillRe.FindAllStringSubmatch(body, -1) { + name := m[1] + if !lazyLoadSkills[name] { + continue + } + add(deferredLoadPoint{path: filepath.Join("skills", name, "SKILL.md"), named: m[0]}) + } + for _, m := range bodyModRe.FindAllStringSubmatch(body, -1) { + add(deferredLoadPoint{path: filepath.Join("mods", m[1]), named: m[0]}) + } + return out +} + +// TestBootResidentDeferredLoadPointsResolve is the AC-4 reference-closure guard: a +// genuine structural check, not a prose-grep. For each boot-resident contract body +// it extracts every deferred load-point the body NAMES and os.Stats it. The +// EXPECTED value (the target exists on disk) comes from the FILESYSTEM — an +// independent source the contract text can diverge from — so a body that names a +// deferred reference at a moved/renamed/deleted path fails the stat. It is NOT the +// banned present-here/absent-there heading grep (boundary_guard_test.go): it does +// not assert the body contains or lacks any heading; it asserts every load-point +// the body points at resolves to a real file. The empty-walk guard keeps it from +// passing vacuously. 
+func TestBootResidentDeferredLoadPointsResolve(t *testing.T) { + root := repoRoot(t) + total := 0 + for _, rel := range bootResidentBodies { + data, err := os.ReadFile(filepath.Join(root, rel)) + if err != nil { + t.Fatalf("read boot-resident body %s: %v", rel, err) + } + points := extractDeferredLoadPoints(string(data)) + for _, p := range points { + total++ + if _, err := os.Stat(filepath.Join(root, p.path)); err != nil { + t.Errorf("%s names deferred load-point %q which resolves to %s — but no such file exists on disk: %v", rel, p.named, p.path, err) + } + } + } + if total == 0 { + t.Fatal("extracted zero deferred load-points from the boot-resident bodies — extraction bug; the closure check would pass vacuously") + } +} + +// TestBootResidentDeferredLoadPointGuardFailsOnDanglingTarget is the AC-4 control: +// it points a boot-resident-style fixture body at a non-existent deferred reference +// and proves the closure logic goes RED, so the guard is shown able to fail (not a +// guard that can only ever pass). It drives the same extraction + os.Stat the real +// guard uses, against a planted fixture, so the control exercises the real code +// path rather than re-implementing it. 
+func TestBootResidentDeferredLoadPointGuardFailsOnDanglingTarget(t *testing.T) { + root := repoRoot(t) + fixture := "At first dispatch, read references/claude-fo-this-file-does-not-exist.md\n" + + "and at terminal, invoke spacedock:using-claude-team.\n" + points := extractDeferredLoadPoints(fixture) + if len(points) == 0 { + t.Fatal("control fixture extracted no load-points — the dangling-target case never exercises the stat") + } + var sawDangling, sawReal bool + for _, p := range points { + _, err := os.Stat(filepath.Join(root, p.path)) + if strings.Contains(p.named, "does-not-exist") { + if err == nil { + t.Fatalf("control: the dangling reference %q unexpectedly resolved on disk", p.named) + } + sawDangling = true + } else if err == nil { + sawReal = true + } + } + if !sawDangling { + t.Fatal("control: the dangling deferred reference was not extracted — the guard cannot fail on a moved/deleted target") + } + if !sawReal { + t.Fatal("control: the real load-point (using-claude-team) was not resolved — the discriminator has nothing to contrast the dangling case against") + } +} From aa83d95c2d08794924c2820e46ad4c55727efc85 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 13:01:43 -0700 Subject: [PATCH 09/18] test(j9): pin pr_state live merge state + gh-absent degraded branch The shallow-boot greet and the S7b before-greet merged-PR sweep rest on status --boot --json's pr_state.entries[].state reflecting LIVE gh merge state, not the stored pr field. Pin it: a stubbed gh reporting MERGED for a PR-bearing non-terminal entity must surface as state=MERGED (live), and an absent gh must surface as pr_state.status="gh not available" with no entries (the M6 degraded branch the greet keys off to report merge state UNKNOWN). Offline, deterministic. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/status/live_prstate_pin_test.go | 122 ++++++++++++++++++ .../live-prstate-workflow/010-pr-pending.md | 15 +++ .../testdata/live-prstate-workflow/README.md | 25 ++++ 3 files changed, 162 insertions(+) create mode 100644 internal/status/live_prstate_pin_test.go create mode 100644 internal/status/testdata/live-prstate-workflow/010-pr-pending.md create mode 100644 internal/status/testdata/live-prstate-workflow/README.md diff --git a/internal/status/live_prstate_pin_test.go b/internal/status/live_prstate_pin_test.go new file mode 100644 index 00000000..291e8af5 --- /dev/null +++ b/internal/status/live_prstate_pin_test.go @@ -0,0 +1,122 @@ +// ABOUTME: Shallow-boot accuracy pin — pr_state.entries[].state in `status --boot +// ABOUTME: --json` reflects LIVE gh merge state (gh present) or an explicit unknown (gh absent). +package status + +import ( + "encoding/json" + "os" + "path/filepath" + "runtime" + "testing" +) + +// writeStubGh writes a `gh` shim into a fresh temp dir that prints the given +// merge state for `gh pr view ... --json state --jq .state` and returns the dir. +// The shim lets the offline pin drive checkPRStates' live `gh pr view` shell-out +// deterministically — no network, no real PR. +func writeStubGh(t *testing.T, state string) string { + t.Helper() + if runtime.GOOS == "windows" { + t.Skip("gh stub shim is a POSIX shell script") + } + dir := t.TempDir() + script := "#!/bin/sh\n" + + "# Stub gh: emit a fixed PR state for `gh pr view ... --json state --jq .state`.\n" + + "echo " + state + "\n" + path := filepath.Join(dir, "gh") + if err := os.WriteFile(path, []byte(script), 0o755); err != nil { + t.Fatal(err) + } + return dir +} + +// bootPRState runs `status --boot --json` over the workflow root with PATH set to +// pathValue in BOTH the process env (so checkPRStates' bare exec.Command("gh") +// resolves the shim) and the Request env (so lookupExecutable finds it).
It returns +// the parsed pr_state section. +func bootPRState(t *testing.T, root, pathValue string) (status string, entries []map[string]string) { + t.Helper() + t.Setenv("PATH", pathValue) // checkPRStates runs exec.Command("gh") against the process PATH + env := []string{ + "PYTHONUTF8=1", + "LANG=C.UTF-8", + "LC_ALL=C.UTF-8", + "USER=pinned-actor", + "HOME=" + t.TempDir(), + "PATH=" + pathValue, + } + out, errOut, code := runNative(t, root, env, "--workflow-dir", root, "--boot", "--json") + if code != 0 { + t.Fatalf("--boot --json exit=%d stderr=%q", code, errOut) + } + var boot struct { + PRState struct { + Status string `json:"status"` + Entries []map[string]string `json:"entries"` + } `json:"pr_state"` + } + if err := json.Unmarshal([]byte(out), &boot); err != nil { + t.Fatalf("parse --boot --json: %v\n%s", err, out) + } + return boot.PRState.Status, boot.PRState.Entries +} + +// TestBootPRStateCarriesLiveMergeState is the shallow-boot accuracy pin: with `gh` +// on PATH, a PR-bearing non-terminal entity's pr_state entry reflects the LIVE +// merge state (`gh pr view` → MERGED), not just the stored `pr:` field. The +// shallow-boot greet and the S7b before-greet merged-PR sweep both rest on this; a +// regression that dropped the live `gh pr view` and echoed only the stored field +// would not report a freshly-merged PR, breaking both. The stub `gh` makes the +// live state deterministic and offline. 
+func TestBootPRStateCarriesLiveMergeState(t *testing.T) { + root, err := filepath.Abs(filepath.Join("testdata", "live-prstate-workflow")) + if err != nil { + t.Fatal(err) + } + stubDir := writeStubGh(t, "MERGED") + + status, entries := bootPRState(t, root, stubDir) + if status != "ok" { + t.Fatalf("pr_state.status = %q, want ok (gh present)", status) + } + if len(entries) != 1 { + t.Fatalf("pr_state has %d entries, want 1 (the PR-pending entity)", len(entries)) + } + e := entries[0] + if e["pr"] != "#42" { + t.Fatalf("pr_state entry pr = %q, want #42 (the stored field)", e["pr"]) + } + // The load-bearing pin: the serialized state is the LIVE gh state, not the + // stored field. The stub reports MERGED for a stored pr of #42, so a MERGED + // state proves the boot ran `gh pr view` live and serialized its result. + if e["state"] != "MERGED" { + t.Fatalf("pr_state entry state = %q, want MERGED (the LIVE gh state) — the boot must run `gh pr view` live, not echo the stored pr field", e["state"]) + } +} + +// TestBootPRStateGhAbsentReportsUnknown pins the M6 degraded branch: when `gh` is +// absent from PATH, checkPRStates returns status "gh not available" with NO +// entries. The shallow-boot greet keys off this to state PR merge status is +// UNKNOWN (S7b skipped) rather than asserting an unknowable state. A PATH stripped +// of `gh` reproduces the branch. +func TestBootPRStateGhAbsentReportsUnknown(t *testing.T) { + root, err := filepath.Abs(filepath.Join("testdata", "live-prstate-workflow")) + if err != nil { + t.Fatal(err) + } + // An isolated empty bin dir as PATH: no `gh`, but also no `git` etc. — boot only + // needs the workflow files for this section, and the PR-state probe is the part + // under test. If boot needs git, it is found via the stored fixture, not PATH. + emptyDir := t.TempDir() + // Guard the fixture: there must be no `gh` anywhere on this PATH. 
+ if _, statErr := os.Stat(filepath.Join(emptyDir, "gh")); statErr == nil { + t.Fatal("empty PATH dir unexpectedly contains a gh shim") + } + status, entries := bootPRState(t, root, emptyDir) + if status != "gh not available" { + t.Fatalf("pr_state.status = %q, want \"gh not available\" (gh absent)", status) + } + if len(entries) != 0 { + t.Fatalf("pr_state has %d entries with gh absent, want 0 — the greet must report UNKNOWN, not a stale state", len(entries)) + } +} diff --git a/internal/status/testdata/live-prstate-workflow/010-pr-pending.md b/internal/status/testdata/live-prstate-workflow/010-pr-pending.md new file mode 100644 index 00000000..7f6045c8 --- /dev/null +++ b/internal/status/testdata/live-prstate-workflow/010-pr-pending.md @@ -0,0 +1,15 @@ +--- +id: "010" +title: PR-pending non-terminal entity +status: implementation +pr: "#42" +score: "0.50" +source: roadmap +--- +# PR-pending non-terminal entity + +A non-terminal entity carrying a `pr:` field. At boot, `checkPRStates` runs +`gh pr view 42 --json state` LIVE and serializes the result into +`pr_state.entries[].state`. The stored `pr:` is `#42`; the live state is whatever +`gh` reports (MERGED in the pin test), proving the boot envelope carries live +merge state, not the stored field. 
diff --git a/internal/status/testdata/live-prstate-workflow/README.md b/internal/status/testdata/live-prstate-workflow/README.md new file mode 100644 index 00000000..ba2112c1 --- /dev/null +++ b/internal/status/testdata/live-prstate-workflow/README.md @@ -0,0 +1,25 @@ +--- +entity-type: task +entity-label: task +entity-label-plural: tasks +id-style: sequential +stages: + defaults: + worktree: false + concurrency: 1 + states: + - name: backlog + initial: true + - name: implementation + worktree: true + - name: done + terminal: true +--- + +# Live-PR-State Fixture Workflow + +Pins the shallow-boot accuracy dependency: a PR-bearing non-terminal entity's +`pr_state` entry in `status --boot --json` must reflect the LIVE merge state (from +`gh pr view`), not just the stored `pr:` field. The shallow-boot greet and the +S7b before-greet merged-PR sweep both rest on this. A stubbed `gh` on PATH supplies +the live state in the test. From 5f4baaf9e1aec1cf4c4e3f6763109ca31eb9455c Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 13:03:12 -0700 Subject: [PATCH 10/18] feat(j9): P2 lazy-TeamCreate + P3 shallow-boot-then-greet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P2 (Claude runtime adapter): the team-harness Skill invocation moves from "at startup" to "before the first team-mode dispatch" — a greet-and-stop boot never creates a team and never pays the ~89k team-mode prefix re-cache. 
P3 (shared-core Startup): slim step 4 to the README frontmatter (defer the body); read status --boot --json (no mod-file read, no team); split MODS into the boot-resident MODS-REPORT vs the deferred RUN-STARTUP-HOOKS action and PR_STATE into the boot-resident report vs the S7b action; add S7b, the before-greet merged-PR sweep gated on a MERGED pr_state entry (reads pr-merge.md only when there is a merge to advance, skips on gh-absent); end with greet-then- stop, with all expensive deferrals (team, dispatch/merge modules, comm-officer) past the greet. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../references/claude-first-officer-runtime.md | 4 ++-- .../references/first-officer-shared-core.md | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/skills/first-officer/references/claude-first-officer-runtime.md b/skills/first-officer/references/claude-first-officer-runtime.md index e2f2c5ae..a243ced7 100644 --- a/skills/first-officer/references/claude-first-officer-runtime.md +++ b/skills/first-officer/references/claude-first-officer-runtime.md @@ -4,11 +4,11 @@ This file defines how the shared first-officer core executes on Claude Code. ## Team Creation -At startup (after reading the README, before dispatch), invoke the generic Claude-team-harness discipline: +Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke the generic Claude-team-harness discipline: Skill(skill="spacedock:using-claude-team") -This loads the generic team lifecycle — deferred team-tool ToolSearch hop, TeamCreate-first sequencing and naming, the TeamCreate recovery procedure and the failure-recovery ladder, Degraded Mode, Awaiting Completion, and Terminal Team Teardown. Invoke it before the first team-mode tool call in the session. 
The spacedock-specific decisions below stay inline; the generic blocks they reference (`## Degraded Mode`, `## Awaiting Completion`, `## Terminal Team Teardown`) live in that skill, not in this file. +This loads the generic team lifecycle — deferred team-tool ToolSearch hop, TeamCreate-first sequencing and naming, the TeamCreate recovery procedure and the failure-recovery ladder, Degraded Mode, Awaiting Completion, and Terminal Team Teardown. Invoke it before the first team-mode tool call in the session — NOT at boot. A boot that greets and stops for input never dispatches, so it never creates a team and never pays the team-mode prefix re-cache. The spacedock-specific decisions below stay inline; the generic blocks they reference (`## Degraded Mode`, `## Awaiting Completion`, `## Terminal Team Teardown`) live in that skill, not in this file. In single-entity mode, skip team creation. Use bare-mode dispatch for all agent spawning — the Agent tool without `team_name` blocks until the subagent completes, which prevents premature session termination in `-p` mode. diff --git a/skills/first-officer/references/first-officer-shared-core.md b/skills/first-officer/references/first-officer-shared-core.md index 762c63ca..6d85cdd6 100644 --- a/skills/first-officer/references/first-officer-shared-core.md +++ b/skills/first-officer/references/first-officer-shared-core.md @@ -13,18 +13,21 @@ Shared first-officer semantics. Keep aligned with `agents/first-officer.md` and In every class, do NOT proceed to discovery or `--boot`. 2. Discover the project root with `git rev-parse --show-toplevel`. 3. Discover the workflow directory. Prefer an explicit user-provided path; otherwise `spacedock status --discover`: one path → use it; zero → report no workflow found; multiple → present the list (or fail with an ambiguity error in single-entity mode). -4. 
Read `{workflow_dir}/README.md` for mission, entity labels, stage ordering and defaults from `stages.defaults` / `stages.states`, and stage properties (`initial`, `terminal`, `gate`, `worktree`, `concurrency`, `feedback-to`, `agent`). -5. Run `spacedock status --boot` for all startup information in one call. Output sections: - - **MODS** — registered hooks by lifecycle point (startup, idle, merge). Run startup hooks before normal dispatch. +4. Read the `{workflow_dir}/README.md` **frontmatter** only — mission line, entity labels (`entity-label` / `entity-label-plural`), `id-style`, and stage taxonomy: stage names/ordering and the per-stage flags the greet and gate need (`initial`, `terminal`, `gate`, `worktree`, `feedback-to`, `agent`) from `stages.defaults` / `stages.states`. DEFER the README body (per-stage prose, proof policy, templates, CI docs); the boot JSON does not carry the stage taxonomy, so the frontmatter read stays before-greet, but the body loads only when the phase that consumes it runs (a dispatch copies a stage subsection; the merge ceremony reads `merge:` policy). A greet-and-stop boot never reads the body. +5. Run `spacedock status --boot --json` for all startup information in one call. Consume it as JSON (every value a string); the human-formatted table is NOT rendered for the FO's own reasoning. The before-greet boot is all READS — none reads a mod file or creates a team. Sections: + - **MODS** (MODS-REPORT) — the `mods` map names which hooks are registered at which lifecycle point (startup, idle, merge). Reading the map does NOT read any mod file; it is what lets the greet *report* a registered hook (a pending merge-PR advancement, a comm-officer spawn) without opening the mod. Actually running the startup hooks (RUN-STARTUP-HOOKS) is deferred: the comm-officer spawn defers to first dispatch (it needs a live team); the pr-merge startup-hook advancement runs before-greet at S7b below, gated on an actually-merged PR. 
- **ID_STYLE** — `sequential`, `sd-b32`, or `slug`. - **NEXT_ID** — strategy-dependent ID candidate (not a reservation for `sd-b32`; `n/a (id-style: slug)` for `slug`). - **MIN_PREFIX** — `sd-b32` only; currently `MIN_PREFIX: 2`. - **ORPHANS** — worktree fields cross-referenced against filesystem and git state. Report anomalies; do not auto-redispatch. - - **PR_STATE** — PR-pending entities with current merge state. Advance merged PRs. + - **PR_STATE** — PR-pending entities with current LIVE merge state. This is the boot-resident report the greet renders from; advancing a merged PR is the S7b action below, not a read. - **DISPATCHABLE** — entities ready for dispatch (same as `--next`). + - **TEAM_STATE** — whether a team is already present; the greet reports it but does NOT create one. - **STATE_BACKEND** — `split-root` or `single-root`, the resolved entity dir, and whether it is present. The split-root halt-gate below keys off this. 6. **Split-root state halt-gate.** If `state_backend == split-root` AND `entity_dir_present == false`, the state checkout is NOT initialized (orphan branch on origin without a linked worktree — fresh clone or removed worktree). The boot table would render EMPTY and `--validate` VALID — a silent failure. HALT dispatch, report "state not initialized," and run (or prompt the captain to run) `spacedock state init` (manual fallback: `git fetch origin && git worktree add `). Re-read `--boot` and proceed only once `entity_dir_present == true`. -7. **Split-root pull-on-boot.** Before the first dispatch, `git -C pull --rebase origin ` to integrate peers' state (one pull at boot, NOT per-read). On CONFLICT, follow the rebase-conflict halt in **State Management** below: HALT, `git rebase --abort`, surface the conflict, and stop — do not dispatch against an unmerged state tree. +7. **Split-root pull-on-boot.** Before the greet, `git -C pull --rebase origin ` to integrate peers' state (one pull at boot, NOT per-read). 
On CONFLICT, follow the rebase-conflict halt in **State Management** below: HALT, `git rebase --abort`, surface the conflict, and stop — do not dispatch against an unmerged state tree. +8. **Merged-PR sweep (S7b, before-greet).** For each `pr_state` entry whose `state == "MERGED"` and whose entity status is non-terminal, read `_mods/pr-merge.md` and run its startup-hook advancement (clear `mod-block`, terminalize `verdict=PASSED`, archive, remove the worktree). Skip this step entirely when no such entry exists — the common boot reads zero mod files and pays nothing. When `pr_state.status == "gh not available"`, the merge state is unknowable: skip the sweep (following the pr-merge mod's own "warn the captain and skip PR state checks" instruction) and treat merge status as UNKNOWN in the greet, not as a stale or absent state. This is the one mod-file read correctness-bound to the greet: a boot that greets and stops never enters the event loop, so a merged PR would be reported off live `pr_state` but never advanced unless it is advanced here. +9. **Greet the captain, then stop for input.** Compose a state summary from the boot JSON (orphans, PR state including any S7b-advanced entities, dispatchables, team state) and the README frontmatter (entity label, stage taxonomy, gate flags), and present it. With `gh` absent, state that PR merge status is UNKNOWN for PR-bearing entities ("{N} PR-pending entit{y/ies}; merge state unknown — `gh` not available") rather than asserting an unknowable state. If an entity sits at a `gate: true` stage ready for review, present the gate (gates are captain-facing text, not team messages — no team is needed). Then STOP for input — do NOT auto-dispatch. The expensive deferrals (the team via `## Team Creation`, the dispatch and merge reference modules, the comm-officer spawn) all stay past the greet; the FO reaches them when the captain's direction first triggers a dispatch or a terminal merge. 
## Status Viewer From fa30d171b8304d39c35d96cc69dcb249f2c9cae2 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 13:10:25 -0700 Subject: [PATCH 11/18] test(j9): add the shallow-boot shared runtime scenario (AC-1/AC-2/AC-6) Wire a new host-neutral shallow-boot scenario per the README 4-step procedure: the definition + meta-test pin + doc-lock seed block; a fixture (a gate-check at a human gate + a PR-bearing entity whose stubbed gh reports MERGED, the canonical pr-merge mod registered) and prompt; a host-neutral durable-state assertion (assertShallowBoot: greet + gate presented, S7b merged-PR advanced+archived before-greet, no team config on disk, no dispatch); an offline negative case proving each sub-assertion goes red; Claude + Codex runners + a Pi coverage entry. The Claude runner also grades AC-2 (no TeamCreate before greet, over the tool stream) and AC-6 (greet-turn context below the ~60k ceiling, no pre-greet ~89k cache_creation spike, over the captured token stream). Parity/definition/doc-lock guards pass at zero spend. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/specs/scenario-testing-principles.md | 1 + .../ensigncycle/claude_live_runner_test.go | 71 ++++++++ .../ensigncycle/codex_live_runner_test.go | 29 ++++ .../ensigncycle/pi_shared_coverage_test.go | 4 + .../ensigncycle/shallow_boot_assert_test.go | 158 ++++++++++++++++++ .../shallow_boot_fixture_live_test.go | 63 +++++++ .../ensigncycle/shallow_boot_measure_test.go | 36 ++++ .../shallow_boot_measure_unit_test.go | 13 ++ internal/ensigncycle/shared_fixtures_test.go | 91 ++++++++++ .../ensigncycle/shared_scenarios_meta_test.go | 1 + .../shared_scenarios_negative_test.go | 82 +++++++++ internal/ensigncycle/shared_scenarios_test.go | 8 + 12 files changed, 557 insertions(+) create mode 100644 internal/ensigncycle/shallow_boot_assert_test.go create mode 100644 internal/ensigncycle/shallow_boot_fixture_live_test.go diff --git a/docs/specs/scenario-testing-principles.md b/docs/specs/scenario-testing-principles.md index 3def6c94..b782a68f 100644 --- a/docs/specs/scenario-testing-principles.md +++ b/docs/specs/scenario-testing-principles.md @@ -60,6 +60,7 @@ The first foundation is the host-neutral runtime scenarios already shipped and h - `feedback-3-cycle-escalation` — on the third consecutive REJECTED validation the FO escalates to the human instead of auto-bouncing a fourth time. - `merge-hook-guardrail` — the FO cannot bypass a registered merge hook by terminalizing without pr, mod-block, or force. - `filing` — the FO files a new seed entity via the atomic `spacedock new ` path, not the drift-prone `--next-id` + hand-write pair. +- `shallow-boot` — a freshly-booted FO greets and reports accurate state, advances a merged PR before-greet (S7b), with no team created and no worker dispatched, then stops for input. These IDs are the code-backed source of truth. They mirror the `sharedRuntimeScenarios()` table in `internal/ensigncycle`; the seed IDs declared above must equal that table. 
This block is machine-readable so a lock test can bind the doc to the code and red on drift in either direction — adding, dropping, or renaming a scenario on one side without the other. This is what makes the doc the human-readable face of a code-backed truth rather than prose bound to nothing. diff --git a/internal/ensigncycle/claude_live_runner_test.go b/internal/ensigncycle/claude_live_runner_test.go index 892d8f61..b1f36708 100644 --- a/internal/ensigncycle/claude_live_runner_test.go +++ b/internal/ensigncycle/claude_live_runner_test.go @@ -42,6 +42,34 @@ type claudeLiveRunner struct { env []string model string artifactRoot string + // home is the isolated HOME the env sets (a per-run temp dir). The shallow-boot + // scenario checks ~/.claude/teams/{...}/config.json under it for the + // lazy-TeamCreate proof — scoped to THIS run, never a stale prior team. + home string +} + +// withPATHPrefix returns env with dir prepended to its PATH entry, so a stub +// binary in dir resolves before any real one. The shallow-boot runner uses it to +// put the stub `gh` (reporting MERGED) on the FO subprocess PATH. 
+func withPATHPrefix(env []string, dir string) []string { + out := make([]string, 0, len(env)+1) + found := false + for _, kv := range env { + if rest, ok := strings.CutPrefix(kv, "PATH="); ok { + found = true + if rest != "" { + out = append(out, "PATH="+dir+string(os.PathListSeparator)+rest) + } else { + out = append(out, "PATH="+dir) + } + continue + } + out = append(out, kv) + } + if !found { + out = append(out, "PATH="+dir) + } + return out } type claudeScenarioResult struct { @@ -103,6 +131,7 @@ func claudeScenarioRunners() map[string]func(*testing.T, claudeLiveRunner, share "feedback-3-cycle-escalation": runClaudeFeedback3CycleEscalationScenario, "merge-hook-guardrail": runClaudeMergeHookGuardrailScenario, "filing": runClaudeFilingScenario, + "shallow-boot": runClaudeShallowBootScenario, } } @@ -120,12 +149,14 @@ func newClaudeLiveRunner(t *testing.T) claudeLiveRunner { env := isolatedClaudeEnv(t, os.Getenv("HOME")) env = withBinaryOnPath(env, binary) + home, _ := envValue(env, "HOME") return claudeLiveRunner{ binary: binary, repoRoot: repo, env: env, model: model, artifactRoot: claudeLiveArtifactDir(t, "claude-shared-scenarios"), + home: home, } } @@ -225,6 +256,46 @@ func runClaudeFilingScenario(t *testing.T, runner claudeLiveRunner, scenario sha emitClaudeScenarioMetrics(t, scenario, result, runner.model) } +// runClaudeShallowBootScenario drives the real FO against the shallow-boot fixture +// (a gate-check entity at a human gate + a PR-bearing entity whose stubbed `gh` +// reports MERGED) with a per-run isolated team root, and grades the durable +// end-state: the FO greets and presents the gate, S7b advances+archives the merged +// PR before-greet, NO team config lands on disk, and NO worker is dispatched. It +// then asserts the AC-2 behavioral signal (no TeamCreate before the greet) and the +// AC-6 measured signal (greet-turn context below the ~60k ceiling, no pre-greet +// ~89k cache_creation spike) over the captured stream. 
+func runClaudeShallowBootScenario(t *testing.T, runner claudeLiveRunner, scenario sharedRuntimeScenario) { + t.Helper() + workflowRoot := t.TempDir() + fixture := writeShallowBootWorkflow(t, workflowRoot) + gateBefore := readFile(t, fixture.gateEntityPath) + + // The stub `gh` (reporting MERGED) must resolve on the FO subprocess PATH so the + // boot's live pr_state probe and the pr-merge startup hook both see the merge. + scenarioRunner := runner + scenarioRunner.env = withPATHPrefix(runner.env, fixture.stubGhDir) + + result := scenarioRunner.run(t, scenario, workflowRoot, shallowBootPrompt()) + + // The Claude team root is {home}/.claude/teams — the exact path the comm-officer + // startup hook membership-checks and TeamCreate writes a team config.json under. + teamRoot := filepath.Join(runner.home, ".claude", "teams") + obs := gatherShallowBootObservation(t, workflowRoot, teamRoot, fixture, gateBefore, result.finalMessage) + if err := assertShallowBoot(obs); err != nil { + t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", err, result.finalMessage, result.artifactDir) + } + // AC-2: no TeamCreate before the greet (behavioral, over the tool-call sequence). + if err := assertNoTeamCreateBeforeGreet(result.stream); err != nil { + t.Fatalf("%v\nArtifacts: %s", err, result.artifactDir) + } + // AC-6: the greet-turn context is below the ceiling and no pre-greet 89k + // cache_creation spike (measured, over the captured token stream). + if err := assertShallowBootMeasured(result.stream); err != nil { + t.Fatalf("%v\nArtifacts: %s", err, result.artifactDir) + } + emitClaudeScenarioMetrics(t, scenario, result, runner.model) +} + // run launches the real `spacedock claude` front door for one shared scenario and // returns the (finalMessage, full stream) the shared assertions consume. 
The // launch shape is the spike WINNER: --plugin-dir + --skip-contract-check are the diff --git a/internal/ensigncycle/codex_live_runner_test.go b/internal/ensigncycle/codex_live_runner_test.go index 5fd4413e..e6c1b847 100644 --- a/internal/ensigncycle/codex_live_runner_test.go +++ b/internal/ensigncycle/codex_live_runner_test.go @@ -76,6 +76,7 @@ func codexScenarioRunners() map[string]func(*testing.T, codexLiveRunner, sharedR "feedback-3-cycle-escalation": runCodexFeedback3CycleEscalationScenario, "merge-hook-guardrail": runCodexMergeHookGuardrailScenario, "filing": runCodexFilingScenario, + "shallow-boot": runCodexShallowBootScenario, } } @@ -236,6 +237,34 @@ func runCodexFilingScenario(t *testing.T, runner codexLiveRunner, scenario share emitCodexScenarioMetrics(t, scenario, result) } +// runCodexShallowBootScenario drives the real FO against the shallow-boot fixture +// and grades the SAME host-neutral durable end-state assertShallowBoot the Claude +// runner feeds: the FO greets and presents the gate, S7b advances+archives the +// merged PR before-greet, and NO worker is dispatched (the gate entity is +// unchanged, not archived, no worktree). Codex has no Claude team root, so the +// no-team-config check is host-neutral-vacuous (empty teamRoot); the no-dispatch +// proof rides the durable gate-unchanged + no-worktree facts. The AC-2/AC-6 Claude +// token-stream measurements are Claude-specific and live in the Claude runner. +func runCodexShallowBootScenario(t *testing.T, runner codexLiveRunner, scenario sharedRuntimeScenario) { + t.Helper() + workflowRoot := t.TempDir() + fixture := writeShallowBootWorkflow(t, workflowRoot) + gateBefore := readFile(t, fixture.gateEntityPath) + + // The stub `gh` (reporting MERGED) must resolve on the FO subprocess PATH so the + // boot's live pr_state probe and the pr-merge startup hook both see the merge. 
+ scenarioRunner := runner + scenarioRunner.env = withPATHPrefix(runner.env, fixture.stubGhDir) + + result := scenarioRunner.run(t, scenario, workflowRoot, shallowBootPrompt()) + + obs := gatherShallowBootObservation(t, workflowRoot, "", fixture, gateBefore, result.finalMessage) + if err := assertShallowBoot(obs); err != nil { + t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", err, result.finalMessage, result.artifactDir) + } + emitCodexScenarioMetrics(t, scenario, result) +} + // run launches `codex exec --json` for one shared scenario. Liveness is the SAME // streamWatcher the Claude runner and the live cycle use — one mechanism, no second // impl. drainToExit runs the process to exit accumulating the full --json diff --git a/internal/ensigncycle/pi_shared_coverage_test.go b/internal/ensigncycle/pi_shared_coverage_test.go index e2688169..a0d0bbcc 100644 --- a/internal/ensigncycle/pi_shared_coverage_test.go +++ b/internal/ensigncycle/pi_shared_coverage_test.go @@ -31,6 +31,10 @@ func piSharedScenarioCoverageMap() map[string]piSharedScenarioCoverage { mode: "gap", reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer filing runner.", }, + "shallow-boot": { + mode: "gap", + reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer shallow-boot runner.", + }, } } diff --git a/internal/ensigncycle/shallow_boot_assert_test.go b/internal/ensigncycle/shallow_boot_assert_test.go new file mode 100644 index 00000000..018836eb --- /dev/null +++ b/internal/ensigncycle/shallow_boot_assert_test.go @@ -0,0 +1,158 @@ +package ensigncycle + +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + "testing" +) + +// shallowBootObservation is the host-neutral set of durable, on-disk facts the +// shallow-boot scenario grades, plus the FO's final message. 
The runner gathers +// these from THIS run's filesystem (per-run temp HOME for the team root, the +// workflow root for entity/archive/worktree state) so the assertion never reads a +// stale prior team or a transcript phrase as authoritative. +type shallowBootObservation struct { + // finalMessage is the FO's final greet output. + finalMessage string + // gateBefore / gateAfter is the gate-check entity frontmatter before and after + // the boot — it must be byte-identical (the FO presented the gate, did not + // dispatch or self-approve). + gateBefore string + gateAfter string + // mergedAfter is the merged-PR entity's durable state read from wherever it + // lives after the boot (the archive, once S7b advances it). Empty when the file + // is gone from the active dir AND absent from the archive. + mergedAfter string + // mergedArchived is true when the merged-PR entity was moved to _archive/. + mergedArchived bool + // gateWorktreeCreated is true when a .worktrees/ dir was created for the gate + // entity (it must NOT be — no dispatch happened). + gateWorktreeCreated bool + // gateArchived is true when the gate entity was archived (it must NOT be — it is + // parked at its gate). + gateArchived bool + // teamConfigOnDisk is true when a team config.json exists under THIS run's team + // root (it must NOT be — lazy-TeamCreate means no team is created at boot). + teamConfigOnDisk bool +} + +var ( + mergedTerminalStatus = regexp.MustCompile(`(?im)^status:\s*done\s*$`) + mergedVerdictPassed = regexp.MustCompile(`(?im)^verdict:\s*PASSED\s*$`) + mergedModBlockClear = regexp.MustCompile(`(?im)^mod-block:\s*$`) +) + +// assertShallowBoot is the host-neutral AC-1 oracle over the run's durable on-disk +// state and final message. 
It grades, on independent on-disk facts (never a +// transcript phrase as the sole signal): +// +// (a) the greet presents a gate review + decision prompt; +// (a2) the S7b merged-PR entity is advanced before-greet — terminal frontmatter +// (done / verdict PASSED / mod-block cleared) AND archived (the M3 proof); +// (b) NO team artifact on disk (lazy-TeamCreate) AND no worker dispatched (the +// gate entity is unchanged, not archived, no worktree created); +// (c) the FO stopped for input (it presented a gate, did not advance it). +// +// The absence-of-team-config is the lazy-TeamCreate proof; the unchanged gate +// frontmatter is the shallow-boot / no-dispatch proof; the advanced+archived merged +// entity is the S7b proof. +func assertShallowBoot(o shallowBootObservation) error { + // (b) lazy-TeamCreate: no team artifact created at boot. + if o.teamConfigOnDisk { + return fmt.Errorf("a team config.json exists under this run's team root — the boot created a team (lazy-TeamCreate was not honored)") + } + // (b) no-dispatch: the gate entity must be byte-identical and not advanced. + if o.gateBefore != o.gateAfter { + return fmt.Errorf("the gated entity's frontmatter changed during boot — a worker was dispatched or the gate self-resolved") + } + if !reviewStatus.MatchString(o.gateAfter) { + return fmt.Errorf("the gated entity is no longer at status: review — it was advanced past its gate") + } + if completedSet.MatchString(o.gateAfter) || verdictSetFM.MatchString(o.gateAfter) { + return fmt.Errorf("the gated entity has completed/verdict set — the boot self-approved instead of presenting the gate") + } + if o.gateArchived { + return fmt.Errorf("the gated entity was archived during boot — it must stay parked at its gate") + } + if o.gateWorktreeCreated { + return fmt.Errorf("a worktree was created for the gated entity — a dispatch happened at boot") + } + // (a2) S7b: the merged-PR entity is advanced and archived before-greet. 
+ if !o.mergedArchived { + return fmt.Errorf("the merged-PR entity was not archived — S7b did not advance it before the greet") + } + if !mergedTerminalStatus.MatchString(o.mergedAfter) { + return fmt.Errorf("the merged-PR entity is not at the terminal stage (status: done) — S7b advancement is incomplete") + } + if !mergedVerdictPassed.MatchString(o.mergedAfter) { + return fmt.Errorf("the merged-PR entity has no verdict: PASSED — S7b advancement is incomplete") + } + if !mergedModBlockClear.MatchString(o.mergedAfter) { + return fmt.Errorf("the merged-PR entity still carries a mod-block — S7b did not clear it on advancement") + } + // (a) the greet presents a gate review + decision prompt. + lowerFinal := strings.ToLower(o.finalMessage) + if !strings.Contains(lowerFinal, "gate review:") || !strings.Contains(lowerFinal, "decision:") { + return fmt.Errorf("the greet did not present a gate review and decision prompt") + } + return nil +} + +// gatherShallowBootObservation reads the run's durable on-disk state into a +// shallowBootObservation: the gate entity's post-boot frontmatter, the merged +// entity's state (from the archive once S7b advances it, else its active path), the +// archive/worktree facts, and the team-config-on-disk check under the host's team +// root. It is host-neutral over the entity/archive/worktree state; teamRoot is the +// host's team-config root (Claude: {home}/.claude/teams) — an empty teamRoot means +// the host writes no team config there and the check is vacuously false. +func gatherShallowBootObservation(t *testing.T, workflowRoot, teamRoot string, fx shallowBootFixture, gateBefore, finalMessage string) shallowBootObservation { + t.Helper() + o := shallowBootObservation{ + finalMessage: finalMessage, + gateBefore: gateBefore, + gateAfter: readFileAllowMissing(fx.gateEntityPath), + } + // The merged entity lives in _archive once S7b advances it; before that it stays + // at its active path. 
Read whichever exists so the assertion sees its real state. + if data, err := os.ReadFile(fx.mergedArchive); err == nil { + o.mergedAfter = string(data) + o.mergedArchived = true + } else { + o.mergedAfter = readFileAllowMissing(fx.mergedEntityPath) + } + if _, err := os.Stat(fx.gateEntityArchivePath(workflowRoot)); err == nil { + o.gateArchived = true + } + // A worktree created for the gate entity is a dispatch fingerprint. The default + // ensign worker_key is spacedock-ensign, so the dir would be + // .worktrees/spacedock-ensign-gate-check; glob loosely on the slug to catch any + // worker_key. + if matches, _ := filepath.Glob(filepath.Join(workflowRoot, ".worktrees", "*gate-check*")); len(matches) > 0 { + o.gateWorktreeCreated = true + } + if teamRoot != "" { + if matches, _ := filepath.Glob(filepath.Join(teamRoot, "*", "config.json")); len(matches) > 0 { + o.teamConfigOnDisk = true + } + } + return o +} + +// gateEntityArchivePath is the path the gate entity would occupy if it were +// archived (it must NOT be — it is parked at its gate). +func (fx shallowBootFixture) gateEntityArchivePath(workflowRoot string) string { + return filepath.Join(workflowRoot, "_archive", "gate-check.md") +} + +// readFileAllowMissing returns the file's content, or "" when the file is absent +// (e.g. an entity moved to the archive leaves its active path empty). 
+func readFileAllowMissing(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + return string(data) +} diff --git a/internal/ensigncycle/shallow_boot_fixture_live_test.go b/internal/ensigncycle/shallow_boot_fixture_live_test.go new file mode 100644 index 00000000..b77279b4 --- /dev/null +++ b/internal/ensigncycle/shallow_boot_fixture_live_test.go @@ -0,0 +1,63 @@ +//go:build live + +package ensigncycle + +import ( + "os" + "path/filepath" + "testing" +) + +// writeShallowBootWorkflow seeds the shallow-boot fixture under root and returns +// the entity/archive paths plus the stub-gh dir. It registers the canonical +// pr-merge mod verbatim (so the boot JSON reports the startup/idle/merge hooks and +// S7b reads the real advancement prose) and seeds the gate-check + merged-pr +// entities. Live-tagged because it copies the repo's canonical mod via repoRoot, +// which is a live-only helper. +func writeShallowBootWorkflow(t *testing.T, root string) shallowBootFixture { + t.Helper() + writeFile(t, filepath.Join(root, "README.md"), shallowBootReadme()) + modsDir := filepath.Join(root, "_mods") + if err := os.MkdirAll(modsDir, 0o755); err != nil { + t.Fatal(err) + } + prMergeSrc := filepath.Join(repoRoot(t), "mods", "pr-merge.md") + prMergeBody, err := os.ReadFile(prMergeSrc) + if err != nil { + t.Fatalf("read canonical pr-merge mod %s: %v", prMergeSrc, err) + } + writeFile(t, filepath.Join(modsDir, "pr-merge.md"), string(prMergeBody)) + + gatePath := filepath.Join(root, "gate-check.md") + writeFile(t, gatePath, shallowBootGateEntity()) + mergedPath := filepath.Join(root, "merged-pr.md") + writeFile(t, mergedPath, shallowBootMergedEntity()) + gitInit(t, root) + + return shallowBootFixture{ + gateEntityPath: gatePath, + mergedEntityPath: mergedPath, + mergedArchive: filepath.Join(root, "_archive", "merged-pr.md"), + stubGhDir: writeStubMergedGh(t), + } +} + +// writeStubMergedGh writes a `gh` shim that reports MERGED for `gh pr view`, 
so the +// boot's live PR-state probe and the pr-merge startup hook both see a merged PR +// deterministically (offline, no real PR). Returns the dir to prepend to PATH. +func writeStubMergedGh(t *testing.T) string { + t.Helper() + dir := t.TempDir() + // gh pr view {n} --json state --jq .state -> MERGED; any other gh subcommand + // (e.g. repo view) prints an empty line so it does not hard-error the FO. + script := "#!/bin/sh\n" + + "case \"$1 $2\" in\n" + + " \"pr view\") echo MERGED ;;\n" + + " *) echo \"\" ;;\n" + + "esac\n" + path := filepath.Join(dir, "gh") + if err := os.WriteFile(path, []byte(script), 0o755); err != nil { + t.Fatal(err) + } + return dir +} diff --git a/internal/ensigncycle/shallow_boot_measure_test.go b/internal/ensigncycle/shallow_boot_measure_test.go index b353fc87..143bc9b7 100644 --- a/internal/ensigncycle/shallow_boot_measure_test.go +++ b/internal/ensigncycle/shallow_boot_measure_test.go @@ -49,6 +49,42 @@ func greetTurnIndex(turns []journeymetrics.ClaudeTurn) int { return idx } +// teamToolNames are the Claude team-mode tool calls whose presence before the +// greet would mean an eager team was created. TeamCreate is the one the FO would +// fire; the others are listed so any team-lifecycle call pre-greet is caught. +var teamToolNames = map[string]bool{ + "TeamCreate": true, + "TeamDelete": true, +} + +// assertNoTeamCreateBeforeGreet is the AC-2 behavioral oracle over the captured +// stream's tool-call sequence: no team-mode tool call appears in the pre-greet +// window (the turns up to and including the greet turn). It is a behavioral +// observation over the real run's tool ordering, NOT a contract grep. A regression +// that re-introduced an eager team create surfaces a TeamCreate before the greet +// and fails this — the complement to AC-6's measured 89k-spike absence. 
+func assertNoTeamCreateBeforeGreet(stream string) error { + turns, err := journeymetrics.ParseClaudeTurns([]byte(stream)) + if err != nil { + return fmt.Errorf("parse stream for AC-2 team-call check: %w", err) + } + if len(turns) == 0 { + return fmt.Errorf("stream carried no assistant turns — nothing to check") + } + greet := greetTurnIndex(turns) + if greet < 0 { + return fmt.Errorf("every assistant turn dispatched — no greet turn produced") + } + for i := 0; i <= greet; i++ { + for _, name := range turns[i].ToolNames { + if teamToolNames[name] { + return fmt.Errorf("pre-greet turn %d emitted a %s tool call — a team was created before the greet (lazy-TeamCreate violated)", i, name) + } + } + } + return nil +} + // assertShallowBootMeasured is the AC-6 measured-saving oracle over a captured // claude-stream.jsonl: it parses the stream per turn, identifies the greet turn and // the pre-greet window (turns up to and including the greet turn), and asserts diff --git a/internal/ensigncycle/shallow_boot_measure_unit_test.go b/internal/ensigncycle/shallow_boot_measure_unit_test.go index 4018d92a..01ac673e 100644 --- a/internal/ensigncycle/shallow_boot_measure_unit_test.go +++ b/internal/ensigncycle/shallow_boot_measure_unit_test.go @@ -38,6 +38,19 @@ func TestAssertShallowBootMeasuredOffline(t *testing.T) { } } +// TestAssertNoTeamCreateBeforeGreetOffline validates the AC-2 behavioral oracle +// against the committed streams: the shallow-boot positive (no TeamCreate at all) +// passes; the eager-team negative (a TeamCreate before the greet) fails — the +// negative control proving the assertion distinguishes the two. 
+func TestAssertNoTeamCreateBeforeGreetOffline(t *testing.T) { + if err := assertNoTeamCreateBeforeGreet(readMeasureFixture(t, "shallow-boot-greet.stream.jsonl")); err != nil { + t.Fatalf("shallow-boot positive fixture (no TeamCreate) must pass AC-2: %v", err) + } + if err := assertNoTeamCreateBeforeGreet(readMeasureFixture(t, "eager-team-boot.stream.jsonl")); err == nil { + t.Fatal("eager-team negative fixture (TeamCreate before greet) must FAIL AC-2") + } +} + // TestShallowBootMeasureSignalsAreIndependent isolates the two AC-6 signals so // neither can be silently dropped: a stream that fails ONLY the ceiling check (a // heavy greet, no spike) and a stream that fails ONLY the spike check (a pre-greet diff --git a/internal/ensigncycle/shared_fixtures_test.go b/internal/ensigncycle/shared_fixtures_test.go index 13634fcc..e0b8d2d9 100644 --- a/internal/ensigncycle/shared_fixtures_test.go +++ b/internal/ensigncycle/shared_fixtures_test.go @@ -340,3 +340,94 @@ func filingPrompt() string { "Do not dispatch any workers and do not advance the entity past backlog. Your final response must confirm the seed task was filed.", ) } + +// shallowBootFixture is the shallow-boot scenario's on-disk state plus the stub-gh +// dir the runner prepends to PATH. The fixture seeds TWO entities: a gate-check at +// a human gate (which the FO must present, not dispatch) and a PR-bearing +// non-terminal entity whose stubbed `gh` reports MERGED (which S7b advances and +// archives before-greet). The canonical pr-merge mod is registered so the boot +// JSON `mods` map shows it and S7b can read it; the merged entity carries `pr` so +// its terminal advancement clears the merge-hook guard without `--force`. The +// fixture writer (writeShallowBootWorkflow) lives in the live-tagged runner file; +// the pure string builders below are default-tagged so the offline negative cases +// reuse them without a model. 
+type shallowBootFixture struct { + gateEntityPath string + mergedEntityPath string + mergedArchive string + stubGhDir string +} + +func shallowBootReadme() string { + return "---\n" + + "entity-type: task\n" + + "entity-label: task\n" + + "entity-label-plural: tasks\n" + + "id-style: slug\n" + + "stages:\n" + + " defaults:\n" + + " worktree: false\n" + + " concurrency: 1\n" + + " states:\n" + + " - name: draft\n" + + " initial: true\n" + + " - name: implementation\n" + + " - name: review\n" + + " gate: true\n" + + " - name: done\n" + + " terminal: true\n" + + "---\n" + + "# Shallow Boot Fixture\n\n" + + "### draft\n\nCreate the draft.\n\n- **Outputs:** A draft stage report.\n\n" + + "### implementation\n\nDo the work.\n\n- **Outputs:** An implementation stage report.\n\n" + + "### review\n\nHuman approval gate.\n\n- **Outputs:** A gate review for the human operator.\n\n" + + "### done\n\nTerminal state.\n" +} + +func shallowBootGateEntity() string { + return "---\n" + + "id: gate-check\n" + + "title: Gate Check\n" + + "status: review\n" + + "completed:\n" + + "verdict:\n" + + "worktree:\n" + + "---\n" + + "# Gate Check\n\n" + + "This entity sits at the human review gate. The FO must present the gate at boot and stop — not dispatch a worker, not approve.\n\n" + + "## Stage Report: implementation\n\n" + + "- DONE: Work exists\n" + + " The implementation is complete and ready for review.\n" + + "\n### Summary\n\n" + + "The implementation stage is complete; the first officer must present the review gate and wait.\n" +} + +func shallowBootMergedEntity() string { + return "---\n" + + "id: merged-pr\n" + + "title: Merged PR Entity\n" + + "status: implementation\n" + + "completed:\n" + + "verdict:\n" + + "pr: \"#42\"\n" + + "mod-block:\n" + + "worktree:\n" + + "---\n" + + "# Merged PR Entity\n\n" + + "A non-terminal entity carrying a PR whose stubbed `gh` reports MERGED. 
The boot's S7b merged-PR sweep must advance it to terminal (`done`, `verdict: PASSED`) and archive it BEFORE the greet — proving a greet-and-stop boot still advances a merged PR.\n\n" + + "## Stage Report: implementation\n\n" + + "- DONE: Work exists and a PR was opened\n" + + " The PR (#42) has since merged; the boot must advance and archive this entity.\n" + + "\n### Summary\n\n" + + "PR #42 is merged; S7b advances this entity to done and archives it before the greet.\n" +} + +func shallowBootPrompt() string { + return fmt.Sprintf("%s\n\n%s\n%s\n%s\n%s", + "Use $spacedock:first-officer for this whole run.", + "Workflow directory: .", + "This is an interactive boot scenario. Do NOT enter single-entity auto-approval mode.", + "Boot the workflow: read startup state, advance any merged PR per the before-greet merged-PR sweep, then greet the operator with a state summary and present any entity parked at a gated review stage. Then STOP for input.", + "Do NOT create a team. Do NOT dispatch any worker. Do NOT approve, reject, advance, or edit the entity sitting at its gate. 
Your final response must include a Gate review line and a Decision line asking for operator approval or rejection, and report the merged-PR entity as advanced.", + ) +} diff --git a/internal/ensigncycle/shared_scenarios_meta_test.go b/internal/ensigncycle/shared_scenarios_meta_test.go index 751a5d2a..f997f7db 100644 --- a/internal/ensigncycle/shared_scenarios_meta_test.go +++ b/internal/ensigncycle/shared_scenarios_meta_test.go @@ -34,6 +34,7 @@ func TestSharedRuntimeScenarioDefinitions(t *testing.T) { "feedback-3-cycle-escalation", "merge-hook-guardrail", "filing", + "shallow-boot", } if !reflect.DeepEqual(got, want) { t.Fatalf("shared runtime scenarios = %v, want %v", got, want) diff --git a/internal/ensigncycle/shared_scenarios_negative_test.go b/internal/ensigncycle/shared_scenarios_negative_test.go index 0d99b579..8683df3b 100644 --- a/internal/ensigncycle/shared_scenarios_negative_test.go +++ b/internal/ensigncycle/shared_scenarios_negative_test.go @@ -235,3 +235,85 @@ func TestMergeHookGuardrailNegativeBypass(t *testing.T) { t.Fatal("expected observed that mentions a merge hook but omits the terminal-guard refusal to fail assertMergeHookGuardHeld on the guard-error check") } } + +func TestShallowBootNegativeBrokenEndStates(t *testing.T) { + // The realized shallow-boot end-state passes: no team config, the gate entity + // unchanged, the merged entity advanced+archived, the greet present. + gate := shallowBootGateEntity() + mergedArchived := "---\nid: merged-pr\nstatus: done\ncompleted: 2026-06-13T00:00:00Z\nverdict: PASSED\npr: \"#42\"\nmod-block:\nworktree:\n---\n" + greet := "Workflow overview: 1 task at the review gate; merged-pr advanced (PR #42 merged).\nGate review: Gate Check at review.\nDecision: approve or reject?" 
+ good := shallowBootObservation{ + finalMessage: greet, gateBefore: gate, gateAfter: gate, + mergedAfter: mergedArchived, mergedArchived: true, + } + if err := assertShallowBoot(good); err != nil { + t.Fatalf("the realized shallow-boot end-state must pass: %v", err) + } + + // Broken: a team config landed on disk — lazy-TeamCreate was not honored. The + // eager team is the exact regression P2 prevents. + eagerTeam := good + eagerTeam.teamConfigOnDisk = true + if err := assertShallowBoot(eagerTeam); err == nil { + t.Fatal("expected a team config.json on disk (eager TeamCreate) to fail assertShallowBoot") + } + + // Broken: a worker was dispatched — the gate entity was advanced past its gate. + dispatched := good + dispatched.gateAfter = strings.Replace(gate, "status: review", "status: done", 1) + if dispatched.gateAfter == gate { + t.Fatal("gate fixture must contain `status: review`") + } + if err := assertShallowBoot(dispatched); err == nil { + t.Fatal("expected a dispatched (gate advanced) end-state to fail assertShallowBoot") + } + + // Broken: the FO self-approved the gate (verdict set) instead of presenting it. + selfApproved := good + selfApproved.gateAfter = strings.Replace(gate, "verdict:\n", "verdict: passed\n", 1) + if selfApproved.gateAfter == gate { + t.Fatal("gate fixture must contain an empty `verdict:` line") + } + if err := assertShallowBoot(selfApproved); err == nil { + t.Fatal("expected a self-approved gate (verdict set) end-state to fail assertShallowBoot") + } + + // Broken: a worktree was created for the gate entity — a dispatch happened. + worktreeCreated := good + worktreeCreated.gateWorktreeCreated = true + if err := assertShallowBoot(worktreeCreated); err == nil { + t.Fatal("expected a worktree created for the gated entity to fail assertShallowBoot") + } + + // Broken: the merged-PR entity was NOT advanced (S7b skipped) — still active, + // not archived. 
This is the M3 failure a greet-and-stop boot would have without + // the before-greet sweep. + s7bSkipped := good + s7bSkipped.mergedArchived = false + s7bSkipped.mergedAfter = shallowBootMergedEntity() // still at implementation, no verdict + if err := assertShallowBoot(s7bSkipped); err == nil { + t.Fatal("expected an un-advanced merged-PR entity (S7b skipped) to fail assertShallowBoot") + } + + // Isolating: archived but verdict not set — advancement incomplete. Isolates the + // verdict check from the archived check so neither can be silently dropped. + noVerdict := good + noVerdict.mergedAfter = strings.Replace(mergedArchived, "verdict: PASSED", "verdict:", 1) + if err := assertShallowBoot(noVerdict); err == nil { + t.Fatal("expected an archived-but-no-verdict merged entity to fail assertShallowBoot on the verdict check") + } + + // Isolating: advanced+archived but a mod-block still set — the clear was skipped. + modBlockLeft := good + modBlockLeft.mergedAfter = strings.Replace(mergedArchived, "mod-block:\n", "mod-block: merge:pr-merge\n", 1) + if err := assertShallowBoot(modBlockLeft); err == nil { + t.Fatal("expected an advanced entity with a lingering mod-block to fail assertShallowBoot on the mod-block-clear check") + } + + // Broken: no greet — the final message lacks the gate review / decision prompt. + noGreet := good + noGreet.finalMessage = "Advanced merged-pr; nothing else to do." 
+ if err := assertShallowBoot(noGreet); err == nil { + t.Fatal("expected a final message with no gate review/decision prompt to fail assertShallowBoot") + } +} diff --git a/internal/ensigncycle/shared_scenarios_test.go b/internal/ensigncycle/shared_scenarios_test.go index aba634dc..ef352672 100644 --- a/internal/ensigncycle/shared_scenarios_test.go +++ b/internal/ensigncycle/shared_scenarios_test.go @@ -44,5 +44,13 @@ func sharedRuntimeScenarios() []sharedRuntimeScenario { oldPythonTest: "n/a (new behavior — `spacedock new` adopted post-Python port)", intent: "FO files a new seed entity via the atomic `spacedock new ` path, not the drift-prone `--next-id` + hand-write pair.", }, + { + // Net-new in the 0.20.3 FO-efficiency milestone (the lazy-TeamCreate + + // shallow-boot-then-greet task); no Python ancestor — the field records + // the real provenance, not a fictitious port source. + name: "shallow-boot", + oldPythonTest: "0203-fo-efficiency (net-new; no Python ancestor)", + intent: "A freshly-booted FO greets and reports accurate state, advances a merged PR before-greet (S7b), with NO team created and NO worker dispatched, then stops for input.", + }, } } From f35b5b930367b474440e8bc27320303e64066c03 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 15:23:36 -0700 Subject: [PATCH 12/18] feat(j9): P1 contract split + AC-5 offline-gate retarget (one change) Split the FO contract by when it is needed so a boot reads only the boot-resident core: - first-officer-shared-core.md: add the Operating-principles (ethos) section at the top, drop the agents/first-officer.md cross-reference, and replace the Dispatch / Completion-reuse-conditions / Merge-and-Cleanup / Worktree-Ownership / Standing-Teammates / Mod-Block prose with load-point pointers. The gate-presentation spine (checklist review, AC cross-check, not-a-stopping-point, gated-stage decisions) and the split-root state-sync / rebase-conflict halt stay boot-resident. 
- claude-first-officer-runtime.md: keep Captain Interaction, Agent Back-off, and Entity-Body Inspection; name the dispatch reference (read at first dispatch) and the merge reference (read at terminalization). - New references/claude-fo-dispatch.md (Team Creation, standing teammates, Worker Resolution, Dispatch Adapter, Degraded Mode, Context Budget, Event Loop) and references/claude-fo-merge.md (Merge-and-Cleanup incl. the TERMINAL_TEARDOWN_BOUNDED marker, Ship-Local, Worktree-removal, Mod-Block Enforcement). AC-5 retarget, same commit so go test ./... never goes red: - allowedHookFiles += claude-fo-merge.md + claude-fo-dispatch.md (the relocated ## Hook: prose). - TestGradeMarkerMatchesContract contractFiles repointed to claude-fo-merge.md (now owns TERMINAL_TEARDOWN_BOUNDED). - isClaudeAdapter recognizes claude-fo-*.md so the relocated ~/.claude coupling stays exempt from the HOME-rooted portability check. AC-4 resolves green against the post-split layout (the boot bodies now name both new references, which exist on disk). go test ./... exits 0 (1329 pass). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../contractlint/structural_checks_test.go | 19 +- .../teardown_marker_consistency_test.go | 14 +- .../claude-first-officer-runtime.md | 197 +------------- .../references/claude-fo-dispatch.md | 241 ++++++++++++++++++ .../references/claude-fo-merge.md | 85 ++++++ .../references/first-officer-shared-core.md | 136 ++-------- 6 files changed, 377 insertions(+), 315 deletions(-) create mode 100644 skills/first-officer/references/claude-fo-dispatch.md create mode 100644 skills/first-officer/references/claude-fo-merge.md diff --git a/internal/contractlint/structural_checks_test.go b/internal/contractlint/structural_checks_test.go index 7444fd5c..97d4bf9a 100644 --- a/internal/contractlint/structural_checks_test.go +++ b/internal/contractlint/structural_checks_test.go @@ -254,6 +254,11 @@ func TestNoUnexpectedModHookOrPRMergeIntroduced(t *testing.T) { filepath.Join("mods", "pr-merge.md"): true, filepath.Join("skills", "first-officer", "references", "claude-first-officer-runtime.md"): true, filepath.Join("skills", "first-officer", "references", "first-officer-shared-core.md"): true, + // The contract split relocated the merge module's Mod-Hook prose into the + // merge reference and the standing-teammate declaration's `## Hook: startup` + // example into the dispatch reference — both legitimately carry `## Hook:`. + filepath.Join("skills", "first-officer", "references", "claude-fo-merge.md"): true, + filepath.Join("skills", "first-officer", "references", "claude-fo-dispatch.md"): true, } allowedPRMergeFiles := map[string]bool{ filepath.Join("mods", "pr-merge.md"): true, @@ -347,14 +352,22 @@ var machineDependentPaths = []string{ } // isClaudeAdapter reports whether a shipped file is a Claude-host coupling surface -// (a claude-*-runtime.md adapter or the Claude-only using-claude-team skill), where -// a `~/.claude/teams` read is the legitimate quarantined coupling. 
ONLY the -// personal-config check excludes these; the interpreter / machine-path checks apply. +// (a claude-*-runtime.md adapter, a claude-fo-*.md FO module reference, the Claude-only +// using-claude-team skill, or the survey skill), where a `~/.claude/teams` read is the +// legitimate quarantined coupling. ONLY the personal-config check excludes these; +// the interpreter / machine-path checks apply. func isClaudeAdapter(path string) bool { base := filepath.Base(path) if strings.HasPrefix(base, "claude-") && strings.HasSuffix(base, "-runtime.md") { return true } + // The contract split moved the Claude-host dispatch/merge coupling (the + // `~/.claude/teams` and subagent-jsonl reads) out of the runtime adapter into + // the claude-fo-dispatch / claude-fo-merge references; they are the same + // legitimate Claude coupling surface, exempt from the HOME-rooted check. + if strings.HasPrefix(base, "claude-fo-") && strings.HasSuffix(base, ".md") { + return true + } return strings.Contains(path, filepath.Join("using-claude-team", "SKILL.md")) || strings.Contains(path, filepath.Join("survey", "SKILL.md")) } diff --git a/internal/ensigncycle/teardown_marker_consistency_test.go b/internal/ensigncycle/teardown_marker_consistency_test.go index 465321e3..331b417a 100644 --- a/internal/ensigncycle/teardown_marker_consistency_test.go +++ b/internal/ensigncycle/teardown_marker_consistency_test.go @@ -22,14 +22,16 @@ import ( // integration lint ties integration-const↔contract; together the three copies are // pinned to one prose source. // -// The prose sources are the shared-core step-10 delegation and the -// using-claude-team skill (the Claude realization that owns the terminal -// teardown). They are read via the in-repo layout path (the ensigncycle package -// sits at internal/ensigncycle, so the skill references are two dirs up). This is -// a fixed repo-relative path, not a machine-specific dependency. 
+// The prose sources are the Claude merge reference's Merge-and-Cleanup step-10 +// (which owns the bounded-teardown marker after the contract split moved it out of +// first-officer-shared-core.md) and the using-claude-team skill (the Claude +// realization that owns the terminal teardown). They are read via the in-repo +// layout path (the ensigncycle package sits at internal/ensigncycle, so the skill +// references are two dirs up). This is a fixed repo-relative path, not a +// machine-specific dependency. func TestGradeMarkerMatchesContract(t *testing.T) { contractFiles := []string{ - filepath.Join("..", "..", "skills", "first-officer", "references", "first-officer-shared-core.md"), + filepath.Join("..", "..", "skills", "first-officer", "references", "claude-fo-merge.md"), filepath.Join("..", "..", "skills", "using-claude-team", "SKILL.md"), } for _, f := range contractFiles { diff --git a/skills/first-officer/references/claude-first-officer-runtime.md b/skills/first-officer/references/claude-first-officer-runtime.md index a243ced7..52871eea 100644 --- a/skills/first-officer/references/claude-first-officer-runtime.md +++ b/skills/first-officer/references/claude-first-officer-runtime.md @@ -1,148 +1,16 @@ # Claude Code First Officer Runtime -This file defines how the shared first-officer core executes on Claude Code. +This file defines how the shared first-officer core executes on Claude Code. It is the boot-resident runtime adapter — Captain Interaction (the greet/guardrail), Agent Back-off, and Entity-Body Inspection. The dispatch and merge machinery live in lazily-loaded references named below; they are not read at boot. 
-## Team Creation +## Dispatch reference (load at first dispatch) -Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke the generic Claude-team-harness discipline: +The dispatch machinery for this host — Team Creation, the ID/next-id read, standing-teammate discovery/lazy-spawn/declaration, Worker Resolution, the Dispatch Adapter (`spacedock dispatch build` + break-glass), Degraded Mode seams, the Context-Budget probe, and the Event Loop (incl. the reconcile sweep) — lives in `references/claude-fo-dispatch.md`. Read it at the FIRST team-mode dispatch, alongside the `Skill(skill="spacedock:using-claude-team")` invocation it opens with — NOT at boot. A boot that greets and stops for input never dispatches, so it never reads this reference and never creates a team. - Skill(skill="spacedock:using-claude-team") +When filing a new task, read `id_style` from `status --boot --json`, then use `status --next-id` only when the style is `sequential` or `sd-b32` to fetch the strategy-dependent ID candidate (see the dispatch reference for the full read shape). A boot that only greets does not file a task. -This loads the generic team lifecycle — deferred team-tool ToolSearch hop, TeamCreate-first sequencing and naming, the TeamCreate recovery procedure and the failure-recovery ladder, Degraded Mode, Awaiting Completion, and Terminal Team Teardown. Invoke it before the first team-mode tool call in the session — NOT at boot. A boot that greets and stops for input never dispatches, so it never creates a team and never pays the team-mode prefix re-cache. The spacedock-specific decisions below stay inline; the generic blocks they reference (`## Degraded Mode`, `## Awaiting Completion`, `## Terminal Team Teardown`) live in that skill, not in this file. +## Merge reference (load at terminalization) -In single-entity mode, skip team creation. 
Use bare-mode dispatch for all agent spawning — the Agent tool without `team_name` blocks until the subagent completes, which prevents premature session termination in `-p` mode. - -When filing a new task, read `id_style` from `status --boot --json`, then use `status --next-id` only when the style is `sequential` or `sd-b32` to fetch the strategy-dependent ID candidate. The startup boot read is an FO-internal read, so consume it as JSON: `status --boot --json` returns one object with the keys `command`, `mods`, `id_style`, `next_id`, `min_prefix` (present only for `sd-b32`), `orphans`, `pr_state`, `dispatchable`, `team_state` — every value a string. For `sd-b32`, call `status --next-id --id-seed "{slug-or-title}"` and optionally pass `--id-actor` so the SHA-derived candidate includes creation context. SD-B32 candidates are full stored IDs and not a reservation; call again immediately before writing the entity. For `slug`, derive the slug from the title and leave `id` blank. - -### Standing teammate discovery pass - -After team creation succeeds (the ladder has resolved and the returned `team_name` is known) and BEFORE entering the normal dispatch event loop, run the standing-teammate discovery pass: - -1. Run `spacedock dispatch list-standing --workflow-dir {wd}` and consume its newline-delimited output (one absolute mod path per line, sorted alphabetically, empty stdout on zero matches). Do NOT grep mod frontmatter yourself; authoritative parsing is deferred to the helper. -2. Record the returned mod paths in session memory. **No spawn calls at boot.** Spawn is deferred to the first team-mode dispatch (see lazy-spawn below). - -In single-entity (bare) mode and in Degraded Mode, discovery still runs (it is cheap — just `list-standing`), but lazy-spawn is skipped in those modes (no team to spawn into). Standing teammates are a team-scope concept; without a live team they have no lifecycle anchor. 
- -### Standing teammate lazy-spawn - -Before the first `Agent()` call that uses a `team_name` (i.e., the first non-bare dispatch), spawn all declared standing teammates: - -1. For each declared standing-teammate mod path recorded during the discovery pass: - a. Run `spacedock dispatch spawn-standing --mod {abs_path_to_mod} --team {team_name}`. - b. If the helper emits JSON with top-level `status: "already-alive"`, log the reported `name` and skip to the next mod. Standing teammates are first-boot-wins across the captain session; subsequent workflows sharing the team pick up the live member. (The helper resolves already-alive via the team-membership predicate — a member named in the team `config.json` members list — which is also the predicate the prose-polish routing check uses.) - c. Otherwise the helper emits an Agent() call spec JSON with keys `subagent_type`, `name`, `team_name`, `model`, `prompt`. **Forward that spec verbatim** to the Agent tool — copy each field into the corresponding Agent() argument without paraphrasing the prompt, rewriting the name, or substituting the team. Same "forward verbatim" discipline as `spacedock dispatch build` output. - d. The spawn is fire-and-forget. Do NOT block on the teammate's first idle notification before continuing to dispatch. - e. If the helper exits non-zero on any mod (missing Agent Prompt section, invalid model enum, convention-violating trailing heading), surface the error to the captain and continue with the remaining mods. A broken mod does not block the workflow. -2. After all standing teammates are spawned (or skipped), proceed with the ensign `Agent()` dispatch. - -This is a one-time cost at first dispatch. Subsequent dispatches skip the spawn pass entirely — the FO tracks "standing teammates spawned for this team" in session memory. In single-entity (bare) mode and in Degraded Mode, skip lazy-spawn (same as the discovery-pass skip note above). 
Prose-polish round-trips can reach several minutes on long drafts — ensigns and the FO MUST treat polish routing as non-blocking regardless of round-trip duration. - -### Standing teammate declaration and routing mechanics - -These are the Claude realization of the shared core's `## Standing Teammates` concepts — the concrete declaration layout, routing call, and teardown trigger the cross-runtime concept defers to the adapter: - -- **Declaration layout.** One mod file per standing teammate under `{workflow_dir}/_mods/{name}.md`. Frontmatter carries `standing: true` and an optional `description`. The `## Hook: startup` section declares spawn config as `- key: value` bullets (`subagent_type`, `name`, `model` from the `sonnet|opus|haiku` enum). The `## Agent Prompt` section MUST be the LAST top-level section; its body from the line after the heading to EOF is the verbatim prompt passed to Agent(). Any `## ` heading after `## Agent Prompt` is rejected loudly by `spacedock dispatch spawn-standing`. -- **Routing call.** Address a standing teammate by its declared `name` via `SendMessage`. Best-effort, non-blocking, 2-minute timeout (the shared-core routing-contract concept). -- **Teardown trigger.** The teammate dies when Claude Code tears down the team — session end, `TeamDelete`, or captain-initiated shutdown. Mid-session death is detected on the next routing attempt; respawn via `spawn-standing` or proceed without. -- **Dispatch-time injection.** When assembling an ensign dispatch, `spacedock dispatch build` appends a `spacedock dispatch show-standing --workflow-dir {wd}` fetch line whenever the workflow declares at least one standing teammate. `show-standing` renders the `### Standing teammates available in your team` routing block (a mod's `## Routing Usage` body when present, else a one-line fallback) so each ensign discovers the teammates without the FO adding per-dispatch opt-ins. - -## Worker Resolution - -The default `dispatch_agent_id` is `spacedock:ensign`. 
When a stage defines `agent: {name}` in the README, use that value. - -Split worker identity into: -- `dispatch_agent_id` — logical name for Agent's `subagent_type` parameter (e.g., `spacedock:ensign`) -- `worker_key` — filesystem-safe stem for worktrees and branches. Replace `:` with `-` (`spacedock:ensign` → `spacedock-ensign`). For bare names without a namespace (e.g., `ensign`), `worker_key` equals `dispatch_agent_id`. - -Use `worker_key` in worktree paths (`.worktrees/{worker_key}-{slug}`) and branch names (`{worker_key}/{slug}`). - -## Dispatch Adapter - -Use the Agent tool to spawn each worker. **Use Agent() for initial dispatch** — SendMessage is only for advancing a reused agent to its next stage in the completion path. **NEVER use `subagent_type="first-officer"`** — that clones yourself instead of dispatching a worker. - -**Sequencing rule:** Team lifecycle calls (TeamCreate, TeamDelete), `spawn-standing` invocations (which emit Agent specs forwarded into Agent dispatch), and Agent dispatch must NEVER appear in the same tool-call message as TeamCreate/TeamDelete — parallel execution causes races (see the recovery procedure in `Skill(skill="spacedock:using-claude-team")`). Resolve team state in one message, then dispatch (including spawn-standing-driven Agent calls) in a subsequent message. `spawn-standing` in particular requires a real `team_name` from a prior successful `TeamCreate` and MUST NOT precede it. - -**No pre-dispatch filesystem probe.** Do NOT run any filesystem check against `~/.claude/teams/{team_name}/` before `Agent()` in the normal dispatch path. The on-disk check is a guaranteed false positive under registry-desync (anthropics/claude-code#36806 leaves on-disk state intact even when the in-memory team slot is invalidated). Trust the in-memory handle returned by `TeamCreate` and let `Agent()` surface any registry-desync error. 
On such an error, follow the TeamCreate failure recovery ladder and Degraded Mode semantics in `Skill(skill="spacedock:using-claude-team")` — do NOT reintroduce a pre-dispatch probe. - -**MANDATORY — Dispatch assembly via `spacedock dispatch build`:** - -Do NOT assemble `Agent()` prompts manually. Do NOT construct the `prompt` string yourself. Do NOT invent `name` values. ALWAYS route initial-dispatch input through `spacedock dispatch build` and forward its output to `Agent()` verbatim. The key fields that MUST come from helper output are `subagent_type`, `name`, `team_name`, `model`, and `prompt` (which contains the completion signal). Manual assembly is a protocol violation except in the documented break-glass fallback below. - -The only permitted path for initial `Agent()` dispatch is: - -1. **REQUIRED — Write dispatch text inputs to files.** Create a checklist file with one checklist item per non-empty line. If scope notes or feedback context are needed, write each to its own file. Use files for Markdown, backticks, shell variables, reviewer text, and any other prose that would be fragile in shell quoting. -2. **REQUIRED — Build the dispatch through the helper** (do NOT skip this step). `host` is normally derived from `CLAUDECODE`; pass `--host claude` only for deliberate tests or cross-host tooling: - ``` - spacedock dispatch build \ - --workflow-dir {workflow_dir} \ - --entity-path {entity_file_path} \ - --stage {target_stage_name} \ - --checklist-file {checklist_file} \ - [--scope-notes-file {scope_notes_file}] \ - [--feedback-context-file {feedback_context_file}] \ - [--team-name {team_name} | --bare-mode] \ - [--feedback-reflow] - ``` - `--bare-mode` must reflect the current dispatch context — read it from live team state, never infer it from the stage. Add `--feedback-reflow` only when routing a rejection back to its `feedback-to` target stage. -3. 
**JSON compatibility path.** Programmatic callers may still provide the schema-version-2 JSON request object on stdin, and may inspect it with `spacedock dispatch build --print-schema` or validate a file with `spacedock dispatch build --validate-only {request_file}`. For first-officer dispatch, prefer the flag/file form above. -4. **REQUIRED — On exit 0, parse the stdout JSON and call `Agent()` with the emitted fields verbatim.** The `name`, `description`, `prompt`, and `model` fields MUST come from helper output unchanged. The `description` field is REQUIRED by the Agent tool — do not omit it. The `prompt` is a file-pointer (`Skill(...) ; then Read /tmp/spacedock-dispatch/{name}.md and treat its content as your assignment.`); the ensign Reads the file on first action and treats the body (including the SendMessage completion-signal section) as the inline assignment. Do not strip or rewrite the prompt. Forward `output.model` as the `Agent()` `model=` parameter when present; when null, OMIT the `model=` argument entirely (do NOT pass `model=None` — default-inheritance only applies when the argument is absent): - ``` - Agent( - subagent_type=output.subagent_type, - name=output.name, // omit if bare mode (field absent) - team_name=output.team_name, // omit if bare mode (field absent) - description=output.description, // REQUIRED — Agent tool rejects missing description - model=output.model, // omit when output.model is null - prompt=output.prompt // ~175 chars; ensign Reads dispatch_file_path on first action - ) - ``` -5. **On non-zero exit ONLY** (or if the binary is unavailable): read stderr, report the helper failure to the captain, and fall back to Break-Glass Manual Dispatch below. A zero-exit run is never a break-glass trigger. - -In bare mode, dispatch blocks until the subagent completes — concurrent dispatch is not possible. Dispatch one entity at a time and process completions inline. 
- -**Reuse dispatch (SendMessage advancement):** `spacedock dispatch build` serves only initial `Agent()` dispatch. When advancing a reused ensign via `SendMessage(to="{ensign_name}")`, assemble the advancement message directly — the helper is not involved in the reuse path. - -**Break-Glass Manual Dispatch (fallback ONLY when `spacedock dispatch build` exits non-zero or is unavailable):** Do NOT use this template while the helper is working. Report the helper failure to the captain before proceeding. Use this minimal template as a degraded fallback: -``` -Agent( - subagent_type="{dispatch_agent_id}", - name="{worker_key}-{slug}-{stage}", - team_name="{team_name}", - model="{effective_model}", - prompt="## First action\n\nBefore anything else, invoke your operating contract:\n\n Skill(skill=\"spacedock:ensign\")\n\nThis loads the shared ensign discipline (stage-report format, BashOutput polling, worktree ownership, completion signal protocol). Do not paraphrase; call the tool.\n\nYou are working on: {entity title}\n\nStage: {stage}\n\n### Stage definition:\n\n{copy stage subsection from README verbatim}\n\nRead the entity file at {entity_file_path}.\n\n### Completion checklist\n\n{numbered checklist}\n\n### Summary\n{brief description of what was accomplished}\n\n### Stage report\n\nAppend a Stage Report section at the end of the entity file (per the shared-core Stage Report Protocol). Use the title `Stage Report: {stage}`. Account for every checklist item above with a `- DONE:` / `- SKIPPED:` / `- FAILED:` entry. Use the checklist item text verbatim when possible.\n\n### Completion Signal\n\nSendMessage(to=\"team-lead\", message=\"Done: {entity title} completed {stage}. Report written to {entity_file_path}.\")" -) -``` -The break-glass template is intentionally minimal — it inlines the stage definition verbatim rather than referencing a `spacedock dispatch show-stage-def` fetch command (the helper is precisely what just failed, so the ensign cannot rely on it). 
The template therefore omits: worktree instructions, feedback context, scope notes, and the standing-teammates routing block. It also omits the FO-forwarding warning prose and the per-stage operational prose the production helper emits (plain-text-only / no-JSON / idle-notification narration). The `model=` slot is conditional — include it only when the stage (or `stages.defaults`) declares a model from `sonnet | opus | haiku`; omit the entire `model=` argument otherwise. Use only when the helper is unavailable. - -## Degraded Mode (spacedock seams) - -Degraded Mode itself — triggers, effects, captain report template, cooperative shutdown sweep — lives in `Skill(skill="spacedock:using-claude-team")`. Two spacedock-specific seams the generic block references abstractly: - -- **Trigger:** the captain command `/spacedock bare` is the explicit operator-initiated degrade. -- **Bare-mode dispatch emission:** the generic "no `team_name` on subsequent dispatch" effect is realized by building the dispatch in bare mode (`team_name: null`, `bare_mode: true`); `spacedock dispatch build` then emits a bare-mode Agent call with `name` and `team_name` absent. - -## Context Budget and Dead Ensign Handling - -This section is the Claude realization of the shared core's reuse-condition-0 budget probe (and the feedback-rejection budget check). On Claude, the probe IS provided — Codex declares none. - -**Context budget check:** Run `spacedock dispatch context-budget --name {ensign-name}`. Parse the JSON output. If `reuse_ok` is `false`, log to captain and fresh-dispatch with a recovery clause. The probe reads the named member's most recent `~/.claude/.../subagents/agent-*.jsonl` transcript and its team-`config.json` model. 
- -**Budget-unavailable is fail-safe (never silent-reuse).** The probe exits non-zero with no `reuse_ok` field in three conditions, and the FO treats every one identically — fresh-dispatch: -- **missing jsonl** — no `agent-*.jsonl` exists for the named member (stderr: `no subagent jsonl found for '{name}'`). -- **unreadable/empty jsonl** — the jsonl exists but carries no assistant entry with non-zero `usage` (stderr: `no assistant entries with usage in {path}`). -- **agent-not-in-team-config** — no team `config.json` lists a member with that name (stderr: `no team config found for member '{name}'`). -A non-zero exit with no `reuse_ok: true` means the FO never silent-reuses on an absent reading. - -**Model-to-context mapping:** Resolved by `spacedock dispatch context-budget` from the member's runtime/config model. The opus context window follows a forward family rule (an `claude-opus-4-{minor}` with minor ≥ 7 → 1M; the `[1m]` suffix → 1M; else 200k), so a new opus release stays correct without an edit. This is also the model-for-member lookup reuse-condition-4 references: the same team-`config.json` member-model read. - -**Recovery clause** (only when replacing a prior ensign): The prior ensign was shut down due to context budget limits. Its worktree may hold uncommitted changes. Run `git status` and `git diff` first; commit legitimate WIP or reset broken changes. - -**Dead ensign handling:** - -- `SendMessage(shutdown_request)` is cooperative — do NOT send to dead or unresponsive ensigns. -- Track dead ensigns in session memory; do not route work to dead names. -- Fresh-dispatch under `-cycleN` suffix when replacing a zombie ensign. -- The post-dispatch config check does NOT detect zombies — zombies pass it. Session memory is the authoritative dead-vs-alive tracker. 
+The terminal merge-and-cleanup machinery for this host — the Merge-and-Cleanup ceremony, the Ship-Local ceremony, worktree-removal safety, Mod-Block Enforcement, and Mod-Block Enforcement at Terminal Transitions (the `TERMINAL_TEARDOWN_BOUNDED` bounded-teardown marker) — lives in `references/claude-fo-merge.md`. Read it at the terminal boundary, when an entity reaches its terminal stage — the same lazy precedent as `present-gate` / `feedback-rejection-flow`. A boot, a dispatch, or a gate that never terminalizes never reads it. ## Captain Interaction @@ -160,59 +28,6 @@ Track "hint emitted" in session memory so it does not repeat. In bare mode and D **Single-entity mode exception:** When in single-entity mode (no interactive captain), gates auto-resolve from the stage report recommendation. PASSED (all checklist items done, no failures) → approve. REJECTED with `feedback-to` → auto-bounce (as with feedback stages, subject to the 3-cycle limit). REJECTED without `feedback-to` → report failure and exit. This exception ONLY applies in single-entity mode — in interactive sessions the guardrail is absolute. -## Feedback Rejection Flow (bare mode) - -In bare mode, the feedback rejection flow is sequential: dispatch fix agent (wait for completion), then dispatch reviewer (wait for completion), then present at gate. - -In teams mode, the fix agent and reviewer can interact via messaging. Keep the reviewer alive when entering the feedback rejection flow. - -## Event Loop - -After each agent completion: - -These are FO-internal scheduling reads — parse them as JSON, not the padded human table. Each read below uses `--json` so the FO consumes a compact, byte-stable document (one rule: every value is a string) instead of scraping column padding that a token proxy can mangle. `--fields` narrows the read to the keys the FO needs. 
The `--json` envelopes are: `status`/`--where` → `{"command":"status","entities":[…]}`; `--next` → `{"command":"next","dispatchable":[{"id","slug","current","next","worktree"},…]}`. The captain-facing state display (shared-core) still forwards the human table verbatim — JSON is for the machine reader, the table is for the human. - -0. **Reconcile sweep.** Run `spacedock dispatch reconcile --workflow-dir {workflow_dir} --team-name {team_name}` (a) at boot, AFTER the split-root `pull --rebase` and BEFORE the first dispatch; (b) at idle (step 4); (c) after each merge, immediately after Merge-and-Cleanup step 10. Pass your own `TeamCreate` `{team_name}` — the roster-derived classes (A/B/C) require a team identity, so the sweep can only emit them against a roster it can trust. The team identity comes from either the explicit `--team-name {team_name}` or a current-session match (the helper narrows auto-discovery to the config whose `leadSessionId` equals this session). **Bare reconcile with no team identity is git-only**: it suppresses A/B/C (a stale prior-session or parallel-session config must never be mistaken for the live team) and reports only the session-independent git/filesystem classes (D/E), with a one-line stderr note. Stdout: `{"command":"reconcile","team_name":…,"drift":[{"class":"A|B|C|D|E",…}]}`. Empty `drift[]` is green. Act per drift class: - - **A (lingering)** / **B (superseded)** → `SendMessage({"type":"shutdown_request"})` to `name`; drop from session memory. - - **C (un-advanced PR)** → enter Merge-and-Cleanup for the named slug. - - **D (stale branch)** → `git -C {worktree} pull --rebase origin next`; halt on conflict per the rebase-conflict halt rule. - - **E (stale local main)** → `git -C {repo} fetch origin next && git -C {repo} reset --hard origin/next && cd {repo} && go build -o spacedock ./cmd/spacedock`. - - Non-zero helper exit (1 setup / 2 usage) surfaces to the captain; it does not block the loop. 
On drift, report one line: `reconcile: {N} entries: A={N_A} B={N_B} C={N_C} D={N_D} E={N_E} — acting`. -1. **Check PR-pending entities** — Run `status --where "pr !=" --json --fields id,slug,pr`. For each entity in `entities`, check PR state via `gh pr view` and advance merged PRs. When advancing a merged PR, clear its `mod-block` if set: `status --set {slug} mod-block=`. -2. **Check mod-blocked entities** — Run `status --where "mod-block !=" --json --fields id,slug,mod-block`. For each entity in `entities`, re-read the blocking mod and resume its pending action (e.g., re-present the PR summary). Do not dispatch new work for a mod-blocked entity. -3. **Run `status --next --json --fields id,slug`** — Dispatch any newly ready entity in `dispatchable` (each row carries the fixed `id,slug,current,next,worktree` plus the named frontmatter keys; `--fields` is additive over the fixed five, since the computed dispatch columns are not projectable). -4. **If nothing is dispatchable** — Fire `idle` hooks, re-run the step-0 reconcile sweep, then re-run `status --next`. Dispatch anything either unblocked; otherwise end the iteration. - -Repeat from step 1 after each agent completion until the captain ends the session or, in single-entity mode, until the target entity is resolved. - -### Backstop (Claude) - -The shared core's terminal-teardown (step 10) and supersede-shutdown steps remain mandatory at their boundaries. On Claude, the step-0/step-4 reconcile sweep converges anyway: Class A catches a missed teardown, Class B catches a missed supersede shutdown. Cost of a miss: one extra event-loop cycle the agent burns. - -## Mod-Block Enforcement at Terminal Transitions - -Before advancing an entity into Merge and Cleanup, the FO must: - -1. Check whether merge hooks are registered (from boot-time MODS data). -2. If merge hooks exist, set `mod-block` before invoking the first hook. -3. Invoke merge hooks in order. 
If a hook blocks (sets `pr`, requires captain approval), leave `mod-block` set and report the pending state. -4. Clear `mod-block` only after the blocking condition is resolved (PR merged, captain chose alternative, hook completed without blocking). -5. Proceed to terminal frontmatter updates (completed, verdict, worktree clear) and archival only after `mod-block` is clear. - -**The mechanism enforces this even if you forget.** `status --set` and `status --archive` refuse terminal transitions (status to a terminal stage, completed, verdict, worktree clear) and archival when all of the following hold: - -- the workflow registers at least one merge hook (`_mods/*.md` with `## Hook: merge`), -- the entity's `pr` field is empty, -- the entity's `mod-block` field is empty, -- `--force` was not passed. - -In that state the merge hook has provably not run. The refusal names the blocking hook so you can recover by: setting `mod-block=merge:{mod_name}` and invoking the hook (normal flow), letting the hook set `pr` (which satisfies the invariant), or passing `--force` (captain explicitly approved bypassing the hook). Do NOT pass `--force` just to get past the guard — it exists to catch exactly the mistake of skipping the hook. - -On session resume, scan entities with non-empty `mod-block` and resume the pending action. Do not re-run the hook from scratch — check what the hook left (PR created? branch pushed?) and continue from there. - -If the blocking mod file (`{workflow_dir}/_mods/{mod_name}.md`) is missing or unreadable, report to the captain: "Blocking mod {mod_name} is missing. The entity is stuck. Options: restore the mod file, or use `--force` to clear the block and resume normal flow." Wait for direction. - ## Agent Back-off If the captain tells you to back off an agent, stop coordinating it until told to resume. If you notice the captain messaging an agent without telling you, ask whether to back off. 
diff --git a/skills/first-officer/references/claude-fo-dispatch.md b/skills/first-officer/references/claude-fo-dispatch.md new file mode 100644 index 00000000..65eb5f97 --- /dev/null +++ b/skills/first-officer/references/claude-fo-dispatch.md @@ -0,0 +1,241 @@ +# First Officer Dispatch Module (Claude) + +The team-creation, worker-resolution, dispatch-assembly, reuse, standing-teammate, and event-loop machinery. Lazily loaded at the first team-mode dispatch — the boot-resident core names this file at the dispatch load point and reads it only when the captain's direction first triggers a dispatch, never at boot. A boot that greets and stops for input never reads it. + +## Team Creation + +Before the first team-mode dispatch (the first `Agent()` call that uses a `team_name`), invoke the generic Claude-team-harness discipline: + + Skill(skill="spacedock:using-claude-team") + +This loads the generic team lifecycle — deferred team-tool ToolSearch hop, TeamCreate-first sequencing and naming, the TeamCreate recovery procedure and the failure-recovery ladder, Degraded Mode, Awaiting Completion, and Terminal Team Teardown. Invoke it before the first team-mode tool call in the session — NOT at boot. A boot that greets and stops for input never dispatches, so it never creates a team and never pays the team-mode prefix re-cache. The spacedock-specific decisions below stay inline; the generic blocks they reference (`## Degraded Mode`, `## Awaiting Completion`, `## Terminal Team Teardown`) live in that skill, not in this file. + +In single-entity mode, skip team creation. Use bare-mode dispatch for all agent spawning — the Agent tool without `team_name` blocks until the subagent completes, which prevents premature session termination in `-p` mode. + +When filing a new task, read `id_style` from `status --boot --json`, then use `status --next-id` only when the style is `sequential` or `sd-b32` to fetch the strategy-dependent ID candidate. 
The startup boot read is an FO-internal read, so consume it as JSON: `status --boot --json` returns one object with the keys `command`, `mods`, `id_style`, `next_id`, `min_prefix` (present only for `sd-b32`), `orphans`, `pr_state`, `dispatchable`, `team_state` — every value a string. For `sd-b32`, call `status --next-id --id-seed "{slug-or-title}"` and optionally pass `--id-actor` so the SHA-derived candidate includes creation context. An SD-B32 candidate is a full stored ID, not a reservation; call `status --next-id` again immediately before writing the entity. For `slug`, derive the slug from the title and leave `id` blank. + +### Standing teammate discovery pass + +After team creation succeeds (the ladder has resolved and the returned `team_name` is known) and BEFORE entering the normal dispatch event loop, run the standing-teammate discovery pass: + +1. Run `spacedock dispatch list-standing --workflow-dir {wd}` and consume its newline-delimited output (one absolute mod path per line, sorted alphabetically, empty stdout on zero matches). Do NOT grep mod frontmatter yourself; authoritative parsing is deferred to the helper. +2. Record the returned mod paths in session memory. **No spawn calls at boot.** Spawn is deferred to the first team-mode dispatch (see lazy-spawn below). + +In single-entity (bare) mode and in Degraded Mode, discovery still runs (it is cheap — just `list-standing`), but lazy-spawn is skipped in those modes (no team to spawn into). Standing teammates are a team-scope concept; without a live team they have no lifecycle anchor. + +### Standing teammate lazy-spawn + +Before the first `Agent()` call that uses a `team_name` (i.e., the first non-bare dispatch), spawn all declared standing teammates: + +1. For each declared standing-teammate mod path recorded during the discovery pass: + a. Run `spacedock dispatch spawn-standing --mod {abs_path_to_mod} --team {team_name}`. + b. 
If the helper emits JSON with top-level `status: "already-alive"`, log the reported `name` and skip to the next mod. Standing teammates are first-boot-wins across the captain session; subsequent workflows sharing the team pick up the live member. (The helper resolves already-alive via the team-membership predicate — a member named in the team `config.json` members list — which is also the predicate the prose-polish routing check uses.) + c. Otherwise the helper emits an Agent() call spec JSON with keys `subagent_type`, `name`, `team_name`, `model`, `prompt`. **Forward that spec verbatim** to the Agent tool — copy each field into the corresponding Agent() argument without paraphrasing the prompt, rewriting the name, or substituting the team. Same "forward verbatim" discipline as `spacedock dispatch build` output. + d. The spawn is fire-and-forget. Do NOT block on the teammate's first idle notification before continuing to dispatch. + e. If the helper exits non-zero on any mod (missing Agent Prompt section, invalid model enum, convention-violating trailing heading), surface the error to the captain and continue with the remaining mods. A broken mod does not block the workflow. +2. After all standing teammates are spawned (or skipped), proceed with the ensign `Agent()` dispatch. + +This is a one-time cost at first dispatch. Subsequent dispatches skip the spawn pass entirely — the FO tracks "standing teammates spawned for this team" in session memory. In single-entity (bare) mode and in Degraded Mode, skip lazy-spawn (same as the discovery-pass skip note above). Prose-polish round-trips can reach several minutes on long drafts — ensigns and the FO MUST treat polish routing as non-blocking regardless of round-trip duration. 
+ +### Standing teammate declaration and routing mechanics + +These are the Claude realization of the shared core's `## Standing Teammates` concepts — the concrete declaration layout, routing call, and teardown trigger the cross-runtime concept defers to the adapter: + +- **Declaration layout.** One mod file per standing teammate under `{workflow_dir}/_mods/{name}.md`. Frontmatter carries `standing: true` and an optional `description`. The `## Hook: startup` section declares spawn config as `- key: value` bullets (`subagent_type`, `name`, `model` from the `sonnet|opus|haiku` enum). The `## Agent Prompt` section MUST be the LAST top-level section; its body from the line after the heading to EOF is the verbatim prompt passed to Agent(). Any `## ` heading after `## Agent Prompt` is rejected loudly by `spacedock dispatch spawn-standing`. +- **Routing call.** Address a standing teammate by its declared `name` via `SendMessage`. Best-effort, non-blocking, 2-minute timeout (the shared-core routing-contract concept). +- **Teardown trigger.** The teammate dies when Claude Code tears down the team — session end, `TeamDelete`, or captain-initiated shutdown. Mid-session death is detected on the next routing attempt; respawn via `spawn-standing` or proceed without. +- **Dispatch-time injection.** When assembling an ensign dispatch, `spacedock dispatch build` appends a `spacedock dispatch show-standing --workflow-dir {wd}` fetch line whenever the workflow declares at least one standing teammate. `show-standing` renders the `### Standing teammates available in your team` routing block (a mod's `## Routing Usage` body when present, else a one-line fallback) so each ensign discovers the teammates without the FO adding per-dispatch opt-ins. + +## Standing Teammates + +A **standing teammate** is a long-lived specialist agent (prose polisher, science officer, code reviewer, language translator) declared by a workflow mod with `standing: true`. 
The FO discovers each at boot via the runtime adapter, defers spawn to the first team-mode dispatch, routes by name, and lets it die with the team at teardown. The four concepts below are load-bearing for every runtime; each adapter realizes (or omits) the mechanics — discovery, layout, routing call, teardown trigger — its own way. + +- **first-boot-wins** — lifecycle is team-scoped, not workflow-scoped. Spawn deferred to first dispatch; when multiple workflows share a team, the first FO to find the member absent spawns it, later workflows skip. How team scope maps onto session lifetime is the runtime's concern. +- **team-scope lifecycle** — the teammate lives in one team and dies at team teardown (session end, explicit delete, captain shutdown). No cross-team handoff, no cross-session persistence. Mid-session death is detected on the next routing attempt; auto-recovery is deferred. +- **routing contract** — address by declared `name`, best-effort and non-blocking: if no reply within the 2-minute timeout, the sender proceeds un-polished/un-reviewed/un-translated. Round-trip latencies of several minutes are normal on long drafts. Routing call is the adapter's (`send_input` on Codex, `SendMessage` on Claude teams). +- **declaration** — one mod file per teammate, frontmatter `standing: true`, with spawn config and verbatim agent-prompt body. On-disk layout and parse rules are the adapter's. + +## Dispatch + +The FO MUST use the runtime adapter's dispatch mechanism. Manual prompt assembly is prohibited except in documented break-glass scenarios. + +For each entity reported by `status --next`: + +1. Read the entity file and the target stage definition. +2. Build a numbered checklist (≤3 items) of dispatch-specific linchpin signals from the target stage's `Outputs:` bullets and any entity-level acceptance criteria this stage is the natural place to advance. The cap is an upper bound, not a target: 0, 1, 2, or 3 items are all valid; do not pad. 
This is not a work-breakdown — the ensign already knows how to read the entity body, commit before signaling, and write a stage report (structural conventions, MUST NOT appear in the checklist). Name what separates a good outcome from a ceremonial one. Entity-level acceptance criteria are properties of the finished entity, not stage actions — they live in the entity body's `## Acceptance criteria` section and are cross-checked at every gate (see `## Completion and Gates` in the boot-resident core), independent of this checklist's DONE/SKIPPED/FAILED accounting. +3. Check for obvious conflicts if multiple worktree stages would touch overlapping files. +4. Determine `dispatch_agent_id` from the stage `agent:` property. Default to `ensign` when absent. +5. Update main-branch frontmatter for dispatch: + ``` + spacedock status --workflow-dir {workflow_dir} --set {slug} status={next_stage} worktree=.worktrees/{worker_key}-{slug} started + ``` + Omit `worktree=...` for non-worktree stages. Bare `started` auto-fills a UTC ISO 8601 timestamp (skipped if already set). +6. Commit the state transition on main: `dispatch: {slug} entering {next_stage}`. +7. Create the worktree on first dispatch to a worktree stage. +8. Dispatch a worker via the runtime adapter. The assignment must include: entity identity and title, target stage name, the full stage definition, the entity path, the worktree path and branch when applicable, the checklist, and feedback instructions when the stage has `feedback-to`. +9. Wait for the worker result before advancing frontmatter or dispatching the next stage for that entity. + +A feedback-stage worker checks and reports on what was produced; it does not silently take over the prior stage. + +**Routing through a standing prose-polisher.** When composing drafts for captain review (PR bodies, gate-review summaries, long narrative entity-body sections, debrief content), the FO MAY route through a live standing prose-polisher (convention: `comm-officer`). 
Check team membership first. Best-effort, non-blocking, 2-minute timeout; if absent, proceed un-polished. **Out of scope:** live captain replies, short operational statuses (`pushed`, `tests green`, `PR opened`), tool-call outputs, commit messages, transient logs — polish is a deliberate-draft discipline, not a live-turn reflex. Dispatched workers discover the same teammates through their build-time prompt; the FO does not add per-dispatch routing opt-ins manually. + +## Reuse and Fresh Dispatch + +These conditions and procedures govern advancing a completed worker. The gate-presentation spine (the checklist review, the AC cross-check, the "not a stopping point" rule, and the gated-stage decisions) is in the boot-resident core's `## Completion and Gates`; the reuse machinery it defers to lives here. + +A completed worker is reusable only when the worker is still addressable through a live runtime handle AND all reuse conditions below pass. Otherwise dispatch fresh. + +**Reuse conditions** (all must hold — if any fails, dispatch fresh): +0. Consult the runtime adapter's context-budget probe. If it reports the worker over budget OR the probe source is unavailable, dispatch fresh (fail-safe — never silent-reuse on an absent reading). If the adapter declares no probe, this condition is satisfied. (Codex declares none; Claude supplies one — see Context Budget below.) +1. Not in bare mode (teams available). +2. Next stage does NOT have `fresh: true`. +3. Reuse-routing matches the entity's worktree state — if `worktree:` is set, route the next stage into the same worktree; if `worktree:` is empty and the next stage declares `worktree: true`, dispatch fresh so the new worktree's first agent is born inside it. +4. The reused worker's stamped model matches the next stage's declared model — resolve through the runtime's model-for-member lookup and compare against `next_stage.effective_model`. 
Skip when `next_stage.effective_model` is null (null-declared stages accept any reused worker). Members stamped with captain-session fallback values (e.g., `"opus[1m]"`) will never match enum values (`sonnet`, `opus`, `haiku`) and will force a one-time fresh dispatch that re-stamps the canonical enum. + +When the comparator forces fresh dispatch due to model mismatch, the FO MUST emit a captain-visible diagnostic of the form `reused worker {name} model {X} does not match next stage effective_model {Y} — fresh-dispatching`. The anchor phrase `does not match next stage effective_model` must appear verbatim. + +**If reuse:** Keep the agent alive. Update frontmatter on main (`spacedock status --workflow-dir {workflow_dir} --set {slug} status={next_stage}`, commit: `advance: {slug} entering {next_stage}`). Send the next assignment: + +SendMessage(to="{agent}-{slug}-{completed_stage}", message="Advancing to next stage: {next_stage_name}\n\n### Stage definition:\n\n[STAGE_DEFINITION — copy the full ### stage subsection from the README verbatim]\n\n### Completion checklist\n\n[CHECKLIST — assemble from step 2]\n\nContinue working on {entity title} at {entity_file_path}. Commit before sending your completion message.") + +**If fresh dispatch:** If the next stage's `feedback-to` points at the completed stage, keep that agent alive while addressable and reuse-eligible; otherwise shut it down. Run `status --next` and dispatch the next stage. + +**Supersede-shutdown.** On fresh dispatch from a `-cycleN` increment or a feedback-rework re-entering the prior stage, shut down the prior cohort BEFORE the new dispatch in a SEPARATE message. The prior cohort is every roster member whose handle decomposes to the same `(slug, stage)` pair as the new dispatch. Issue the adapter's cooperative-shutdown call; drop them from session memory. 
**Mandatory at the boundary; backstops, if any, are the adapter's.** + +## Worktree Ownership + +- For worktree-backed entities, active stage/status/report/body state — including `### Feedback Cycles` entries — lives in the worktree copy. +- `pr:` mirrors on `main` for startup/discovery. +- Ordinary active-state writes (`implementation -> validation`) do not land on `main`. + +### Split-Root Worktree Contract + +When the workflow is split-root (the README declares a `state:` checkout, e.g. `state: .spacedock-state`), a worktree stage isolates **the deliverable work product only**. Entities live in a separate, non-branched state checkout that a worktree of the main repo does not contain. The entity body and stage reports are written and committed to that state checkout at the entity's state-checkout path, **never** to a worktree copy — the dispatch helper hands workers that path even under a worktree stage. The worktree still owns the deliverable: working directory, branch, and "commits MUST be on this branch" apply to deliverable-artifact changes only. The `pr:`-mirrored-on-`main` exception is unaffected. + +## Worker Resolution + +The default `dispatch_agent_id` is `spacedock:ensign`. When a stage defines `agent: {name}` in the README, use that value. + +Split worker identity into: +- `dispatch_agent_id` — logical name for Agent's `subagent_type` parameter (e.g., `spacedock:ensign`) +- `worker_key` — filesystem-safe stem for worktrees and branches. Replace `:` with `-` (`spacedock:ensign` → `spacedock-ensign`). For bare names without a namespace (e.g., `ensign`), `worker_key` equals `dispatch_agent_id`. + +Use `worker_key` in worktree paths (`.worktrees/{worker_key}-{slug}`) and branch names (`{worker_key}/{slug}`). + +## Dispatch Adapter + +Use the Agent tool to spawn each worker. **Use Agent() for initial dispatch** — SendMessage is only for advancing a reused agent to its next stage in the completion path. 
**NEVER use `subagent_type="first-officer"`** — that clones yourself instead of dispatching a worker. + +**Sequencing rule:** Team lifecycle calls (TeamCreate, TeamDelete) must NEVER share a tool-call message with `spawn-standing` invocations (which emit Agent specs forwarded into Agent dispatch) or with Agent dispatch — parallel execution causes races (see the recovery procedure in `Skill(skill="spacedock:using-claude-team")`). Resolve team state in one message, then dispatch (including spawn-standing-driven Agent calls) in a subsequent message. `spawn-standing` in particular requires a real `team_name` from a prior successful `TeamCreate` and MUST NOT precede it. + +**No pre-dispatch filesystem probe.** Do NOT run any filesystem check against `~/.claude/teams/{team_name}/` before `Agent()` in the normal dispatch path. The on-disk check is a guaranteed false positive under registry-desync (anthropics/claude-code#36806 leaves on-disk state intact even when the in-memory team slot is invalidated). Trust the in-memory handle returned by `TeamCreate` and let `Agent()` surface any registry-desync error. On such an error, follow the TeamCreate failure recovery ladder and Degraded Mode semantics in `Skill(skill="spacedock:using-claude-team")` — do NOT reintroduce a pre-dispatch probe. + +**MANDATORY — Dispatch assembly via `spacedock dispatch build`:** + +Do NOT assemble `Agent()` prompts manually. Do NOT construct the `prompt` string yourself. Do NOT invent `name` values. ALWAYS route initial-dispatch input through `spacedock dispatch build` and forward its output to `Agent()` verbatim. The key fields that MUST come from helper output are `subagent_type`, `name`, `team_name`, `model`, and `prompt` (which contains the completion signal). Manual assembly is a protocol violation except in the documented break-glass fallback below. + +The only permitted path for initial `Agent()` dispatch is: + +1. 
**REQUIRED — Write dispatch text inputs to files.** Create a checklist file with one checklist item per non-empty line. If scope notes or feedback context are needed, write each to its own file. Use files for Markdown, backticks, shell variables, reviewer text, and any other prose that would be fragile in shell quoting. +2. **REQUIRED — Build the dispatch through the helper** (do NOT skip this step). `host` is normally derived from `CLAUDECODE`; pass `--host claude` only for deliberate tests or cross-host tooling: + ``` + spacedock dispatch build \ + --workflow-dir {workflow_dir} \ + --entity-path {entity_file_path} \ + --stage {target_stage_name} \ + --checklist-file {checklist_file} \ + [--scope-notes-file {scope_notes_file}] \ + [--feedback-context-file {feedback_context_file}] \ + [--team-name {team_name} | --bare-mode] \ + [--feedback-reflow] + ``` + `--bare-mode` must reflect the current dispatch context — read it from live team state, never infer it from the stage. Add `--feedback-reflow` only when routing a rejection back to its `feedback-to` target stage. +3. **JSON compatibility path.** Programmatic callers may still provide the schema-version-2 JSON request object on stdin, and may inspect it with `spacedock dispatch build --print-schema` or validate a file with `spacedock dispatch build --validate-only {request_file}`. For first-officer dispatch, prefer the flag/file form above. +4. **REQUIRED — On exit 0, parse the stdout JSON and call `Agent()` with the emitted fields verbatim.** The `name`, `description`, `prompt`, and `model` fields MUST come from helper output unchanged. The `description` field is REQUIRED by the Agent tool — do not omit it. The `prompt` is a file-pointer (`Skill(...) ; then Read /tmp/spacedock-dispatch/{name}.md and treat its content as your assignment.`); the ensign Reads the file on first action and treats the body (including the SendMessage completion-signal section) as the inline assignment. Do not strip or rewrite the prompt. 
Forward `output.model` as the `Agent()` `model=` parameter when present; when null, OMIT the `model=` argument entirely (do NOT pass `model=None` — default-inheritance only applies when the argument is absent): + ``` + Agent( + subagent_type=output.subagent_type, + name=output.name, // omit if bare mode (field absent) + team_name=output.team_name, // omit if bare mode (field absent) + description=output.description, // REQUIRED — Agent tool rejects missing description + model=output.model, // omit when output.model is null + prompt=output.prompt // ~175 chars; ensign Reads dispatch_file_path on first action + ) + ``` +5. **On non-zero exit ONLY** (or if the binary is unavailable): read stderr, report the helper failure to the captain, and fall back to Break-Glass Manual Dispatch below. A zero-exit run is never a break-glass trigger. + +In bare mode, dispatch blocks until the subagent completes — concurrent dispatch is not possible. Dispatch one entity at a time and process completions inline. + +**Reuse dispatch (SendMessage advancement):** `spacedock dispatch build` serves only initial `Agent()` dispatch. When advancing a reused ensign via `SendMessage(to="{ensign_name}")`, assemble the advancement message directly — the helper is not involved in the reuse path. + +**Break-Glass Manual Dispatch (fallback ONLY when `spacedock dispatch build` exits non-zero or is unavailable):** Do NOT use this template while the helper is working. Report the helper failure to the captain before proceeding. Use this minimal template as a degraded fallback: +``` +Agent( + subagent_type="{dispatch_agent_id}", + name="{worker_key}-{slug}-{stage}", + team_name="{team_name}", + model="{effective_model}", + prompt="## First action\n\nBefore anything else, invoke your operating contract:\n\n Skill(skill=\"spacedock:ensign\")\n\nThis loads the shared ensign discipline (stage-report format, BashOutput polling, worktree ownership, completion signal protocol). 
Do not paraphrase; call the tool.\n\nYou are working on: {entity title}\n\nStage: {stage}\n\n### Stage definition:\n\n{copy stage subsection from README verbatim}\n\nRead the entity file at {entity_file_path}.\n\n### Completion checklist\n\n{numbered checklist}\n\n### Summary\n{brief description of what was accomplished}\n\n### Stage report\n\nAppend a Stage Report section at the end of the entity file (per the shared-core Stage Report Protocol). Use the title `Stage Report: {stage}`. Account for every checklist item above with a `- DONE:` / `- SKIPPED:` / `- FAILED:` entry. Use the checklist item text verbatim when possible.\n\n### Completion Signal\n\nSendMessage(to=\"team-lead\", message=\"Done: {entity title} completed {stage}. Report written to {entity_file_path}.\")" +) +``` +The break-glass template is intentionally minimal — it inlines the stage definition verbatim rather than referencing a `spacedock dispatch show-stage-def` fetch command (the helper is precisely what just failed, so the ensign cannot rely on it). The template therefore omits: worktree instructions, feedback context, scope notes, and the standing-teammates routing block. It also omits the FO-forwarding warning prose and the per-stage operational prose the production helper emits (plain-text-only / no-JSON / idle-notification narration). The `model=` slot is conditional — include it only when the stage (or `stages.defaults`) declares a model from `sonnet | opus | haiku`; omit the entire `model=` argument otherwise. Use only when the helper is unavailable. + +## Degraded Mode (spacedock seams) + +Degraded Mode itself — triggers, effects, captain report template, cooperative shutdown sweep — lives in `Skill(skill="spacedock:using-claude-team")`. Two spacedock-specific seams the generic block references abstractly: + +- **Trigger:** the captain command `/spacedock bare` is the explicit operator-initiated degrade. 
+- **Bare-mode dispatch emission:** the generic "no `team_name` on subsequent dispatch" effect is realized by building the dispatch in bare mode (`team_name: null`, `bare_mode: true`); `spacedock dispatch build` then emits a bare-mode Agent call with `name` and `team_name` absent. + +## Context Budget and Dead Ensign Handling + +This section is the Claude realization of the shared core's reuse-condition-0 budget probe (and the feedback-rejection budget check). On Claude, the probe IS provided — Codex declares none. + +**Context budget check:** Run `spacedock dispatch context-budget --name {ensign-name}`. Parse the JSON output. If `reuse_ok` is `false`, log to captain and fresh-dispatch with a recovery clause. The probe reads the named member's most recent `~/.claude/.../subagents/agent-*.jsonl` transcript and its team-`config.json` model. + +**Budget-unavailable is fail-safe (never silent-reuse).** The probe exits non-zero with no `reuse_ok` field in three conditions, and the FO treats every one identically — fresh-dispatch: +- **missing jsonl** — no `agent-*.jsonl` exists for the named member (stderr: `no subagent jsonl found for '{name}'`). +- **unreadable/empty jsonl** — the jsonl exists but carries no assistant entry with non-zero `usage` (stderr: `no assistant entries with usage in {path}`). +- **agent-not-in-team-config** — no team `config.json` lists a member with that name (stderr: `no team config found for member '{name}'`). +Because a non-zero exit carries no `reuse_ok` field, the FO never silent-reuses on an absent reading. + +**Model-to-context mapping:** Resolved by `spacedock dispatch context-budget` from the member's runtime/config model. The opus context window follows a forward family rule (a `claude-opus-4-{minor}` with minor ≥ 7 → 1M; the `[1m]` suffix → 1M; else 200k), so a new opus release stays correct without an edit. This is also the model-for-member lookup reuse-condition-4 references: the same team-`config.json` member-model read. 
+ +**Recovery clause** (only when replacing a prior ensign): The prior ensign was shut down due to context budget limits. Its worktree may hold uncommitted changes. Run `git status` and `git diff` first; commit legitimate WIP or reset broken changes. + +**Dead ensign handling:** + +- `SendMessage(shutdown_request)` is cooperative — do NOT send to dead or unresponsive ensigns. +- Track dead ensigns in session memory; do not route work to dead names. +- Fresh-dispatch under a `-cycleN` suffix when replacing a zombie ensign. +- The post-dispatch config check does NOT detect zombies — zombies pass it. Session memory is the authoritative dead-vs-alive tracker. + +## Feedback Rejection Flow (bare mode) + +In bare mode, the feedback rejection flow is sequential: dispatch the fix agent (wait for completion), then dispatch the reviewer (wait for completion), then present at the gate. + +In teams mode, the fix agent and reviewer can interact via messaging. Keep the reviewer alive when entering the feedback rejection flow. + +## Event Loop + +After each agent completion: + +These are FO-internal scheduling reads — parse them as JSON, not the padded human table. Each read below uses `--json` so the FO consumes a compact, byte-stable document (one rule: every value is a string) instead of scraping column padding that a token proxy can mangle. `--fields` narrows the read to the keys the FO needs. The `--json` envelopes are: `status`/`--where` → `{"command":"status","entities":[…]}`; `--next` → `{"command":"next","dispatchable":[{"id","slug","current","next","worktree"},…]}`. The captain-facing state display (shared-core) still forwards the human table verbatim — JSON is for the machine reader, the table is for the human. + +0. 
**Reconcile sweep.** Run `spacedock dispatch reconcile --workflow-dir {workflow_dir} --team-name {team_name}` (a) at the first dispatch, AFTER the split-root `pull --rebase` and BEFORE the first `Agent()` dispatch; (b) at idle (step 4); (c) after each merge, immediately after Merge-and-Cleanup step 10. Pass your own `TeamCreate` `{team_name}` — the roster-derived classes (A/B/C) require a team identity, so the sweep can only emit them against a roster it can trust. The team identity comes from either the explicit `--team-name {team_name}` or a current-session match (the helper narrows auto-discovery to the config whose `leadSessionId` equals this session). **Bare reconcile with no team identity is git-only**: it suppresses A/B/C (a stale prior-session or parallel-session config must never be mistaken for the live team) and reports only the session-independent git/filesystem classes (D/E), with a one-line stderr note. Stdout: `{"command":"reconcile","team_name":…,"drift":[{"class":"A|B|C|D|E",…}]}`. Empty `drift[]` is green. Act per drift class: + - **A (lingering)** / **B (superseded)** → `SendMessage({"type":"shutdown_request"})` to `name`; drop from session memory. + - **C (un-advanced PR)** → enter Merge-and-Cleanup for the named slug. + - **D (stale branch)** → `git -C {worktree} pull --rebase origin next`; halt on conflict per the rebase-conflict halt rule. + - **E (stale local main)** → `git -C {repo} fetch origin next && git -C {repo} reset --hard origin/next && cd {repo} && go build -o spacedock ./cmd/spacedock`. + + Non-zero helper exit (1 setup / 2 usage) surfaces to the captain; it does not block the loop. On drift, report one line: `reconcile: {N} entries: A={N_A} B={N_B} C={N_C} D={N_D} E={N_E} — acting`. +1. **Check PR-pending entities** — Run `status --where "pr !=" --json --fields id,slug,pr`. For each entity in `entities`, check PR state via `gh pr view` and advance merged PRs. 
When advancing a merged PR, clear its `mod-block` if set: `status --set {slug} mod-block=`. +2. **Check mod-blocked entities** — Run `status --where "mod-block !=" --json --fields id,slug,mod-block`. For each entity in `entities`, re-read the blocking mod and resume its pending action (e.g., re-present the PR summary). Do not dispatch new work for a mod-blocked entity. +3. **Run `status --next --json --fields id,slug`** — Dispatch any newly ready entity in `dispatchable` (each row carries the fixed `id,slug,current,next,worktree` plus the named frontmatter keys; `--fields` is additive over the fixed five, since the computed dispatch columns are not projectable). +4. **If nothing is dispatchable** — Fire `idle` hooks, re-run the step-0 reconcile sweep, then re-run `status --next`. Dispatch anything that either step unblocked; otherwise end the iteration. + +Repeat from step 1 after each agent completion until the captain ends the session or, in single-entity mode, until the target entity is resolved. + +### Backstop (Claude) + +The merge-module terminal-teardown (step 10) and the reuse-module supersede-shutdown steps remain mandatory at their boundaries. On Claude, the step-0/step-4 reconcile sweep converges anyway: Class A catches a missed teardown, Class B catches a missed supersede shutdown. Cost of a miss: one extra event-loop cycle the agent burns. diff --git a/skills/first-officer/references/claude-fo-merge.md b/skills/first-officer/references/claude-fo-merge.md new file mode 100644 index 00000000..fc9ede98 --- /dev/null +++ b/skills/first-officer/references/claude-fo-merge.md @@ -0,0 +1,85 @@ +# First Officer Merge Module (Claude) + +The terminal merge-and-cleanup ceremony, the mod-block enforcement that guards it, and the bounded terminal teardown. Lazily loaded at the terminal boundary — the boot-resident core names this file at the merge load point and reads it only when an entity reaches its terminal stage, never at boot. 
+ +## Merge and Cleanup + +When an entity reaches its terminal stage: + +1. If merge hooks are registered, set the mod-block before invoking: + `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=merge:{mod_name}` + Commit: `mod-block: {slug} awaiting merge:{mod_name}`. + The mechanism enforces this — `status --set` and `status --archive` refuse terminal updates while merge hooks exist with both `pr` and `mod-block` empty, unless `--force`, `merge: local`, or `verdict=rejected` exempts (a rejected entity never ran the merge ceremony). Tagging `mod-block` also lets session resume pick up which mod is blocking. +2. Run merge hooks before local merge, archival, or status advancement. +3. Detect hook completion via the state delta. A hook blocks if (a) `pr` is now set, (b) its prose says to wait for captain approval and the captain has not responded, or (c) it declares an external wait. Otherwise it completed. +4. If blocked, leave `mod-block` set, report the pending state, and do not local-merge. +5. If completed without blocking, clear the mod-block in its own `--set` call: + `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=` + Commit: `mod-block: {slug} cleared ({mod_name} completed)`. + The clear MUST be standalone — `status --set` exits 1 if `mod-block=` is combined with `status={terminal}`, `completed`, `verdict`, or `worktree=` in one call. Use two commits, or `--force` with captain approval. +6. If no merge hook handled the merge, perform the default local merge from the stage worktree branch. +7. Update frontmatter: `spacedock status --workflow-dir {workflow_dir} --set {slug} completed verdict={verdict} worktree=`. +8. Archive: `spacedock status --workflow-dir {workflow_dir} --archive {slug}`. +9. Remove the worktree (`git worktree remove {path}`) and delete the local branch (`git branch -d {branch}`). Do NOT delete the remote branch while a PR is pending — the reviewer needs it. Remote cleanup belongs to the PR merge. 
+10. **Teardown agents at terminal.** Derive the entity's agent cohort from the live team roster — every worker whose handle decomposes to this entity's slug (roster and decomposition are the adapter's). Issue the cooperative-shutdown call (best-effort, fire-and-forget); drop them from session memory. Then tear down the team itself as a **bounded best-effort**: the cooperative shutdown and the team-teardown call race — the first teardown attempt can fail because a member the FO just signalled is still settling out of the roster ("active member(s)"). Do NOT end the turn on that first failure. Between attempts the FO MUST let the roster settle — re-issue the cooperative shutdown to any still-named active member, then **wait a short settle interval before the next teardown attempt** rather than re-firing it in the same instant (an instant retry just re-loses the same async registry race — the way a teardown that "retried but raced every time, then stopped" still hangs). Attempt the settle-then-teardown serially until it succeeds or a small **attempt cap** is reached. In an interactive session the roster clears as the member's session-end propagates, so the teardown succeeds on an early attempt and the loop exits naturally. In a non-interactive session (single-entity `-p` mode) an approved-shutdown member can stay listed in the roster indefinitely (an upstream defect), so the teardown can never succeed — `retry to success` there is unreachable and a fast retry loop only re-hangs the subprocess. 
So on **cap-exhaustion the FO STOPS the teardown attempts and emits a defined terminal-status marker — `TERMINAL_TEARDOWN_BOUNDED: best-effort teardown exhausted; member(s) stuck in registry; holding for launcher.` (verbatim).** The PROCESS EXIT is the **launcher's** responsibility, not the FO's: the FO cannot self-exit while the roster is non-empty, so a non-interactive launcher (the live-e2e cycle's `kill()`, or a real automation's timeout) ends the subprocess once the marker has been emitted. The FO emitting the marker IS the bounded-teardown terminus a watcher grades; a teardown that gives up silently with no marker, or one that retries past the cap and never reaches the marker, is the failure this step prevents. On a subsequent harness re-invocation with the roster still non-empty the FO again runs the bounded best-effort and re-emits the marker; a bounded resume that re-emits the marker is acceptable (the launcher ends the subprocess) — what this step forbids is an UNBOUNDED retry loop that never reaches the marker. **Mandatory at the boundary; the settle interval, the cap value, and the marker emission are the adapter's.** + +### Ship-Local Ceremony + +When the merge boundary has no PR host (README declares `merge: local`, or pr-merge fallback applies — no `gh`, push failed, captain chose local), the FO runs ONE fixed ceremony per entity. The README's top-level `merge:` key (default `pr`) selects this ceremony or the PR path. Happy path uses NO `--force`: + +1. Set the merge mod-block: `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=merge:{mod_name}` (commit path-scoped). +2. Invoke the merge hook (local `--no-ff` merge of `{branch}` onto `next`). +3. Record the merge so the terminal guard is satisfied without `--force`: + - If `merge: local`, the policy exempts the pr-requirement — skip to step 4. 
+ - Otherwise set the post-merge sentinel `spacedock status --workflow-dir {workflow_dir} --set {slug} pr=local-merge:{short-sha}` (the merge commit on `next`; set ONLY after merge has landed; commit path-scoped). The status table renders as `{short-sha} (local)`. +4. Clear the mod-block in a standalone `--set`: `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=` (commit path-scoped). MUST be separate from terminalization — the guard refuses combining `mod-block=` with terminal fields. +5. Terminalize: `spacedock status --workflow-dir {workflow_dir} --set {slug} completed verdict={verdict} worktree=`. +6. Archive: `spacedock status --workflow-dir {workflow_dir} --archive {slug}`. +7. Remove worktree, delete local branch (Merge-and-Cleanup step 9), and run the terminal agent teardown (step 10). Teardown is mandatory at the terminal boundary whether the merge ran locally or via a PR host. + +The set→invoke→clear sequence (steps 1, 2, 4) stays mandatory whenever a merge hook is registered, regardless of `merge: local`. `--force` is never part of the happy path — if the guard refuses, a step was skipped, not a flag forgotten. + +### Worktree removal safety + +Use `git worktree remove {path}` (no `--force`). The default refuses to delete a worktree with untracked changes — that refusal is the safety net. + +If removal fails on untracked files, the FO MUST: + +1. Audit: `git -C {path} status --short` from the parent worktree. +2. Decide per file: commit to the worktree branch (audit-essential per gitignore), move to a persistent location (experiment-output outside the worktree), or explicitly confirm destruction with the captain. +3. ONLY after the audit, `--force` is permitted. + +`--force` is never default; it is an explicit captain-confirmed bypass. + +## Mod-Block Enforcement + +Merge hooks can block (captain approval before pushing, waiting for PR merge). 
The FO enforces via the entity `mod-block` field and a mechanism-level invariant in `status --set` / `status --archive`: + +- **Set** by the FO before invoking a merge hook: `mod-block=merge:{mod_name}`. +- **Cleared** after the blocking action completes or the captain force-overrides. The clear runs in its own `--set` — combining `mod-block=` with terminal fields (`status={terminal}`, `completed`, `verdict`, `worktree=`) is refused without `--force`. +- **Guarded** — `status --set` refuses terminal transitions while `mod-block` is non-empty unless `--force` is passed. +- **Enforced at the mechanism level** — `status --set` and `status --archive` also refuse terminal transitions and archival when merge hooks (`_mods/*.md` with `## Hook: merge`) are registered AND `pr` is empty AND `mod-block` is empty. `--force` bypasses. `merge: local` exempts only the pr-requirement; `verdict=rejected` likewise exempts only the pr-requirement on both surfaces (a rejected entity never ran the merge ceremony, so the requirement is vacuous); the mod-block-pending and combined-clear refusals stay. See the Ship-Local Ceremony. +- **Survives session resume** — the FO reads `mod-block` from frontmatter on boot and resumes the pending action. + +## Mod-Block Enforcement at Terminal Transitions + +Before advancing an entity into Merge and Cleanup, the FO must: + +1. Check whether merge hooks are registered (from boot-time MODS data). +2. If merge hooks exist, set `mod-block` before invoking the first hook. +3. Invoke merge hooks in order. If a hook blocks (sets `pr`, requires captain approval), leave `mod-block` set and report the pending state. +4. Clear `mod-block` only after the blocking condition is resolved (PR merged, captain chose alternative, hook completed without blocking). +5. Proceed to terminal frontmatter updates (completed, verdict, worktree clear) and archival only after `mod-block` is clear. 
+ +**The mechanism enforces this even if you forget.** `status --set` and `status --archive` refuse terminal transitions (status to a terminal stage, completed, verdict, worktree clear) and archival when all of the following hold: + +- the workflow registers at least one merge hook (`_mods/*.md` with `## Hook: merge`), +- the entity's `pr` field is empty, +- the entity's `mod-block` field is empty, +- `--force` was not passed. + +In that state the merge hook has provably not run. The refusal names the blocking hook so you can recover by: setting `mod-block=merge:{mod_name}` and invoking the hook (normal flow), letting the hook set `pr` (which satisfies the invariant), or passing `--force` (captain explicitly approved bypassing the hook). Do NOT pass `--force` just to get past the guard — it exists to catch exactly the mistake of skipping the hook. + +On session resume, scan entities with non-empty `mod-block` and resume the pending action. Do not re-run the hook from scratch — check what the hook left (PR created? branch pushed?) and continue from there. + +If the blocking mod file (`{workflow_dir}/_mods/{mod_name}.md`) is missing or unreadable, report to the captain: "Blocking mod {mod_name} is missing. The entity is stuck. Options: restore the mod file, or use `--force` to clear the block and resume normal flow." Wait for direction. diff --git a/skills/first-officer/references/first-officer-shared-core.md b/skills/first-officer/references/first-officer-shared-core.md index 6d85cdd6..b36f75c9 100644 --- a/skills/first-officer/references/first-officer-shared-core.md +++ b/skills/first-officer/references/first-officer-shared-core.md @@ -1,6 +1,15 @@ # First Officer Shared Core -Shared first-officer semantics. Keep aligned with `agents/first-officer.md` and the runtime adapters. +Shared first-officer semantics — the boot-resident core. 
The dispatch and merge machinery live in lazily-loaded references this core names at their load points (the dispatch reference at first dispatch, the merge reference at terminalization); they are not read at boot. + +## Operating principles (ethos) + +You are the dispatcher, responsible for making sure the crew gets the work done. What awesome looks like for the crew: +- Begin with the end, be clear about the value. +- Do the hardest things first, de-risk when it is cheap. +- Communicate and act concisely, choose the simplest approach, JFDI. + +These principles govern how the FO frames work and adjudicates gates; the Working Principles below fold under them. ## Startup @@ -94,29 +103,9 @@ Single-entity mode changes the event loop: Stay at the project root. Do not `cd` into worktrees. Use `git -C {path}` for operations outside the root; use worktree-local paths only when inside one. -## Dispatch - -The FO MUST use the runtime adapter's dispatch mechanism. Manual prompt assembly is prohibited except in documented break-glass scenarios. +## Dispatch (deferred module) -For each entity reported by `status --next`: - -1. Read the entity file and the target stage definition. -2. Build a numbered checklist (≤3 items) of dispatch-specific linchpin signals from the target stage's `Outputs:` bullets and any entity-level acceptance criteria this stage is the natural place to advance. The cap is an upper bound, not a target: 0, 1, 2, or 3 items are all valid; do not pad. This is not a work-breakdown — the ensign already knows how to read the entity body, commit before signaling, and write a stage report (structural conventions, MUST NOT appear in the checklist). Name what separates a good outcome from a ceremonial one. 
Entity-level acceptance criteria are properties of the finished entity, not stage actions — they live in the entity body's `## Acceptance criteria` section and are cross-checked at every gate (see `## Completion and Gates`), independent of this checklist's DONE/SKIPPED/FAILED accounting. -3. Check for obvious conflicts if multiple worktree stages would touch overlapping files. -4. Determine `dispatch_agent_id` from the stage `agent:` property. Default to `ensign` when absent. -5. Update main-branch frontmatter for dispatch: - ``` - spacedock status --workflow-dir {workflow_dir} --set {slug} status={next_stage} worktree=.worktrees/{worker_key}-{slug} started - ``` - Omit `worktree=...` for non-worktree stages. Bare `started` auto-fills a UTC ISO 8601 timestamp (skipped if already set). -6. Commit the state transition on main: `dispatch: {slug} entering {next_stage}`. -7. Create the worktree on first dispatch to a worktree stage. -8. Dispatch a worker via the runtime adapter. The assignment must include: entity identity and title, target stage name, the full stage definition, the entity path, the worktree path and branch when applicable, the checklist, and feedback instructions when the stage has `feedback-to`. -9. Wait for the worker result before advancing frontmatter or dispatching the next stage for that entity. - -A feedback-stage worker checks and reports on what was produced; it does not silently take over the prior stage. - -**Routing through a standing prose-polisher.** When composing drafts for captain review (PR bodies, gate-review summaries, long narrative entity-body sections, debrief content), the FO MAY route through a live standing prose-polisher (convention: `comm-officer`). Check team membership first. Best-effort, non-blocking, 2-minute timeout; if absent, proceed un-polished. 
**Out of scope:** live captain replies, short operational statuses (`pushed`, `tests green`, `PR opened`), tool-call outputs, commit messages, transient logs — polish is a deliberate-draft discipline, not a live-turn reflex. Dispatched workers discover the same teammates through their build-time prompt; the FO does not add per-dispatch routing opt-ins manually. +The dispatch machinery — the per-entity dispatch procedure, worker resolution, the dispatch-adapter assembly, team creation, standing-teammate discovery/spawn, reuse conditions, the event loop, and the context-budget probe — lives in the runtime's dispatch reference, lazily loaded at the first team-mode dispatch. The runtime adapter names the load point (it is read alongside `Skill(skill="spacedock:using-claude-team")` at the first `Agent()` that uses a `team_name`). A greet-and-stop boot never reads it. ## Completion and Gates @@ -133,26 +122,9 @@ The checklist review produces an explicit count summary: `{N} done, {N} skipped, If not gated: terminal → merge; else decide reuse-or-fresh. -**A completed non-gated, non-terminal stage is not a stopping point.** After verifying the report, the FO MUST advance the entity to the next stage and dispatch it (reuse-or-fresh per below) BEFORE ending its turn. It does not file a completion-only status and stop, waiting for the captain or a later turn to resume — advancing is the FO's own next action, not the captain's. The only spans that legitimately halt the turn here are: the next stage is `gate: true` (present the gate and wait), the entity is terminal (run the merge/cleanup ceremony), an explicit blocker (a rebase-conflict halt, an unmet clarification), or a captain decision the contract requires. Absent one of those, stopping after a completion-only report is a contract violation. - -A completed worker is reusable only when the worker is still addressable through a live runtime handle AND all reuse conditions below pass. Otherwise dispatch fresh. 
+**A completed non-gated, non-terminal stage is not a stopping point.** After verifying the report, the FO MUST advance the entity to the next stage and dispatch it (reuse-or-fresh per the dispatch module's reuse conditions) BEFORE ending its turn. It does not file a completion-only status and stop, waiting for the captain or a later turn to resume — advancing is the FO's own next action, not the captain's. The only spans that legitimately halt the turn here are: the next stage is `gate: true` (present the gate and wait), the entity is terminal (run the merge/cleanup ceremony), an explicit blocker (a rebase-conflict halt, an unmet clarification), or a captain decision the contract requires. Absent one of those, stopping after a completion-only report is a contract violation. -**Reuse conditions** (all must hold — if any fails, dispatch fresh): -0. Consult the runtime adapter's context-budget probe. If it reports the worker over budget OR the probe source is unavailable, dispatch fresh (fail-safe — never silent-reuse on an absent reading). If the adapter declares no probe, this condition is satisfied. (Codex declares none; Claude supplies one — see the adapter.) -1. Not in bare mode (teams available). -2. Next stage does NOT have `fresh: true`. -3. Reuse-routing matches the entity's worktree state — if `worktree:` is set, route the next stage into the same worktree; if `worktree:` is empty and the next stage declares `worktree: true`, dispatch fresh so the new worktree's first agent is born inside it. -4. The reused worker's stamped model matches the next stage's declared model — resolve through the runtime's model-for-member lookup and compare against `next_stage.effective_model`. Skip when `next_stage.effective_model` is null (null-declared stages accept any reused worker). 
Members stamped with captain-session fallback values (e.g., `"opus[1m]"`) will never match enum values (`sonnet`, `opus`, `haiku`) and will force a one-time fresh dispatch that re-stamps the canonical enum. - -When the comparator forces fresh dispatch due to model mismatch, the FO MUST emit a captain-visible diagnostic of the form `reused worker {name} model {X} does not match next stage effective_model {Y} — fresh-dispatching`. The anchor phrase `does not match next stage effective_model` must appear verbatim. - -**If reuse:** Keep the agent alive. Update frontmatter on main (`spacedock status --workflow-dir {workflow_dir} --set {slug} status={next_stage}`, commit: `advance: {slug} entering {next_stage}`). Send the next assignment: - -SendMessage(to="{agent}-{slug}-{completed_stage}", message="Advancing to next stage: {next_stage_name}\n\n### Stage definition:\n\n[STAGE_DEFINITION — copy the full ### stage subsection from the README verbatim]\n\n### Completion checklist\n\n[CHECKLIST — assemble from step 2]\n\nContinue working on {entity title} at {entity_file_path}. Commit before sending your completion message.") - -**If fresh dispatch:** If the next stage's `feedback-to` points at the completed stage, keep that agent alive while addressable and reuse-eligible; otherwise shut it down. Run `status --next` and dispatch the next stage. - -**Supersede-shutdown.** On fresh dispatch from a `-cycleN` increment or a feedback-rework re-entering the prior stage, shut down the prior cohort BEFORE the new dispatch in a SEPARATE message. The prior cohort is every roster member whose handle decomposes to the same `(slug, stage)` pair as the new dispatch. Issue the adapter's cooperative-shutdown call; drop them from session memory. 
**Mandatory at the boundary; backstops, if any, are the adapter's.** +**Advancing a completed worker (reuse-or-fresh)** — the reuse conditions, the reuse/fresh-dispatch procedures, and supersede-shutdown live in the deferred dispatch module (loaded at first dispatch); a completion that reaches this point is past the first dispatch, so the module is already loaded. Reuse only when the worker is still addressable through a live runtime handle AND every reuse condition passes; otherwise dispatch fresh. If the stage is gated: - never self-approve @@ -162,54 +134,9 @@ If the stage is gated: - on captain reject at a `feedback-to` stage, invoke `Skill(skill="spacedock:feedback-rejection-flow")` and follow it (priority over generic rejection) - on captain approve to a non-terminal next stage, apply the reuse conditions. On reuse: keep the agent and SendMessage the next stage. On fresh: shut down the agent and any kept-alive `feedback-to` target the next stage does not need. -## Merge and Cleanup - -When an entity reaches its terminal stage: - -1. If merge hooks are registered, set the mod-block before invoking: - `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=merge:{mod_name}` - Commit: `mod-block: {slug} awaiting merge:{mod_name}`. - The mechanism enforces this — `status --set` and `status --archive` refuse terminal updates while merge hooks exist with both `pr` and `mod-block` empty, unless `--force`, `merge: local`, or `verdict=rejected` exempts (a rejected entity never ran the merge ceremony). Tagging `mod-block` also lets session resume pick up which mod is blocking. -2. Run merge hooks before local merge, archival, or status advancement. -3. Detect hook completion via the state delta. A hook blocks if (a) `pr` is now set, (b) its prose says to wait for captain approval and the captain has not responded, or (c) it declares an external wait. Otherwise it completed. -4. 
If blocked, leave `mod-block` set, report the pending state, and do not local-merge. -5. If completed without blocking, clear the mod-block in its own `--set` call: - `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=` - Commit: `mod-block: {slug} cleared ({mod_name} completed)`. - The clear MUST be standalone — `status --set` exits 1 if `mod-block=` is combined with `status={terminal}`, `completed`, `verdict`, or `worktree=` in one call. Use two commits, or `--force` with captain approval. -6. If no merge hook handled the merge, perform the default local merge from the stage worktree branch. -7. Update frontmatter: `spacedock status --workflow-dir {workflow_dir} --set {slug} completed verdict={verdict} worktree=`. -8. Archive: `spacedock status --workflow-dir {workflow_dir} --archive {slug}`. -9. Remove the worktree (`git worktree remove {path}`) and delete the local branch (`git branch -d {branch}`). Do NOT delete the remote branch while a PR is pending — the reviewer needs it. Remote cleanup belongs to the PR merge. -10. **Teardown agents at terminal.** Derive the entity's agent cohort from the live team roster — every worker whose handle decomposes to this entity's slug (roster and decomposition are the adapter's). Issue the cooperative-shutdown call (best-effort, fire-and-forget); drop them from session memory. Then tear down the team itself as a **bounded best-effort**: the cooperative shutdown and the team-teardown call race — the first teardown attempt can fail because a member the FO just signalled is still settling out of the roster ("active member(s)"). Do NOT end the turn on that first failure. 
Between attempts the FO MUST let the roster settle — re-issue the cooperative shutdown to any still-named active member, then **wait a short settle interval before the next teardown attempt** rather than re-firing it in the same instant (an instant retry just re-loses the same async registry race — the way a teardown that "retried but raced every time, then stopped" still hangs). Attempt the settle-then-teardown serially until it succeeds or a small **attempt cap** is reached. In an interactive session the roster clears as the member's session-end propagates, so the teardown succeeds on an early attempt and the loop exits naturally. In a non-interactive session (single-entity `-p` mode) an approved-shutdown member can stay listed in the roster indefinitely (an upstream defect), so the teardown can never succeed — `retry to success` there is unreachable and a fast retry loop only re-hangs the subprocess. So on **cap-exhaustion the FO STOPS the teardown attempts and emits a defined terminal-status marker — `TERMINAL_TEARDOWN_BOUNDED: best-effort teardown exhausted; member(s) stuck in registry; holding for launcher.` (verbatim).** The PROCESS EXIT is the **launcher's** responsibility, not the FO's: the FO cannot self-exit while the roster is non-empty, so a non-interactive launcher (the live-e2e cycle's `kill()`, or a real automation's timeout) ends the subprocess once the marker has been emitted. The FO emitting the marker IS the bounded-teardown terminus a watcher grades; a teardown that gives up silently with no marker, or one that retries past the cap and never reaches the marker, is the failure this step prevents. On a subsequent harness re-invocation with the roster still non-empty the FO again runs the bounded best-effort and re-emits the marker; a bounded resume that re-emits the marker is acceptable (the launcher ends the subprocess) — what this step forbids is an UNBOUNDED retry loop that never reaches the marker. 
**Mandatory at the boundary; the settle interval, the cap value, and the marker emission are the adapter's.** +## Merge and Cleanup (deferred module) -### Ship-Local Ceremony - -When the merge boundary has no PR host (README declares `merge: local`, or pr-merge fallback applies — no `gh`, push failed, captain chose local), the FO runs ONE fixed ceremony per entity. The README's top-level `merge:` key (default `pr`) selects this ceremony or the PR path. Happy path uses NO `--force`: - -1. Set the merge mod-block: `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=merge:{mod_name}` (commit path-scoped). -2. Invoke the merge hook (local `--no-ff` merge of `{branch}` onto `next`). -3. Record the merge so the terminal guard is satisfied without `--force`: - - If `merge: local`, the policy exempts the pr-requirement — skip to step 4. - - Otherwise set the post-merge sentinel `spacedock status --workflow-dir {workflow_dir} --set {slug} pr=local-merge:{short-sha}` (the merge commit on `next`; set ONLY after merge has landed; commit path-scoped). The status table renders as `{short-sha} (local)`. -4. Clear the mod-block in a standalone `--set`: `spacedock status --workflow-dir {workflow_dir} --set {slug} mod-block=` (commit path-scoped). MUST be separate from terminalization — the guard refuses combining `mod-block=` with terminal fields. -5. Terminalize: `spacedock status --workflow-dir {workflow_dir} --set {slug} completed verdict={verdict} worktree=`. -6. Archive: `spacedock status --workflow-dir {workflow_dir} --archive {slug}`. -7. Remove worktree, delete local branch (Merge-and-Cleanup step 9), and run the terminal agent teardown (step 10). Teardown is mandatory at the terminal boundary whether the merge ran locally or via a PR host. - -The set→invoke→clear sequence (steps 1, 2, 4) stays mandatory whenever a merge hook is registered, regardless of `merge: local`. 
`--force` is never part of the happy path — if the guard refuses, a step was skipped, not a flag forgotten. - -### Worktree removal safety - -Use `git worktree remove {path}` (no `--force`). The default refuses to delete a worktree with untracked changes — that refusal is the safety net. - -If removal fails on untracked files, the FO MUST: - -1. Audit: `git -C {path} status --short` from the parent worktree. -2. Decide per file: commit to the worktree branch (audit-essential per gitignore), move to a persistent location (experiment-output outside the worktree), or explicitly confirm destruction with the captain. -3. ONLY after the audit, `--force` is permitted. - -`--force` is never default; it is an explicit captain-confirmed bypass. +The terminal merge-and-cleanup ceremony — the set→invoke→clear mod-block sequence, the Ship-Local ceremony, worktree-removal safety, the mod-block enforcement, and the bounded terminal teardown (the `TERMINAL_TEARDOWN_BOUNDED` marker) — lives in the runtime's merge reference, lazily loaded at the terminal boundary. The FO reaches it the same way it reaches `present-gate` / `feedback-rejection-flow`: by naming the load point when an entity reaches its terminal stage. The runtime adapter names the merge reference. A boot, a dispatch, or a gate that never terminalizes never reads it. ## State Management @@ -217,15 +144,11 @@ If removal fails on untracked files, the FO MUST: - Assign entity IDs through `id-style`; validate active plus archived entities before trusting status output. - Commit state changes at dispatch and merge boundaries. -## Worktree Ownership - -- For worktree-backed entities, active stage/status/report/body state — including `### Feedback Cycles` entries — lives in the worktree copy. -- `pr:` mirrors on `main` for startup/discovery. -- Ordinary active-state writes (`implementation -> validation`) do not land on `main`. +The worktree-ownership rules (which active state lives in the worktree copy vs. 
`main`, and the split-root deliverable-isolation contract) travel with the deferred dispatch module — they matter only once a worktree stage dispatches. The concurrency-safe commit / multi-writer sync / rebase-conflict-halt rules below stay boot-resident: the Startup pull-on-boot step fires before any dispatch. -### Split-Root Worktree Contract +### Split-Root State Sync -When the workflow is split-root (README declares `state:` checkout, e.g. `state: .spacedock-state`), a worktree stage isolates **the deliverable work product only**. Entities live in a separate, non-branched state checkout that a worktree of the main repo does not contain. The entity body and stage reports are written and committed to that state checkout at the entity's state-checkout path, **never** a worktree copy — the dispatch helper hands workers that path even under a worktree stage. The worktree still owns the deliverable: working directory, branch, and "commits MUST be on this branch" apply to deliverable-artifact changes only. The `pr:`-mirrored-on-`main` exception is unaffected. +When the workflow is split-root (README declares `state:` checkout, e.g. `state: .spacedock-state`), the state branch is shared via `origin` and committed concurrency-safe. **Concurrency-safe state commits.** The state checkout is a single non-branched git index. A bare `git add -A` / `git commit` sweeps up a sibling writer's staged entity, cross-attributing or clobbering it. Every writer MUST commit concurrency-safe, in preference: @@ -273,26 +196,9 @@ Supported lifecycle points: - `idle` - `merge` -Hooks are additive and run alphabetically by mod filename. - -### Mod-Block Enforcement - -Merge hooks can block (captain approval before pushing, waiting for PR merge). The FO enforces via the entity `mod-block` field and a mechanism-level invariant in `status --set` / `status --archive`: - -- **Set** by the FO before invoking a merge hook: `mod-block=merge:{mod_name}`. 
-- **Cleared** after the blocking action completes or the captain force-overrides. The clear runs in its own `--set` — combining `mod-block=` with terminal fields (`status={terminal}`, `completed`, `verdict`, `worktree=`) is refused without `--force`. -- **Guarded** — `status --set` refuses terminal transitions while `mod-block` is non-empty unless `--force` is passed. -- **Enforced at the mechanism level** — `status --set` and `status --archive` also refuse terminal transitions and archival when merge hooks (`_mods/*.md` with `## Hook: merge`) are registered AND `pr` is empty AND `mod-block` is empty. `--force` bypasses. `merge: local` exempts only the pr-requirement; `verdict=rejected` likewise exempts only the pr-requirement on both surfaces (a rejected entity never ran the merge ceremony, so the requirement is vacuous); the mod-block-pending and combined-clear refusals stay. See the Ship-Local Ceremony. -- **Survives session resume** — the FO reads `mod-block` from frontmatter on boot and resumes the pending action. - -## Standing Teammates - -A **standing teammate** is a long-lived specialist agent (prose polisher, science officer, code reviewer, language translator) declared by a workflow mod with `standing: true`. The FO discovers each at boot via the runtime adapter, defers spawn to the first team-mode dispatch, routes by name, and lets it die with the team at teardown. The four concepts below are load-bearing for every runtime; each adapter realizes (or omits) the mechanics — discovery, layout, routing call, teardown trigger — its own way. +Hooks are additive and run alphabetically by mod filename. The MODS-REPORT at boot reads the boot JSON `mods` map (which hooks are registered at which point) without opening a mod file. The mod-block enforcement that guards a terminal transition travels with the deferred merge module, loaded at terminalization. -- **first-boot-wins** — lifecycle is team-scoped, not workflow-scoped. 
Spawn deferred to first dispatch; when multiple workflows share a team, the first FO to find the member absent spawns it, later workflows skip. How team scope maps onto session lifetime is the runtime's concern. -- **team-scope lifecycle** — the teammate lives in one team and dies at team teardown (session end, explicit delete, captain shutdown). No cross-team handoff, no cross-session persistence. Mid-session death is detected on the next routing attempt; auto-recovery is deferred. -- **routing contract** — address by declared `name`, best-effort and non-blocking: if no reply within the 2-minute timeout, the sender proceeds un-polished/un-reviewed/un-translated. Round-trip latencies of several minutes are normal on long drafts. Routing call is the adapter's (`send_input` on Codex, `SendMessage` on Claude teams). -- **declaration** — one mod file per teammate, frontmatter `standing: true`, with spawn config and verbatim agent-prompt body. On-disk layout and parse rules are the adapter's. +The standing-teammate concepts (first-boot-wins lifecycle, team-scope teardown, the by-name routing contract, the declaration layout) travel with the deferred dispatch module — they apply only once a team exists at first dispatch. ## Clarification and Communication From 0bb7e0252807431686436f9722c9a63a2c39a7ef Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 16:04:58 -0700 Subject: [PATCH 13/18] test(j9): deterministic offline repro of the AC-3 single-entity reviewer-reuse finding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin the END-STATE shape the live single-entity (-p) Claude rejection-flow run produces — two bare validation Agent spawns — without running the model or touching the validator's live run (backlog seed e3z, bare-mode-coverage-baseline). 
The two bare spawns red assertClaudeReviewerReuse on the >1-validation-spawn #141 keepalive violation (the live failure's shape); a team-mode control (one reviewer reused by agentId) passes, proving the red is caused by the extra bare spawn, not an unsatisfiable assertion. Recon for the captain's fix-direction call — no fix applied. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../shared_reviewer_reuse_table_test.go | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/internal/ensigncycle/shared_reviewer_reuse_table_test.go b/internal/ensigncycle/shared_reviewer_reuse_table_test.go index 00b22301..8da19c06 100644 --- a/internal/ensigncycle/shared_reviewer_reuse_table_test.go +++ b/internal/ensigncycle/shared_reviewer_reuse_table_test.go @@ -1,6 +1,9 @@ package ensigncycle -import "testing" +import ( + "strings" + "testing" +) // Offline table tests for the host-specific reviewer-reuse assertions. They prove // each assertion requires a REAL reuse tool call targeting the validation reviewer @@ -90,6 +93,72 @@ func TestAssertClaudeReviewerReuse(t *testing.T) { } } +// TestSingleEntityBareReviewerNonReuseRepro is the DETERMINISTIC, offline, +// model-free repro of the AC-3 `rejection-flow` finding (backlog seed e3z, +// bare-mode-coverage-baseline). It does NOT run the live scenario — it pins the +// END-STATE shape the live single-entity (`-p`) Claude run produces, so the +// causality can be reasoned about without spending a model or touching the +// validator's live run. +// +// Root cause (verified in the contract, not assumed): +// - The `rejection-flow` scenario launches `spacedock claude -- -p {prompt}` and +// the prompt says "Process only the entity `rejection-task`". Both single-entity +// activation conditions hold (non-interactive `claude -p` AND the prompt names a +// specific entity — first-officer-shared-core.md `## Single-Entity Mode`). 
+// - In single-entity mode the contract mandates bare dispatch: "In single-entity +// mode, skip team creation. Use bare-mode dispatch for all agent spawning" +// (claude-fo-dispatch.md). This clause predates P2 (it has lived since the +// original vendoring `83c73494`), so single-entity Claude reviewers are bare +// both before and after lazy-TeamCreate. +// - A bare reviewer fails Claude reuse-condition-1, "Not in bare mode (teams +// available)" (claude-fo-dispatch.md). With no team, `dispatch build --bare-mode` +// emits an Agent call with `name`/`team_name` ABSENT (build.go: Name/TeamName are +// *string omitempty; the bare-mode parity case pins it), so there is no kept-alive +// handle to SendMessage at all. The cycle-2 re-review therefore fresh-dispatches a +// SECOND validation Agent → exactly the >1-validation-spawn the assertion reds on. +// +// The two bare validation spawns below carry NO `id`/agentId-returning tool_result +// (a bare worker is not a team member, so it returns no `agentId:` resume handle). +// assertClaudeReviewerReuse reds on the spawn COUNT (>1) — the #141 keepalive +// violation — independent of any handle, which is exactly the live failure's shape. +// +// NOTE the contrast with Codex: the Codex `rejection-flow` reuses via `send_input` +// to a persistent thread (codex-first-officer-runtime.md: "no team_name lifecycle … +// use Codex task names and mailbox notifications as the worker handle"), which does +// NOT depend on a Claude team, so single-entity Codex reuse is unaffected. The +// finding is Claude-specific — it is the team-gated reuse-condition-1 meeting +// single-entity bare mode. +func TestSingleEntityBareReviewerNonReuseRepro(t *testing.T) { + // Two BARE validation Agent spawns (no team_name, no agentId resume handle), + // the deterministic end-state of a single-entity `-p` rejection-flow run: the FO + // fresh-dispatches the cycle-2 validator because the bare cycle-1 reviewer fails + // reuse-condition-1. 
+ bareCycle1Validation := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_BV1","input":{"description":"Rejection Task: validation","subagent_type":"spacedock:ensign"}}]}}` + bareCycle2Validation := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_BV2","input":{"description":"Rejection Task: validation (cycle 2 fresh)","subagent_type":"spacedock:ensign"}}]}}` + bareTwoSpawnStream := bareCycle1Validation + "\n" + bareCycle2Validation + + err := assertClaudeReviewerReuse(bareTwoSpawnStream) + if err == nil { + t.Fatal("single-entity bare two-validation-spawn end-state must RED assertClaudeReviewerReuse — this is the AC-3 finding's deterministic shape") + } + if !strings.Contains(err.Error(), "FRESH-dispatched the cycle-2 validator") { + t.Fatalf("the repro must red on the #141 keepalive violation (>1 validation spawn), got a different failure: %v", err) + } + + // Falsifiability control: the SAME run in TEAM mode (a single kept-alive cycle-1 + // reviewer reused by agentId) PASSES — proving the repro's red is caused by the + // extra bare spawn, not by the assertion being unsatisfiable. This is the + // before/after the fix would restore (option (a): give the -p feedback flow a + // team so the reviewer is reusable). 
+ teamCycle1Spawn := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_TV","input":{"description":"Rejection Task: validation","subagent_type":"spacedock:ensign"}}]}}` + teamCycle1Result := `{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"toolu_TV","content":[{"type":"text","text":"agentId: a1111deadbeef0 (use SendMessage with to: 'a1111deadbeef0')"}]}]}}` + teamReuse := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"a1111deadbeef0","message":"re-review: fix marker present"}}]}}` + teamModeStream := teamCycle1Spawn + "\n" + teamCycle1Result + "\n" + teamReuse + if err := assertClaudeReviewerReuse(teamModeStream); err != nil { + t.Fatalf("the team-mode control (one reviewer reused by agentId) must PASS — else the repro's red is not attributable to the extra bare spawn: %v", err) + } +} + func TestAssertCodexReviewerReuse(t *testing.T) { // The real Codex collab_tool_call shape. A spawn_agent dispatching the validation // stage binds the reviewer's thread id (vThread); a later send_input to vThread is From 364ee4f150ea9ec7bbb73e6e28c62b1872dbb359 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 16:27:12 -0700 Subject: [PATCH 14/18] fix(j9): correct rejection-flow to the contract-correct single-entity behavior (AC-3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Option (b) per the captain: the live rejection-flow runs `spacedock claude -- -p` naming one entity, so it is single-entity → bare. The old assertClaudeReviewerReuse encoded a TEAM-mode #141 keepalive a `-p` run can never satisfy (the bare cycle-1 reviewer hard-fails reuse-condition-1). The contract already makes the bare flow deterministic: "Feedback Rejection Flow (bare mode) … sequential: dispatch fix agent … then dispatch reviewer" (claude-fo-dispatch.md, a rule that predates the P1 split). 
So the contract-correct end-state is two distinct fresh validation spawns with the fix agent and reviewer as separate dispatches. - New assertClaudeSingleEntityRejectionFlow: >=2 distinct validation spawns AND no impl-as-validator (a SendMessage to an implementation worker to re-review). This catches BOTH observed non-deterministic live shapes — the 2-fresh-spawns run (PASS) and the impl-reused-through-validation run (FAIL). - Claude runner points at it instead of the team-mode assertClaudeReviewerReuse. - Shared prompt: drop "REUSE the kept-alive validation reviewer" (which drove the impl-as-validator hack) for a contract-faithful "follow your contract's feedback flow; fix agent and reviewer are separate; no self-review" — host-neutral, so Codex's contract-valid thread reuse (no team requirement) is unaffected. No contract files touched — the determinism rests on the pre-existing bare-mode rule; only the test encoded a wrong team-mode assumption. go test ./... 1334 pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ensigncycle/claude_live_runner_test.go | 16 ++- internal/ensigncycle/shared_fixtures_test.go | 2 +- .../shared_reviewer_reuse_table_test.go | 121 +++++++++--------- .../ensigncycle/shared_reviewer_reuse_test.go | 75 +++++++++++ 4 files changed, 147 insertions(+), 67 deletions(-) diff --git a/internal/ensigncycle/claude_live_runner_test.go b/internal/ensigncycle/claude_live_runner_test.go index b1f36708..e336bfad 100644 --- a/internal/ensigncycle/claude_live_runner_test.go +++ b/internal/ensigncycle/claude_live_runner_test.go @@ -187,12 +187,16 @@ func runClaudeRejectionFlowScenario(t *testing.T, runner claudeLiveRunner, scena if err := assertRejectionFlow(after, result.finalMessage+"\n"+result.stream); err != nil { t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", err, result.finalMessage, result.artifactDir) } - // AC-4 reviewer-reuse: on Claude teams the FO must reuse the kept-alive - // validation reviewer via a SendMessage tool call for the 
cycle-2 re-review, - // not dispatch a fresh one (the #141 keepalive contract the Go port dropped). - // Host-specific producer signal, graded by the runner — not the shared - // host-neutral assertion. - if err := assertClaudeReviewerReuse(result.stream); err != nil { + // Single-entity (`-p`) reviewer producer-signal. The Claude runner launches + // `spacedock claude -- -p {prompt}` with a prompt naming one entity, so the run + // is single-entity → bare; the contract's bare-mode feedback flow is sequential + // fresh dispatch, so the cycle-2 re-review is a DISTINCT freshly-dispatched + // validation worker (not a reuse of the bare cycle-1 reviewer, not the impl + // worker serving as its own validator). assertClaudeReviewerReuse encoded a + // team-mode keepalive a `-p` run can never satisfy (the AC-3 finding); the + // contract-correct single-entity assertion is used here. The team-mode + // reviewer-reuse question is the spun-off option-(a) task. + if err := assertClaudeSingleEntityRejectionFlow(result.stream); err != nil { t.Fatalf("%v\nArtifacts: %s", err, result.artifactDir) } emitClaudeScenarioMetrics(t, scenario, result, runner.model) diff --git a/internal/ensigncycle/shared_fixtures_test.go b/internal/ensigncycle/shared_fixtures_test.go index e0b8d2d9..3ba070cb 100644 --- a/internal/ensigncycle/shared_fixtures_test.go +++ b/internal/ensigncycle/shared_fixtures_test.go @@ -136,7 +136,7 @@ func rejectionPrompt() string { "Use $spacedock:first-officer for this whole run.", "Workflow directory: .", "Process only the entity `rejection-task`, which starts at implementation, through a full two-cycle rejection feedback flow.", - "Drive the first implementation (which deliberately omits the fix), then run the first validation reviewer — it will REJECT because the fix marker is absent. 
Route that concrete finding back to the implementation target, wait for the rework to apply the fix, then re-run validation for a second cycle and record `- Cycle 2: PASSED` per the workflow README. For the second-cycle re-review, REUSE the kept-alive validation reviewer rather than dispatching a fresh one.", + "Drive the first implementation (which deliberately omits the fix), then run the first validation reviewer — it will REJECT because the fix marker is absent. Route that concrete finding back to the implementation target, wait for the rework to apply the fix, then re-run validation for a second cycle and record `- Cycle 2: PASSED` per the workflow README. For the second-cycle re-review, follow your contract's feedback flow for the current dispatch mode: the implementation rework and the validation re-review are SEPARATE workers — the worker that applied the fix must never review its own rework.", "Do not advance the entity to done. Your final response must mention the first-cycle rejection and the second-cycle re-validation result.", ) } diff --git a/internal/ensigncycle/shared_reviewer_reuse_table_test.go b/internal/ensigncycle/shared_reviewer_reuse_table_test.go index 8da19c06..cbecddce 100644 --- a/internal/ensigncycle/shared_reviewer_reuse_table_test.go +++ b/internal/ensigncycle/shared_reviewer_reuse_table_test.go @@ -1,9 +1,6 @@ package ensigncycle -import ( - "strings" - "testing" -) +import "testing" // Offline table tests for the host-specific reviewer-reuse assertions. They prove // each assertion requires a REAL reuse tool call targeting the validation reviewer @@ -93,69 +90,73 @@ func TestAssertClaudeReviewerReuse(t *testing.T) { } } -// TestSingleEntityBareReviewerNonReuseRepro is the DETERMINISTIC, offline, -// model-free repro of the AC-3 `rejection-flow` finding (backlog seed e3z, -// bare-mode-coverage-baseline). 
It does NOT run the live scenario — it pins the -// END-STATE shape the live single-entity (`-p`) Claude run produces, so the -// causality can be reasoned about without spending a model or touching the -// validator's live run. +// TestClaudeSingleEntityRejectionFlow is the CONTRACT-correct single-entity (`-p`) +// reviewer producer-signal table (the option-(b) correction of the AC-3 finding, +// backlog seed e3z). It pins the deterministic bare-mode end-state — two distinct +// fresh validation spawns, fix-agent and reviewer separate — over committed, +// model-free transcripts, including the TWO observed non-deterministic live shapes +// (2-fresh-spawns and impl-reused-through-validation), so the validator's live +// re-run has an offline oracle. // // Root cause (verified in the contract, not assumed): -// - The `rejection-flow` scenario launches `spacedock claude -- -p {prompt}` and -// the prompt says "Process only the entity `rejection-task`". Both single-entity -// activation conditions hold (non-interactive `claude -p` AND the prompt names a -// specific entity — first-officer-shared-core.md `## Single-Entity Mode`). -// - In single-entity mode the contract mandates bare dispatch: "In single-entity -// mode, skip team creation. Use bare-mode dispatch for all agent spawning" -// (claude-fo-dispatch.md). This clause predates P2 (it has lived since the -// original vendoring `83c73494`), so single-entity Claude reviewers are bare -// both before and after lazy-TeamCreate. -// - A bare reviewer fails Claude reuse-condition-1, "Not in bare mode (teams -// available)" (claude-fo-dispatch.md). With no team, `dispatch build --bare-mode` -// emits an Agent call with `name`/`team_name` ABSENT (build.go: Name/TeamName are -// *string omitempty; the bare-mode parity case pins it), so there is no kept-alive -// handle to SendMessage at all. 
The cycle-2 re-review therefore fresh-dispatches a -// SECOND validation Agent → exactly the >1-validation-spawn the assertion reds on. +// - The Claude runner launches `spacedock claude -- -p {prompt}` and the prompt +// names one entity, so the run is single-entity → bare (first-officer-shared- +// core.md `## Single-Entity Mode`; claude-fo-dispatch.md "In single-entity mode, +// skip team creation. Use bare-mode dispatch for all agent spawning"). That +// clause predates P2 (since the original vendoring `83c73494`), so the single- +// entity reviewer is bare both before and after lazy-TeamCreate — the contradiction +// is pre-existing; the old assertClaudeReviewerReuse encoded a team-mode +// assumption the `-p` run can never satisfy. +// - In bare mode the contract makes the flow DETERMINISTIC and SEQUENTIAL +// (claude-fo-dispatch.md `## Feedback Rejection Flow (bare mode)`: "dispatch fix +// agent (wait for completion), then dispatch reviewer (wait for completion)"). +// So the contract-correct end-state is two distinct fresh validation spawns with +// the fix agent and reviewer as SEPARATE dispatches — which is exactly what +// assertClaudeSingleEntityRejectionFlow asserts. // -// The two bare validation spawns below carry NO `id`/agentId-returning tool_result -// (a bare worker is not a team member, so it returns no `agentId:` resume handle). -// assertClaudeReviewerReuse reds on the spawn COUNT (>1) — the #141 keepalive -// violation — independent of any handle, which is exactly the live failure's shape. -// -// NOTE the contrast with Codex: the Codex `rejection-flow` reuses via `send_input` -// to a persistent thread (codex-first-officer-runtime.md: "no team_name lifecycle … -// use Codex task names and mailbox notifications as the worker handle"), which does -// NOT depend on a Claude team, so single-entity Codex reuse is unaffected. The -// finding is Claude-specific — it is the team-gated reuse-condition-1 meeting -// single-entity bare mode. 
-func TestSingleEntityBareReviewerNonReuseRepro(t *testing.T) { - // Two BARE validation Agent spawns (no team_name, no agentId resume handle), - // the deterministic end-state of a single-entity `-p` rejection-flow run: the FO - // fresh-dispatches the cycle-2 validator because the bare cycle-1 reviewer fails - // reuse-condition-1. +// NOTE the Codex contrast: Codex has no team registry, so its reviewer reuse via +// `send_input` to a persistent thread is contract-valid even in single-entity +// (context-dependent reuse — codex-first-officer-runtime.md `## Reuse And Feedback +// Routing`); assertCodexReviewerReuse stays correct for the Codex `-p` run. The +// Claude/Codex single-entity behaviors legitimately differ. +func TestClaudeSingleEntityRejectionFlow(t *testing.T) { + // CONTRACT-CORRECT (live Run 1): two distinct fresh validation spawns (cycle-1 + + // cycle-2), the bare-mode-sequential end-state — PASS. bareCycle1Validation := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_BV1","input":{"description":"Rejection Task: validation","subagent_type":"spacedock:ensign"}}]}}` bareCycle2Validation := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_BV2","input":{"description":"Rejection Task: validation (cycle 2 fresh)","subagent_type":"spacedock:ensign"}}]}}` - bareTwoSpawnStream := bareCycle1Validation + "\n" + bareCycle2Validation + twoFreshSpawns := bareCycle1Validation + "\n" + bareCycle2Validation - err := assertClaudeReviewerReuse(bareTwoSpawnStream) - if err == nil { - t.Fatal("single-entity bare two-validation-spawn end-state must RED assertClaudeReviewerReuse — this is the AC-3 finding's deterministic shape") - } - if !strings.Contains(err.Error(), "FRESH-dispatched the cycle-2 validator") { - t.Fatalf("the repro must red on the #141 keepalive violation (>1 validation spawn), got a different failure: %v", err) - } + // VIOLATION (live Run 2): only the cycle-1 validation 
spawn, then the cycle-2 + // re-review collapsed onto the implementation worker via SendMessage — the + // impl-as-validator shape. Must FAIL. + implRework := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_BI","input":{"description":"Rejection Task: implementation rework","subagent_type":"spacedock:ensign"}}]}}` + implAsValidator := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"spacedock-ensign-rejection-task-implementation","message":"now validate your own rework"}}]}}` + implReusedThroughValidation := bareCycle1Validation + "\n" + implRework + "\n" + implAsValidator - // Falsifiability control: the SAME run in TEAM mode (a single kept-alive cycle-1 - // reviewer reused by agentId) PASSES — proving the repro's red is caused by the - // extra bare spawn, not by the assertion being unsatisfiable. This is the - // before/after the fix would restore (option (a): give the -p feedback flow a - // team so the reviewer is reusable). 
- teamCycle1Spawn := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_TV","input":{"description":"Rejection Task: validation","subagent_type":"spacedock:ensign"}}]}}` - teamCycle1Result := `{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"toolu_TV","content":[{"type":"text","text":"agentId: a1111deadbeef0 (use SendMessage with to: 'a1111deadbeef0')"}]}]}}` - teamReuse := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"a1111deadbeef0","message":"re-review: fix marker present"}}]}}` - teamModeStream := teamCycle1Spawn + "\n" + teamCycle1Result + "\n" + teamReuse - if err := assertClaudeReviewerReuse(teamModeStream); err != nil { - t.Fatalf("the team-mode control (one reviewer reused by agentId) must PASS — else the repro's red is not attributable to the extra bare spawn: %v", err) + // VIOLATION: only one validation spawn, no second re-review at all. Must FAIL on + // the spawn-count check. 
+ onlyCycle1 := bareCycle1Validation + + cases := []struct { + name string + stream string + wantErr bool + }{ + {"contract-correct two fresh validation spawns (bare-mode sequential)", twoFreshSpawns, false}, + {"impl reused through validation (impl-as-validator) must RED", implReusedThroughValidation, true}, + {"only the cycle-1 validation spawn (no re-review) must RED", onlyCycle1, true}, + {"empty stream must RED", "", true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := assertClaudeSingleEntityRejectionFlow(tc.stream) + if tc.wantErr && err == nil { + t.Fatalf("expected error for %q, got nil", tc.name) + } + if !tc.wantErr && err != nil { + t.Fatalf("expected pass for %q, got: %v", tc.name, err) + } + }) } } diff --git a/internal/ensigncycle/shared_reviewer_reuse_test.go b/internal/ensigncycle/shared_reviewer_reuse_test.go index e8abb997..b6b597ff 100644 --- a/internal/ensigncycle/shared_reviewer_reuse_test.go +++ b/internal/ensigncycle/shared_reviewer_reuse_test.go @@ -125,6 +125,81 @@ func assertClaudeReviewerReuse(stream string) error { return fmt.Errorf("the FO spawned exactly one validation reviewer but sent it no reuse SendMessage (by name or agentId) for the cycle-2 re-review") } +// assertClaudeSingleEntityRejectionFlow is the single-entity (`-p`) Claude +// producer-signal assertion for the rejection-flow scenario. The Claude runner +// launches `spacedock claude -- -p {prompt}` with a prompt naming one entity, so +// the run is single-entity → bare (claude-fo-dispatch.md: "In single-entity mode, +// skip team creation. Use bare-mode dispatch for all agent spawning"). In bare mode +// the contract makes the feedback flow DETERMINISTIC and SEQUENTIAL +// (claude-fo-dispatch.md `## Feedback Rejection Flow (bare mode)`: "dispatch fix +// agent (wait for completion), then dispatch reviewer (wait for completion)"). 
So +// the contract-correct end-state is: the cycle-2 re-review is a DISTINCT, FRESHLY +// DISPATCHED validation worker — NOT a reuse of the bare cycle-1 reviewer (reuse- +// condition-1 hard-fails in bare mode), and NOT the implementation worker serving as +// its own validator (the fix agent and the reviewer are separate sequential +// dispatches). It enforces BOTH halves, because either alone false-passes a wrong +// run: +// +// 1. AT LEAST TWO distinct validation-stage Agent/Task spawns. The bare flow +// fresh-dispatches a validation reviewer for cycle-1 AND a fresh one for cycle-2 +// (no reuse handle exists). A run with fewer than two validation spawns either +// never re-reviewed or collapsed the cycle-2 re-review onto a non-validation +// worker — both forbidden. This is the discriminator that catches the observed +// non-deterministic "reused the impl ensign through validation" run (which left +// only the cycle-1 validation spawn). +// 2. The cycle-2 re-review is NOT routed to an implementation worker. A SendMessage +// to an `…-implementation` handle telling it to validate is the impl-as-validator +// violation; the re-review must be a validation-stage spawn, not a message to the +// fix worker. +// +// This is the CONTRACT-correct single-entity assertion. It replaces the team-mode +// assertClaudeReviewerReuse for this scenario's `-p` run, which wrongly assumed a +// kept-alive reviewer the bare contract cannot produce (the AC-3 finding). The real +// reviewer-continuity question — whether single-entity SHOULD create a team so the +// reviewer is reusable — is the spun-off option-(a) task, not this correction. 
+func assertClaudeSingleEntityRejectionFlow(stream string) error { + validationSpawnCount := 0 + for _, line := range strings.Split(stream, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var entry struct { + Message *struct { + Content []struct { + Type string `json:"type"` + Name string `json:"name"` + Input struct { + Description string `json:"description"` + To string `json:"to"` + } `json:"input"` + } `json:"content"` + } `json:"message"` + } + if err := json.Unmarshal([]byte(line), &entry); err != nil || entry.Message == nil { + continue + } + for _, block := range entry.Message.Content { + if block.Type != "tool_use" { + continue + } + desc := strings.ToLower(block.Input.Description) + if (block.Name == "Agent" || block.Name == "Task") && strings.Contains(desc, "validation") { + validationSpawnCount++ + } + // The impl-as-validator violation: a SendMessage telling an + // implementation-named worker to validate / re-review. + if block.Name == "SendMessage" && strings.Contains(strings.ToLower(block.Input.To), "implementation") { + return fmt.Errorf("the cycle-2 re-review was routed to an implementation worker (%q) — the fix agent and the reviewer must be SEPARATE sequential dispatches in bare mode; the impl worker must never serve as its own validator", block.Input.To) + } + } + } + if validationSpawnCount < 2 { + return fmt.Errorf("single-entity bare rejection-flow produced %d validation-stage spawns, want >= 2 (a fresh cycle-1 reviewer AND a fresh cycle-2 reviewer — bare mode cannot reuse a kept-alive reviewer, so each cycle fresh-dispatches a distinct validation worker)", validationSpawnCount) + } + return nil +} + // codexCollabItem is one `codex exec --json` stream item. 
Codex surfaces its // multi-agent calls as `collab_tool_call` items (tool = spawn_agent / send_input / // wait / close_agent); the worker is addressed by opaque `receiver_thread_ids`, From fe822a9f23155a071d375e021ee925fc395ddbda Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 17:36:18 -0700 Subject: [PATCH 15/18] fix(j9): host-neutral the rejection-flow prompt (Codex AC-3 regression) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cycle-1 dropped "REUSE the kept-alive reviewer" entirely — correct for Claude (bare -p, no team, can't reuse) but it broke Codex, which CAN reuse via a persistent send_input thread and was green doing so. Make the prompt host-CONDITIONAL (captain option a): route the cycle-2 re-review to the kept-alive reviewer IF the host supports reusing it across the feedback cycle, otherwise dispatch fresh. Claude → fresh (satisfies assertClaudeSingleEntityRejectionFlow); Codex → reuses (satisfies the unchanged assertCodexReviewerReuse). Both hosts contract-correct, neither assertion relaxed. The separate-workers / no-self-review guard stays. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/ensigncycle/shared_fixtures_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/ensigncycle/shared_fixtures_test.go b/internal/ensigncycle/shared_fixtures_test.go index 3ba070cb..f02f2060 100644 --- a/internal/ensigncycle/shared_fixtures_test.go +++ b/internal/ensigncycle/shared_fixtures_test.go @@ -136,7 +136,7 @@ func rejectionPrompt() string { "Use $spacedock:first-officer for this whole run.", "Workflow directory: .", "Process only the entity `rejection-task`, which starts at implementation, through a full two-cycle rejection feedback flow.", - "Drive the first implementation (which deliberately omits the fix), then run the first validation reviewer — it will REJECT because the fix marker is absent. 
Route that concrete finding back to the implementation target, wait for the rework to apply the fix, then re-run validation for a second cycle and record `- Cycle 2: PASSED` per the workflow README. For the second-cycle re-review, follow your contract's feedback flow for the current dispatch mode: the implementation rework and the validation re-review are SEPARATE workers — the worker that applied the fix must never review its own rework.", + "Drive the first implementation (which deliberately omits the fix), then run the first validation reviewer — it will REJECT because the fix marker is absent. Route that concrete finding back to the implementation target, wait for the rework to apply the fix, then re-run validation for a second cycle and record `- Cycle 2: PASSED` per the workflow README. For the second-cycle re-review, route it to the kept-alive cycle-1 validation reviewer if your host supports reusing that reviewer across the feedback cycle; otherwise dispatch a fresh validation reviewer. Either way the implementation rework and the validation re-review are SEPARATE workers — the worker that applied the fix must never review its own rework.", "Do not advance the entity to done. Your final response must mention the first-cycle rejection and the second-cycle re-validation result.", ) } From 073e119ead82c7f0673c1893c5e322dfbf7ee067 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 17:41:05 -0700 Subject: [PATCH 16/18] fix(j9): merge multi-delta tool_use so AC-2 catches a later-delta TeamCreate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ParseClaudeTurns deduped by message id taking only the FIRST delta (continue-on- seen-id). But real runner streams are multi-delta: delta[0] is thinking/text and the tool_use block lands on a LATER delta (verified against the committed real captures — sonnet_teamdelete_hang has TeamCreate on delta[1], TeamDelete on delta[2]). 
So assertNoTeamCreateBeforeGreet read tools from the wrong row and could not see a TeamCreate — the lazy-TeamCreate proof was hollow, hidden by the hand-written single-delta fixtures. Fix: merge every delta's tool_use names into the turn (dedup by the tool_use block's unique id so a repeated delta doesn't double-count); usage is identical across deltas, so the first-delta usage is kept. Regenerate the AC-2/AC-6 fixtures as multi-delta (thinking delta + tool_use/text delta per message), trim a real multi-delta capture (claude_multidelta_team.stream.jsonl) for the journeymetrics test, and add a positive control: a pre-greet TeamCreate on a later delta now makes assertNoTeamCreateBeforeGreet RED (it false-passed before). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../shallow_boot_measure_unit_test.go | 27 ++++++++-- .../testdata/eager-team-boot.stream.jsonl | 2 + .../testdata/shallow-boot-greet.stream.jsonl | 2 + internal/journeymetrics/claude.go | 44 +++++++++++----- internal/journeymetrics/claude_test.go | 51 +++++++++++++++++++ .../claude_multidelta_team.stream.jsonl | 7 +++ 6 files changed, 116 insertions(+), 17 deletions(-) create mode 100644 internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl diff --git a/internal/ensigncycle/shallow_boot_measure_unit_test.go b/internal/ensigncycle/shallow_boot_measure_unit_test.go index 01ac673e..d438e027 100644 --- a/internal/ensigncycle/shallow_boot_measure_unit_test.go +++ b/internal/ensigncycle/shallow_boot_measure_unit_test.go @@ -39,15 +39,34 @@ func TestAssertShallowBootMeasuredOffline(t *testing.T) { } // TestAssertNoTeamCreateBeforeGreetOffline validates the AC-2 behavioral oracle -// against the committed streams: the shallow-boot positive (no TeamCreate at all) -// passes; the eager-team negative (a TeamCreate before the greet) fails — the -// negative control proving the assertion distinguishes the two. 
+// against the committed MULTI-DELTA streams: the shallow-boot positive (no +// TeamCreate at all) passes; the eager-team negative (a TeamCreate before the greet) +// fails. Both fixtures are multi-delta — each message carries a `thinking` delta then +// a tool_use/text delta, the real runner shape — so the negative control genuinely +// exercises the path where the TeamCreate lands on a LATER delta. A first-delta-only +// parse would have FALSE-PASSED this negative (the hollow-AC-2 defect the forensics +// caught), so this is the positive control that the lazy-TeamCreate proof is real. func TestAssertNoTeamCreateBeforeGreetOffline(t *testing.T) { if err := assertNoTeamCreateBeforeGreet(readMeasureFixture(t, "shallow-boot-greet.stream.jsonl")); err != nil { t.Fatalf("shallow-boot positive fixture (no TeamCreate) must pass AC-2: %v", err) } if err := assertNoTeamCreateBeforeGreet(readMeasureFixture(t, "eager-team-boot.stream.jsonl")); err == nil { - t.Fatal("eager-team negative fixture (TeamCreate before greet) must FAIL AC-2") + t.Fatal("eager-team negative fixture (multi-delta TeamCreate before greet) must FAIL AC-2 — the TeamCreate lands on a later delta, so a first-delta-only parse would false-pass") + } +} + +// TestAssertNoTeamCreateBeforeGreetCatchesLaterDeltaTeamCreate is the AC-2 +// positive control over the parser fix: a stream whose TeamCreate is on a LATER +// delta of its message (the real runner shape, NOT the synthetic single-delta one) +// must make assertNoTeamCreateBeforeGreet RED. Before the multi-delta merge this +// false-passed — the proof of no-team-at-boot was hollow. +func TestAssertNoTeamCreateBeforeGreetCatchesLaterDeltaTeamCreate(t *testing.T) { + // msg_team: thinking on delta[0], TeamCreate on delta[1]; then a text greet. 
+ stream := `{"type":"assistant","message":{"id":"msg_team","model":"claude-opus-4-8","usage":{"input_tokens":8,"cache_read_input_tokens":16000,"cache_creation_input_tokens":89000},"content":[{"type":"thinking","thinking":"create the team"}]}} +{"type":"assistant","message":{"id":"msg_team","model":"claude-opus-4-8","usage":{"input_tokens":8,"cache_read_input_tokens":16000,"cache_creation_input_tokens":89000},"content":[{"type":"tool_use","id":"toolu_tc","name":"TeamCreate","input":{"team_name":"eager"}}]}} +{"type":"assistant","message":{"id":"msg_greet","model":"claude-opus-4-8","usage":{"input_tokens":100,"cache_read_input_tokens":5000,"cache_creation_input_tokens":0},"content":[{"type":"text","text":"Gate review: ... Decision: approve or reject?"}]}}` + if err := assertNoTeamCreateBeforeGreet(stream); err == nil { + t.Fatal("a pre-greet TeamCreate on a LATER delta must make assertNoTeamCreateBeforeGreet RED — the parser must merge later-delta tool_use, not read only the first delta") } } diff --git a/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl b/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl index 649ee159..882b8cc8 100644 --- a/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl +++ b/internal/ensigncycle/testdata/eager-team-boot.stream.jsonl @@ -1,5 +1,7 @@ {"type":"system","subtype":"init","model":"claude-opus-4-8"} +{"type":"assistant","message":{"id":"msg_boot1","model":"claude-opus-4-8","usage":{"input_tokens":4,"output_tokens":12,"cache_creation_input_tokens":1200,"cache_read_input_tokens":8000},"content":[{"type":"thinking","thinking":"check the version"}]}} {"type":"assistant","message":{"id":"msg_boot1","model":"claude-opus-4-8","usage":{"input_tokens":4,"output_tokens":12,"cache_creation_input_tokens":1200,"cache_read_input_tokens":8000},"content":[{"type":"tool_use","id":"toolu_ver","name":"Bash","input":{"command":"spacedock --version"}}]}} 
+{"type":"assistant","message":{"id":"msg_team","model":"claude-opus-4-8","usage":{"input_tokens":8,"output_tokens":30,"cache_creation_input_tokens":89000,"cache_read_input_tokens":16000},"content":[{"type":"thinking","thinking":"creating the team to dispatch"}]}} {"type":"assistant","message":{"id":"msg_team","model":"claude-opus-4-8","usage":{"input_tokens":8,"output_tokens":30,"cache_creation_input_tokens":89000,"cache_read_input_tokens":16000},"content":[{"type":"tool_use","id":"toolu_team","name":"TeamCreate","input":{"team_name":"eager"}}]}} {"type":"assistant","message":{"id":"msg_greet","model":"claude-opus-4-8","usage":{"input_tokens":300,"output_tokens":400,"cache_creation_input_tokens":1000,"cache_read_input_tokens":105000},"content":[{"type":"text","text":"Workflow overview: ... Gate review: ... Decision: approve or reject?"}]}} {"type":"result","subtype":"success","usage":{"input_tokens":320,"output_tokens":472,"cache_creation_input_tokens":91200,"cache_read_input_tokens":129000},"total_cost_usd":0.31,"result":"Workflow overview: ... 
Decision: approve or reject?"} diff --git a/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl b/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl index 7b715ba0..20a0b609 100644 --- a/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl +++ b/internal/ensigncycle/testdata/shallow-boot-greet.stream.jsonl @@ -1,5 +1,7 @@ {"type":"system","subtype":"init","model":"claude-opus-4-8"} +{"type":"assistant","message":{"id":"msg_boot1","model":"claude-opus-4-8","usage":{"input_tokens":4,"output_tokens":12,"cache_creation_input_tokens":1200,"cache_read_input_tokens":8000},"content":[{"type":"thinking","thinking":"check the version"}]}} {"type":"assistant","message":{"id":"msg_boot1","model":"claude-opus-4-8","usage":{"input_tokens":4,"output_tokens":12,"cache_creation_input_tokens":1200,"cache_read_input_tokens":8000},"content":[{"type":"tool_use","id":"toolu_ver","name":"Bash","input":{"command":"spacedock --version"}}]}} +{"type":"assistant","message":{"id":"msg_boot2","model":"claude-opus-4-8","usage":{"input_tokens":6,"output_tokens":40,"cache_creation_input_tokens":900,"cache_read_input_tokens":18000},"content":[{"type":"thinking","thinking":"read the boot state"}]}} {"type":"assistant","message":{"id":"msg_boot2","model":"claude-opus-4-8","usage":{"input_tokens":6,"output_tokens":40,"cache_creation_input_tokens":900,"cache_read_input_tokens":18000},"content":[{"type":"tool_use","id":"toolu_boot","name":"Bash","input":{"command":"spacedock status --boot --json"}}]}} {"type":"assistant","message":{"id":"msg_greet","model":"claude-opus-4-8","usage":{"input_tokens":120,"output_tokens":300,"cache_creation_input_tokens":400,"cache_read_input_tokens":42000},"content":[{"type":"text","text":"Workflow overview: 1 task at the review gate. Gate review: ... 
Decision: approve or reject?"}]}} {"type":"result","subtype":"success","usage":{"input_tokens":130,"output_tokens":352,"cache_creation_input_tokens":2500,"cache_read_input_tokens":68000},"total_cost_usd":0.04,"result":"Workflow overview: 1 task at the review gate. Gate review: ... Decision: approve or reject?"} diff --git a/internal/journeymetrics/claude.go b/internal/journeymetrics/claude.go index decd45b9..0da74015 100644 --- a/internal/journeymetrics/claude.go +++ b/internal/journeymetrics/claude.go @@ -134,17 +134,25 @@ func (t ClaudeTurn) Context() int { return t.Usage.Input + t.Usage.CacheRead + t.Usage.CacheCreation } -// ParseClaudeTurns walks the stream-json transcript per assistant turn, deduping -// repeated rows by message ID the same way ParseClaudeJSONL does, and returns one -// ClaudeTurn per distinct assistant message in stream order. It reuses the -// rawTokenUsage field extraction; it does NOT sum or prefer the terminal result -// usage, so each turn's context window is recoverable. Non-JSON lines (folded -// stderr) are skipped, matching ParseClaudeJSONL. +// ParseClaudeTurns walks the stream-json transcript per assistant turn and returns +// one ClaudeTurn per distinct assistant message in stream order. Real runner streams +// are MULTI-DELTA: the same message id appears on several `assistant` rows, the first +// delta carries a `thinking`/`text` block, and the tool_use block(s) land on LATER +// deltas; the per-delta `usage` is identical across the deltas of a message. So this +// MERGES every delta's tool_use names into the turn (a first-delta-only parse drops +// the tool_use entirely — which would make a TeamCreate invisible) while keeping the +// first delta's usage. It reuses the rawTokenUsage field extraction; it does NOT sum +// or prefer the terminal result usage, so each turn's context window is recoverable. +// Non-JSON lines (folded stderr) are skipped, matching ParseClaudeJSONL. 
func ParseClaudeTurns(data []byte) ([]ClaudeTurn, error) { scanner := bufio.NewScanner(bytes.NewReader(data)) scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) - seen := map[string]bool{} + index := map[string]int{} // message id -> position in turns + // seenTool dedups tool_use blocks by their unique block id within a message, so a + // repeated delta (the same tool_use carried again) does not double-count, while a + // genuinely later, additive tool_use (a different block id) is merged in. + seenTool := map[string]bool{} var turns []ClaudeTurn lineNo := 0 for scanner.Scan() { @@ -168,16 +176,26 @@ func ParseClaudeTurns(data []byte) ([]ClaudeTurn, error) { if id == "" { id = fmt.Sprintf("line-%d", lineNo) } - if seen[id] { - continue - } - seen[id] = true var names []string for _, block := range msg.Content { - if block.Type == "tool_use" { - names = append(names, block.Name) + if block.Type != "tool_use" { + continue + } + toolKey := id + ":" + block.ID + if block.ID != "" && seenTool[toolKey] { + continue } + seenTool[toolKey] = true + names = append(names, block.Name) + } + if pos, ok := index[id]; ok { + // A later delta of a message already seen: merge its NEW tool_use names + // (the per-block dedup above keeps a repeated delta from double-counting). + // Usage is identical across deltas, so the first-delta usage is kept. + turns[pos].ToolNames = append(turns[pos].ToolNames, names...) 
+ continue } + index[id] = len(turns) turns = append(turns, ClaudeTurn{ID: id, Usage: msg.Usage, ToolNames: names}) } if err := scanner.Err(); err != nil { diff --git a/internal/journeymetrics/claude_test.go b/internal/journeymetrics/claude_test.go index 218bdcb0..46625565 100644 --- a/internal/journeymetrics/claude_test.go +++ b/internal/journeymetrics/claude_test.go @@ -127,6 +127,57 @@ func TestParseClaudeTurnsPreservesPerTurnContextAndDedupes(t *testing.T) { } } +func TestParseClaudeTurnsMergesToolUseAcrossDeltas(t *testing.T) { + // Real runner streams are MULTI-DELTA: the same message id appears on several + // `assistant` rows, the first delta carries a `thinking` block with no tool_use, + // and the tool_use block lands on a LATER delta. The per-delta `usage` is + // identical across deltas. ParseClaudeTurns must MERGE the later-delta tool_use + // names into the turn — taking only the first delta (and skipping the rest) drops + // the tool_use entirely, which would make assertNoTeamCreateBeforeGreet blind to a + // TeamCreate (a hollow lazy-TeamCreate proof). Shape mirrors the committed real + // captures (testdata/sonnet_teamdelete_hang.stream.jsonl). + stream := `{"type":"assistant","message":{"id":"msg_x","model":"claude-sonnet-4-6","usage":{"input_tokens":50,"cache_read_input_tokens":1000,"cache_creation_input_tokens":89000},"content":[{"type":"thinking","thinking":"deciding to create a team"}]}} +{"type":"assistant","message":{"id":"msg_x","model":"claude-sonnet-4-6","usage":{"input_tokens":50,"cache_read_input_tokens":1000,"cache_creation_input_tokens":89000},"content":[{"type":"tool_use","id":"toolu_tc","name":"TeamCreate","input":{"team_name":"eager"}}]}}` + + turns, err := ParseClaudeTurns([]byte(stream)) + if err != nil { + t.Fatalf("ParseClaudeTurns: %v", err) + } + if len(turns) != 1 { + t.Fatalf("turns = %d, want 1 (the two deltas are one message)", len(turns)) + } + // The tool_use lands on the SECOND delta — it must still be surfaced. 
+ if len(turns[0].ToolNames) != 1 || turns[0].ToolNames[0] != "TeamCreate" { + t.Fatalf("turn[0].ToolNames = %v, want [TeamCreate] (merged from the later delta) — taking only the first delta loses it", turns[0].ToolNames) + } + // Usage is consistent across deltas; the merged turn keeps it. + if turns[0].Usage.CacheCreation != 89000 { + t.Errorf("turn[0].Usage.CacheCreation = %d, want 89000 (consistent across deltas)", turns[0].Usage.CacheCreation) + } +} + +func TestParseClaudeTurnsRealMultiDeltaCaptureSurfacesTeamCalls(t *testing.T) { + // Drive the REAL captured multi-delta stream: it contains TeamCreate and + // TeamDelete tool_use blocks, each on a later delta of its message. The buggy + // first-delta-only parse would surface NEITHER; the merge must surface BOTH. + turns, err := ParseClaudeTurns(readTestdata(t, "claude_multidelta_team.stream.jsonl")) + if err != nil { + t.Fatalf("ParseClaudeTurns: %v", err) + } + saw := map[string]bool{} + for _, turn := range turns { + for _, name := range turn.ToolNames { + saw[name] = true + } + } + if !saw["TeamCreate"] { + t.Error("the real multi-delta capture's TeamCreate (on a later delta) was not surfaced — the parser still drops later-delta tool_use") + } + if !saw["TeamDelete"] { + t.Error("the real multi-delta capture's TeamDelete (on a later delta) was not surfaced") + } +} + func readTestdata(t *testing.T, name string) []byte { t.Helper() data, err := os.ReadFile(filepath.Join("testdata", name)) diff --git a/internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl b/internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl new file mode 100644 index 00000000..0953686e --- /dev/null +++ b/internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl @@ -0,0 +1,7 @@ +{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_0118mye2TWrcQhWTFK71dJPY","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"The user wants me to drive the workflow. 
I need to load the `spacedock:first-officer` skill to understand my operating contract, then begin the Startup procedure.","signature":"EusCCmUIDhgCKkCd2/F3/jaFZcIgecrsFzRv3saq5kzzOxi26TDnzYr5bhAVXzulgBEJdiD6WEE4p4JxaZ7fWQ5L30AfvWon9I3zMhFjbGF1ZGUtc29ubmV0LTQtNjgAQgh0aGlua2luZxIMUN4zD+3FLxYpsHOdGgx7raEtf/YKkTJ77vEiMCpiwTJaRpKxTB1jFsYmvXeDfs4tRu/jNHJxcnqtbzu0tsuIXIoxefED1gmWt7H41SqzASBhw0BCSoZaTzvDvd0aCKzsKJ50ZyrcWpWQ9fsYAfD3JXQoujFU5KEMg/qlUUx9iDbizb5F6JFqjJAGfqvGM+4vXGvf2PTstX5aoVlV8aWt7YLF5WJIXylLclXFTa4E4fuHfe627byNcxW43G8qwih386dWswyaM1ON0aT5BxzFY41P8PfqBYboZwhwc7CMcTBrFW3JiX8nA+K2w7li69Ht17qlJo8YPZQTB1hO6drWWU6yGAE="}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":18463,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"466798e4-d180-4aa5-bfea-b47a34dc113f","request_id":"req_011CbfnY9snFoEBU7eJFA4XT"} +{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_0118mye2TWrcQhWTFK71dJPY","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_012CxbkN9C4cimS5pyFmALEV","name":"Skill","input":{"skill":"spacedock:first-officer"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":18463,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"007e29e4-6c1c-48bf-b1a0-c7d943601d8d","request_id":"req_011CbfnY9snFoEBU7eJFA4XT"} 
+{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01RvXzZwwGSE3Hw5goPUJ6nC","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"TeamCreate is available. Now I need to:\n1. Derive the project name: `basename /tmp/TestLiveEnsignCycle1166216625/002` = `002`\n2. Workflow dir basename: `002`\n3. Timestamp: `20260603-0538`\n4. Shortuuid: 8 lowercase alphanumeric chars - I'll generate something like `a1b2c3d4`\n\nTeam name: `002-002-20260603-0538-{shortuuid}`\n\nI also need to run the reconcile sweep and discover standing teammates. But first, let me create the team.\n\n\nI'm confirming the project name and workflow directory both resolve to `002`, so the team name will be `002-002-20260603-0538-r4k9m2xz`.","signature":"EtcICmUIDhgCKkAeUaB/JORi8aDD+uKNgPTmjIkDvHnm41qeE+AstFubcLte7BlpVGF9kO7KgHNVtFxcVkfvyoABp1OPkjv+qdJSMhFjbGF1ZGUtc29ubmV0LTQtNjgAQgh0aGlua2luZxIMbYnmMA9aV/zJ5SQ6GgynIv2dcmmWjZu00s0iMNeIfznGTp/w6R3tzi6Og9muEEyq07hC24+P8UX1qx+jQvHqsMRbOVuQy6KUL3mASSqfB8MgT2asHDgx8qrsPq0rXA+zdbI/I17Oqo6gEc1cgKFGB4OmUWYu/GRM48LjC5UEBb3WScU8+jv6TLkI//ZFndGLrCcMc6xRlnflW4bG9H6TPDBQuIQ519myFRaYAjVWskRqQND6bnz3UjnCMpJvw2UvQyVnNb6vjKbj435sTFO310vra2AnZKOKeuH5D1R6kx2QUI26wM+2VuKmRQdxA18K0SnuVFnp3ncmeiaETiYQazFBVxx5YiCLlLJwjgb7UR3eLq8qZDRbauV2Vt9Grg10oxC5aKEB4ibO125x6HR4Bg+DicVAVcWt5P0im3w2YeINHCT2hhxcGFUEEgHfhHSkJtzVDKh9CzgYybyadC049hzQf/ch5qqYDHZVduFFoC94Z1PH67+F/6OYCMHMWokFgkEA6Rte9/estVFgxsLKa7PjlB6pZ6h7WMcuDtvlLGhK6SHjELZsTSxIIPW2whsBRp0dbHteJR4Fvds+q+wk1jE5+/w8YZdbXFVXhaVC+Ts20FLf8+xNosiE9Sobgi5peOpHfN1sz1IyncVpoF9qTbriI5psZIsogGlqHjmpZZldVktIospaL/Xz2EftMKv7HNDG57dp051z4q0qemf0e5zbH5pBT5aGMcpNVutPgjeX5bL1+0EuXfSyae4J5WzA9ChWpmWSmKYolXsQXKfFMUBQ7mNz7lnJeDROvJFcG6rhgbXMxTQCXk0MG5C1NqjC+C1eHx/t0xI8DAebP0On31/w7Fr7ERdbppLq0I1ptt437p3oJQT1gVE1e//MCZO6meDACo7jpmFKU8DjCnROFtggkqlLDkWGIbu67MREDefTU1lfhYg5dzzwt4rAZwNJq2Jw0I7kmb4cNk7zL5r2htWUoS7Pp6y+ld7JBriUgqCWGlkqYXqyM1ZD9o7t/Ioi7b5LRPAdMgSP0K/ajtBmZDbS1f+wI4X1dao/FUqFk4uRyLC/x7quaUoeAPh0Pdw
Vo6ULCEdQ2cmp6e4itNM+PFgPfse8/gFDtOxGhz+7Pjk/HhP0vtaofQGmF75ky/42nrpF6I3rSvRHfQydScMAJWX9izZde8IDlX3uhJRbRXZMBMoQ3p5ac+xjmi28f0Esv/iJALwiXY9Ha89bUYbvc82U5cYjBaPts0aaVn6BNB8srd0IFsek9I+n/niBbWxayfr+Ri859OzeaU85REVBE+LkdzadI758ZAcxlJgmYBVYikJy6+YkGW5cEneD7RgB"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":2505,"cache_read_input_tokens":43402,"cache_creation":{"ephemeral_5m_input_tokens":2505,"ephemeral_1h_input_tokens":0},"output_tokens":6,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"e648228b-b392-4085-af3a-4acf896e62f7","request_id":"req_011CbfnbEXuevkT7kyjNxeUz"} +{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01RvXzZwwGSE3Hw5goPUJ6nC","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01QuuYBWsYE8riwUhCqWfyPh","name":"TeamCreate","input":{"team_name":"002-002-20260603-0538-r4k9m2xz","description":"Fixture Workflow — first officer team"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":2505,"cache_read_input_tokens":43402,"cache_creation":{"ephemeral_5m_input_tokens":2505,"ephemeral_1h_input_tokens":0},"output_tokens":6,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"ce02da18-41d7-4347-9976-63bc59cb0fa9","request_id":"req_011CbfnbEXuevkT7kyjNxeUz"} +{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_015hcHVgaiZVy2pCYGb7PVou","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"Now I need to clean up the team. The ensign was already sent a shutdown_request and responded with \"Done:\". 
Let me now TeamDelete to clean up.","signature":"EtoCCmUIDhgCKkBUNsh0TBoxwOJvPhr+ayAAsu3h51VWBkYA7jvWiS0g5NXUOyO2Zht1aTGf04vMmfly8wwrcrS0P6XijbDjePpEMhFjbGF1ZGUtc29ubmV0LTQtNjgAQgh0aGlua2luZxIMe4/gktQhSqVq0Q+BGgw2lkmmsz1ABp2gEuEiMLIFTLu4kLXHr+M5+K0sXOUXOw/KG7BxhKl0H1H9qeegr4Z0NgUXUmkuF88DKG7q0SqiAcHd2Kk6ni/eYJHQY0U3xnXJ2qS4wM1uqWFUL0MnNyFepPwm3hgQiEV1QOpjJRGY3in1g8m464AOBCKp3MmYlS7R2uzN0+8/yaSoNmWylB9Yjqtyp8qZSQVXIIqPlohE/j2GI4Pq2YlZOoMzlMQTd0WqWAWQxybGzYNrfd5FkBZ/8krS+cXPIdZwRfbvLjDnOS6LysETQW19/EMGq5NZQcfn0xgB"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":243,"cache_read_input_tokens":57235,"cache_creation":{"ephemeral_5m_input_tokens":243,"ephemeral_1h_input_tokens":0},"output_tokens":22,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"3f413c15-175a-4937-9aef-6fd1e7f75342","request_id":"req_011Cbfnq9AqkFxqAXLbF56yY"} +{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_015hcHVgaiZVy2pCYGb7PVou","type":"message","role":"assistant","content":[{"type":"text","text":"Entity archived. 
Now tearing down the team."}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":243,"cache_read_input_tokens":57235,"cache_creation":{"ephemeral_5m_input_tokens":243,"ephemeral_1h_input_tokens":0},"output_tokens":22,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"a97bc90d-d534-452f-b04a-9e46ccd3c700","request_id":"req_011Cbfnq9AqkFxqAXLbF56yY"} +{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_015hcHVgaiZVy2pCYGb7PVou","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_012YqWtc8LFWm8PzcNfwjJN7","name":"TeamDelete","input":{},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":243,"cache_read_input_tokens":57235,"cache_creation":{"ephemeral_5m_input_tokens":243,"ephemeral_1h_input_tokens":0},"output_tokens":22,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"58dae075-1524-485f-89ce-f4c5f6945ac4","request_id":"req_011Cbfnq9AqkFxqAXLbF56yY"} From 8328582e2500fcdc86d6a1dd4dfb6f28d7c6e300 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 17:48:19 -0700 Subject: [PATCH 17/18] test(j9): pin the AC-2 parser fix to the validator-named real hang capture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the committed real-runner stream sonnet_teamdelete_hang.stream.jsonl (the validator's verified ready oracle: 20/27 message ids multi-delta; its TeamCreate and TeamDelete each on a non-first delta) as the parser regression — the FIXED ParseClaudeTurns surfaces TeamCreate=true where the pre-fix first-delta-only parse reported false across all 27 turns. 
Driving the FULL committed fixture pins the fix to the exact stream the forensics verified the defect on; drop the redundant trimmed copy + its journeymetrics test (superseded by the full-fixture regression). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../shallow_boot_measure_unit_test.go | 31 +++++++++++++++++++ internal/journeymetrics/claude_test.go | 22 ------------- .../claude_multidelta_team.stream.jsonl | 7 ----- 3 files changed, 31 insertions(+), 29 deletions(-) delete mode 100644 internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl diff --git a/internal/ensigncycle/shallow_boot_measure_unit_test.go b/internal/ensigncycle/shallow_boot_measure_unit_test.go index d438e027..c5949b2b 100644 --- a/internal/ensigncycle/shallow_boot_measure_unit_test.go +++ b/internal/ensigncycle/shallow_boot_measure_unit_test.go @@ -70,6 +70,37 @@ func TestAssertNoTeamCreateBeforeGreetCatchesLaterDeltaTeamCreate(t *testing.T) } } +// TestParserExtractsTeamCallsFromRealHangCapture is the validator-named ready +// oracle: the committed real-runner stream `sonnet_teamdelete_hang.stream.jsonl` +// (20/27 message ids multi-delta; its lone TeamCreate and TeamDelete each land on a +// non-first delta) must surface BOTH team calls through the fixed ParseClaudeTurns. +// Against the pre-fix first-delta-only parse this reported TeamCreate=false across +// all 27 turns (the proven false-pass); the merge reports TeamCreate=true. Driving +// the FULL committed fixture (not a trimmed copy) pins the fix to the exact stream +// the forensics verified the defect on. 
+func TestParserExtractsTeamCallsFromRealHangCapture(t *testing.T) { + data, err := os.ReadFile(filepath.Join("testdata", "sonnet_teamdelete_hang.stream.jsonl")) + if err != nil { + t.Fatal(err) + } + turns, err := journeymetrics.ParseClaudeTurns(data) + if err != nil { + t.Fatalf("ParseClaudeTurns: %v", err) + } + saw := map[string]bool{} + for _, turn := range turns { + for _, name := range turn.ToolNames { + saw[name] = true + } + } + if !saw["TeamCreate"] { + t.Error("the real hang capture's TeamCreate (on a non-first delta) was not surfaced — the parser is still first-delta-only (the AC-2 false-pass)") + } + if !saw["TeamDelete"] { + t.Error("the real hang capture's TeamDelete (on a non-first delta) was not surfaced") + } +} + // TestShallowBootMeasureSignalsAreIndependent isolates the two AC-6 signals so // neither can be silently dropped: a stream that fails ONLY the ceiling check (a // heavy greet, no spike) and a stream that fails ONLY the spike check (a pre-greet diff --git a/internal/journeymetrics/claude_test.go b/internal/journeymetrics/claude_test.go index 46625565..df782bae 100644 --- a/internal/journeymetrics/claude_test.go +++ b/internal/journeymetrics/claude_test.go @@ -156,28 +156,6 @@ func TestParseClaudeTurnsMergesToolUseAcrossDeltas(t *testing.T) { } } -func TestParseClaudeTurnsRealMultiDeltaCaptureSurfacesTeamCalls(t *testing.T) { - // Drive the REAL captured multi-delta stream: it contains TeamCreate and - // TeamDelete tool_use blocks, each on a later delta of its message. The buggy - // first-delta-only parse would surface NEITHER; the merge must surface BOTH. 
- turns, err := ParseClaudeTurns(readTestdata(t, "claude_multidelta_team.stream.jsonl")) - if err != nil { - t.Fatalf("ParseClaudeTurns: %v", err) - } - saw := map[string]bool{} - for _, turn := range turns { - for _, name := range turn.ToolNames { - saw[name] = true - } - } - if !saw["TeamCreate"] { - t.Error("the real multi-delta capture's TeamCreate (on a later delta) was not surfaced — the parser still drops later-delta tool_use") - } - if !saw["TeamDelete"] { - t.Error("the real multi-delta capture's TeamDelete (on a later delta) was not surfaced") - } -} - func readTestdata(t *testing.T, name string) []byte { t.Helper() data, err := os.ReadFile(filepath.Join("testdata", name)) diff --git a/internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl b/internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl deleted file mode 100644 index 0953686e..00000000 --- a/internal/journeymetrics/testdata/claude_multidelta_team.stream.jsonl +++ /dev/null @@ -1,7 +0,0 @@ -{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_0118mye2TWrcQhWTFK71dJPY","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"The user wants me to drive the workflow. 
I need to load the `spacedock:first-officer` skill to understand my operating contract, then begin the Startup procedure.","signature":"EusCCmUIDhgCKkCd2/F3/jaFZcIgecrsFzRv3saq5kzzOxi26TDnzYr5bhAVXzulgBEJdiD6WEE4p4JxaZ7fWQ5L30AfvWon9I3zMhFjbGF1ZGUtc29ubmV0LTQtNjgAQgh0aGlua2luZxIMUN4zD+3FLxYpsHOdGgx7raEtf/YKkTJ77vEiMCpiwTJaRpKxTB1jFsYmvXeDfs4tRu/jNHJxcnqtbzu0tsuIXIoxefED1gmWt7H41SqzASBhw0BCSoZaTzvDvd0aCKzsKJ50ZyrcWpWQ9fsYAfD3JXQoujFU5KEMg/qlUUx9iDbizb5F6JFqjJAGfqvGM+4vXGvf2PTstX5aoVlV8aWt7YLF5WJIXylLclXFTa4E4fuHfe627byNcxW43G8qwih386dWswyaM1ON0aT5BxzFY41P8PfqBYboZwhwc7CMcTBrFW3JiX8nA+K2w7li69Ht17qlJo8YPZQTB1hO6drWWU6yGAE="}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":18463,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"466798e4-d180-4aa5-bfea-b47a34dc113f","request_id":"req_011CbfnY9snFoEBU7eJFA4XT"} -{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_0118mye2TWrcQhWTFK71dJPY","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_012CxbkN9C4cimS5pyFmALEV","name":"Skill","input":{"skill":"spacedock:first-officer"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":0,"cache_read_input_tokens":18463,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":0,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"007e29e4-6c1c-48bf-b1a0-c7d943601d8d","request_id":"req_011CbfnY9snFoEBU7eJFA4XT"} 
-{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01RvXzZwwGSE3Hw5goPUJ6nC","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"TeamCreate is available. Now I need to:\n1. Derive the project name: `basename /tmp/TestLiveEnsignCycle1166216625/002` = `002`\n2. Workflow dir basename: `002`\n3. Timestamp: `20260603-0538`\n4. Shortuuid: 8 lowercase alphanumeric chars - I'll generate something like `a1b2c3d4`\n\nTeam name: `002-002-20260603-0538-{shortuuid}`\n\nI also need to run the reconcile sweep and discover standing teammates. But first, let me create the team.\n\n\nI'm confirming the project name and workflow directory both resolve to `002`, so the team name will be `002-002-20260603-0538-r4k9m2xz`.","signature":"EtcICmUIDhgCKkAeUaB/JORi8aDD+uKNgPTmjIkDvHnm41qeE+AstFubcLte7BlpVGF9kO7KgHNVtFxcVkfvyoABp1OPkjv+qdJSMhFjbGF1ZGUtc29ubmV0LTQtNjgAQgh0aGlua2luZxIMbYnmMA9aV/zJ5SQ6GgynIv2dcmmWjZu00s0iMNeIfznGTp/w6R3tzi6Og9muEEyq07hC24+P8UX1qx+jQvHqsMRbOVuQy6KUL3mASSqfB8MgT2asHDgx8qrsPq0rXA+zdbI/I17Oqo6gEc1cgKFGB4OmUWYu/GRM48LjC5UEBb3WScU8+jv6TLkI//ZFndGLrCcMc6xRlnflW4bG9H6TPDBQuIQ519myFRaYAjVWskRqQND6bnz3UjnCMpJvw2UvQyVnNb6vjKbj435sTFO310vra2AnZKOKeuH5D1R6kx2QUI26wM+2VuKmRQdxA18K0SnuVFnp3ncmeiaETiYQazFBVxx5YiCLlLJwjgb7UR3eLq8qZDRbauV2Vt9Grg10oxC5aKEB4ibO125x6HR4Bg+DicVAVcWt5P0im3w2YeINHCT2hhxcGFUEEgHfhHSkJtzVDKh9CzgYybyadC049hzQf/ch5qqYDHZVduFFoC94Z1PH67+F/6OYCMHMWokFgkEA6Rte9/estVFgxsLKa7PjlB6pZ6h7WMcuDtvlLGhK6SHjELZsTSxIIPW2whsBRp0dbHteJR4Fvds+q+wk1jE5+/w8YZdbXFVXhaVC+Ts20FLf8+xNosiE9Sobgi5peOpHfN1sz1IyncVpoF9qTbriI5psZIsogGlqHjmpZZldVktIospaL/Xz2EftMKv7HNDG57dp051z4q0qemf0e5zbH5pBT5aGMcpNVutPgjeX5bL1+0EuXfSyae4J5WzA9ChWpmWSmKYolXsQXKfFMUBQ7mNz7lnJeDROvJFcG6rhgbXMxTQCXk0MG5C1NqjC+C1eHx/t0xI8DAebP0On31/w7Fr7ERdbppLq0I1ptt437p3oJQT1gVE1e//MCZO6meDACo7jpmFKU8DjCnROFtggkqlLDkWGIbu67MREDefTU1lfhYg5dzzwt4rAZwNJq2Jw0I7kmb4cNk7zL5r2htWUoS7Pp6y+ld7JBriUgqCWGlkqYXqyM1ZD9o7t/Ioi7b5LRPAdMgSP0K/ajtBmZDbS1f+wI4X1dao/FUqFk4uRyLC/x7quaUoeAPh0Pdw
Vo6ULCEdQ2cmp6e4itNM+PFgPfse8/gFDtOxGhz+7Pjk/HhP0vtaofQGmF75ky/42nrpF6I3rSvRHfQydScMAJWX9izZde8IDlX3uhJRbRXZMBMoQ3p5ac+xjmi28f0Esv/iJALwiXY9Ha89bUYbvc82U5cYjBaPts0aaVn6BNB8srd0IFsek9I+n/niBbWxayfr+Ri859OzeaU85REVBE+LkdzadI758ZAcxlJgmYBVYikJy6+YkGW5cEneD7RgB"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":2505,"cache_read_input_tokens":43402,"cache_creation":{"ephemeral_5m_input_tokens":2505,"ephemeral_1h_input_tokens":0},"output_tokens":6,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"e648228b-b392-4085-af3a-4acf896e62f7","request_id":"req_011CbfnbEXuevkT7kyjNxeUz"} -{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_01RvXzZwwGSE3Hw5goPUJ6nC","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01QuuYBWsYE8riwUhCqWfyPh","name":"TeamCreate","input":{"team_name":"002-002-20260603-0538-r4k9m2xz","description":"Fixture Workflow — first officer team"},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":2505,"cache_read_input_tokens":43402,"cache_creation":{"ephemeral_5m_input_tokens":2505,"ephemeral_1h_input_tokens":0},"output_tokens":6,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"ce02da18-41d7-4347-9976-63bc59cb0fa9","request_id":"req_011CbfnbEXuevkT7kyjNxeUz"} -{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_015hcHVgaiZVy2pCYGb7PVou","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"Now I need to clean up the team. The ensign was already sent a shutdown_request and responded with \"Done:\". 
Let me now TeamDelete to clean up.","signature":"EtoCCmUIDhgCKkBUNsh0TBoxwOJvPhr+ayAAsu3h51VWBkYA7jvWiS0g5NXUOyO2Zht1aTGf04vMmfly8wwrcrS0P6XijbDjePpEMhFjbGF1ZGUtc29ubmV0LTQtNjgAQgh0aGlua2luZxIMe4/gktQhSqVq0Q+BGgw2lkmmsz1ABp2gEuEiMLIFTLu4kLXHr+M5+K0sXOUXOw/KG7BxhKl0H1H9qeegr4Z0NgUXUmkuF88DKG7q0SqiAcHd2Kk6ni/eYJHQY0U3xnXJ2qS4wM1uqWFUL0MnNyFepPwm3hgQiEV1QOpjJRGY3in1g8m464AOBCKp3MmYlS7R2uzN0+8/yaSoNmWylB9Yjqtyp8qZSQVXIIqPlohE/j2GI4Pq2YlZOoMzlMQTd0WqWAWQxybGzYNrfd5FkBZ/8krS+cXPIdZwRfbvLjDnOS6LysETQW19/EMGq5NZQcfn0xgB"}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":243,"cache_read_input_tokens":57235,"cache_creation":{"ephemeral_5m_input_tokens":243,"ephemeral_1h_input_tokens":0},"output_tokens":22,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"3f413c15-175a-4937-9aef-6fd1e7f75342","request_id":"req_011Cbfnq9AqkFxqAXLbF56yY"} -{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_015hcHVgaiZVy2pCYGb7PVou","type":"message","role":"assistant","content":[{"type":"text","text":"Entity archived. 
Now tearing down the team."}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":243,"cache_read_input_tokens":57235,"cache_creation":{"ephemeral_5m_input_tokens":243,"ephemeral_1h_input_tokens":0},"output_tokens":22,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"a97bc90d-d534-452f-b04a-9e46ccd3c700","request_id":"req_011Cbfnq9AqkFxqAXLbF56yY"} -{"type":"assistant","message":{"model":"claude-sonnet-4-6","id":"msg_015hcHVgaiZVy2pCYGb7PVou","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_012YqWtc8LFWm8PzcNfwjJN7","name":"TeamDelete","input":{},"caller":{"type":"direct"}}],"stop_reason":null,"stop_sequence":null,"stop_details":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":243,"cache_read_input_tokens":57235,"cache_creation":{"ephemeral_5m_input_tokens":243,"ephemeral_1h_input_tokens":0},"output_tokens":22,"service_tier":"standard","inference_geo":"global"},"diagnostics":null,"context_management":null},"parent_tool_use_id":null,"session_id":"240470ca-8e5a-4cb2-9feb-410e103437ac","uuid":"58dae075-1524-485f-89ce-f4c5f6945ac4","request_id":"req_011Cbfnq9AqkFxqAXLbF56yY"} From b2d7d6ed877300905ab6d4e68cce336ab8940575 Mon Sep 17 00:00:00 2001 From: CL Kao Date: Sat, 13 Jun 2026 21:31:11 -0700 Subject: [PATCH 18/18] test(j9): scrub CI repo-naming env from live child + loud wrong-root detector (PR #365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR #365 opus runtime-live-e2e failed TestLiveEnsignCycle at TeamCreate because isolatedClaudeEnv's CI path passed the whole os.Environ() through to the child FO — including GITHUB_WORKSPACE (= the real spacedock checkout). 
That lured the FO to cd into the real repo and boot its docs/dev workflow instead of the test's tmpdir fixture; it found nothing dispatchable and (correctly, per the lazy contract) greeted-and-stopped, surfacing only as a confusing pre-TeamCreate timeout. A test-harness env leak, not a contract defect — real `spacedock claude` use has no such CI var. - cleanEnviron now drops the GITHUB_*/RUNNER_* family (isCIRepoNamingVar) so both Claude live lanes (and Codex, which shares cleanEnviron) reproduce a production-clean child env. ANTHROPIC_API_KEY, CLAUDE_CONFIG_DIR (resolved before the child env is built), and PATH survive — verified by the existing config-dir/credential tests staying green. - detectWrongRootBoot: a pure, model-agnostic detector that names the expected fixture root vs the wandered-to path, keyed on cd-off-fixture / --workflow-dir-outside / workflow-README-read-outside (a contract-skill Read from --plugin-dir is NOT flagged). Wired into TestLiveEnsignCycle and the shared Claude runner so a future leak fails loud and early. Test-only; zero skills/** touched (captain: test-faithful env fix, no FO-contract change). Offline gate `go test ./...` exit 0. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ensigncycle/claude_live_runner_test.go | 14 ++ internal/ensigncycle/live_test.go | 9 ++ internal/ensigncycle/liveenv_scrub_test.go | 72 ++++++++++ internal/ensigncycle/liveenv_test.go | 26 +++- internal/ensigncycle/streamwatch_test.go | 6 + .../wrong_root_detect_impl_test.go | 128 ++++++++++++++++++ .../ensigncycle/wrong_root_detect_test.go | 125 +++++++++++++++++ 7 files changed, 376 insertions(+), 4 deletions(-) create mode 100644 internal/ensigncycle/liveenv_scrub_test.go create mode 100644 internal/ensigncycle/wrong_root_detect_impl_test.go create mode 100644 internal/ensigncycle/wrong_root_detect_test.go diff --git a/internal/ensigncycle/claude_live_runner_test.go b/internal/ensigncycle/claude_live_runner_test.go index e336bfad..2cf7904f 100644 --- a/internal/ensigncycle/claude_live_runner_test.go +++ b/internal/ensigncycle/claude_live_runner_test.go @@ -373,6 +373,20 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work if writeErr := os.WriteFile(streamPath, []byte(stream), 0o644); writeErr != nil { t.Fatal(writeErr) } + + // A wrong-root boot is the most specific diagnosis on any failure path: a CI env + // leak lures the FO off workflowRoot, it boots the real repo, finds nothing + // dispatchable, and greets-and-stops — surfacing otherwise only as an opaque + // no-progress stall (when it idles) or as every scenario assertion silently + // running against the wrong state (when it completes cleanly). Name it FIRST so + // the leak fails legibly with expected-fixture vs wandered-to, ahead of the + // generic stall message or the downstream assertions. 
+ if wrongRoot := detectWrongRootBoot(stream, workflowRoot); wrongRoot != nil { + if stallErr != nil { + t.Fatalf("%v\nUnderlying stall: %v\nArtifacts: %s", wrongRoot, stallErr, artifactDir) + } + t.Fatalf("%v\nArtifacts: %s", wrongRoot, artifactDir) + } if stallErr != nil { t.Fatalf("%v\nArtifacts: %s", stallErr, artifactDir) } diff --git a/internal/ensigncycle/live_test.go b/internal/ensigncycle/live_test.go index a2bfdc83..ad5d43a0 100644 --- a/internal/ensigncycle/live_test.go +++ b/internal/ensigncycle/live_test.go @@ -174,6 +174,15 @@ func TestLiveEnsignCycle(t *testing.T) { // than a silent one (which the quiet budget does catch). Documenting it here // keeps the trade-off explicit instead of silently absent. if _, err := watcher.expect(isTeamCreate, quietBudgetDefault, "TeamCreate"); err != nil { + // A pre-TeamCreate failure is opaque on its own (the FO "exited before + // TeamCreate matched"). The most common cause is a wrong-root boot: a CI env + // leak lures the FO off `root` into the real repo, it boots that workflow, + // finds nothing dispatchable, and greets-and-stops. Surface that explicitly + // — naming the expected fixture root vs the wandered-to path — so the leak + // fails legibly instead of as a confusing timeout. 
+ if wrongRoot := detectWrongRootBoot(watcher.fullTranscript(), root); wrongRoot != nil { + t.Fatalf("live cycle failed at TeamCreate due to a wrong-root boot: %v\nUnderlying watcher error: %v", wrongRoot, err) + } t.Fatalf("live cycle failed at TeamCreate: %v", err) } if err := watcher.expectDispatchClose(quietBudgetDefault, "dispatch close"); err != nil { diff --git a/internal/ensigncycle/liveenv_scrub_test.go b/internal/ensigncycle/liveenv_scrub_test.go new file mode 100644 index 00000000..ee48bcdd --- /dev/null +++ b/internal/ensigncycle/liveenv_scrub_test.go @@ -0,0 +1,72 @@ +// ABOUTME: Offline test that the live-cycle child env scrubs the CI repo-naming +// ABOUTME: vars (GITHUB_*/RUNNER_*) that lure the FO off its launch cwd (no model). +package ensigncycle + +import ( + "testing" +) + +// ciRepoNamingVars are the CI-runner-injected GITHUB_*/RUNNER_* vars the live-test +// child env must NOT carry. On a GitHub Actions runner os.Environ() includes the +// whole family; GITHUB_WORKSPACE (= /home/runner/work/spacedock/spacedock, the REAL +// repo) is the proven lure that made the FO `cd` away from its launch cwd and boot +// the real docs/dev workflow instead of its tmpdir fixture (PR #365 opus +// TestLiveEnsignCycle). In real `spacedock claude` use the cwd IS the project and no +// such CI var exists, so scrubbing the family reproduces a production-clean child +// env. The list is the GITHUB_*/RUNNER_* names a runner sets that name or point at +// the real repo / the runner workspace; ANTHROPIC_API_KEY, CLAUDE_CONFIG_DIR, and +// PATH are NOT in it (they are the credential / archive / launcher the test needs). 
+var ciRepoNamingVars = []string{ + "GITHUB_WORKSPACE", + "GITHUB_REPOSITORY", + "GITHUB_REPOSITORY_OWNER", + "GITHUB_ACTION_PATH", + "GITHUB_SERVER_URL", + "RUNNER_WORKSPACE", + "RUNNER_TEMP", +} + +// TestCleanEnvironScrubsCIRepoNamingVars asserts cleanEnviron drops every +// GITHUB_*/RUNNER_* repo-naming var from the parent env, so the live FO subprocess +// never sees GITHUB_WORKSPACE (or its family) and cannot be lured off its launch +// cwd. It seeds the parent env with the family, builds the child env via +// cleanEnviron, and asserts none survive. +func TestCleanEnvironScrubsCIRepoNamingVars(t *testing.T) { + for _, key := range ciRepoNamingVars { + t.Setenv(key, "/home/runner/work/spacedock/spacedock") + } + + env := cleanEnviron("CLAUDECODE", "HOME", "CLAUDE_CONFIG_DIR") + + for _, key := range ciRepoNamingVars { + if v, ok := envValue(env, key); ok { + t.Errorf("cleanEnviron leaked CI repo-naming var %s=%q — it lures the FO off its launch cwd", key, v) + } + } +} + +// TestIsolatedClaudeEnvScrubsCIRepoNamingVars asserts the same scrub through the +// concrete child env both Claude live lanes build (TestLiveEnsignCycle and +// TestLiveClaudeSharedScenarios). It covers the API-key (CI) auth path — the path +// the leaked CI env actually rides on — and confirms the credential the test needs +// survives the scrub. +func TestIsolatedClaudeEnvScrubsCIRepoNamingVars(t *testing.T) { + fakeHome := t.TempDir() // no benchmark-token -> API-key (CI) path + t.Setenv("ANTHROPIC_API_KEY", "sk-ci-api-key") + t.Setenv("CLAUDECODE", "1") + for _, key := range ciRepoNamingVars { + t.Setenv(key, "/home/runner/work/spacedock/spacedock") + } + + env := isolatedClaudeEnv(t, fakeHome) + + for _, key := range ciRepoNamingVars { + if v, ok := envValue(env, key); ok { + t.Errorf("isolatedClaudeEnv leaked CI repo-naming var %s=%q to the child FO", key, v) + } + } + // The credential the CI auth path needs MUST survive the scrub. 
+ if key, ok := envValue(env, "ANTHROPIC_API_KEY"); !ok || key != "sk-ci-api-key" { + t.Errorf("ANTHROPIC_API_KEY = %q (present=%v), want it to survive the scrub", key, ok) + } +} diff --git a/internal/ensigncycle/liveenv_test.go b/internal/ensigncycle/liveenv_test.go index 26868b89..05969de6 100644 --- a/internal/ensigncycle/liveenv_test.go +++ b/internal/ensigncycle/liveenv_test.go @@ -65,9 +65,16 @@ func decideClaudeEnv(realHome, apiKey string) claudeEnvDecision { return claudeEnvDecision{mode: authNone} } -// cleanEnviron returns os.Environ() filtered to drop the keys in drop. It is the -// port of the Python _clean_env (strip CLAUDECODE so the child can launch -// claude); the live path also drops/overrides HOME and the credential keys. +// cleanEnviron returns os.Environ() filtered to drop the keys in drop AND the CI +// repo-naming families (isCIRepoNamingVar). It is the port of the Python _clean_env +// (strip CLAUDECODE so the child can launch claude); the live path also +// drops/overrides HOME and the credential keys. The CI scrub keeps the child FO +// subprocess in a production-clean environment: a GitHub Actions runner injects +// GITHUB_WORKSPACE (= the real spacedock checkout) and the rest of the +// GITHUB_*/RUNNER_* family, and GITHUB_WORKSPACE lured the FO to `cd` into the real +// repo and boot its docs/dev workflow instead of the test's tmpdir fixture. Real +// `spacedock claude` use has no such CI var (cwd IS the project), so the child env +// must not carry it either. 
func cleanEnviron(drop ...string) []string { dropped := make(map[string]bool, len(drop)) for _, k := range drop { @@ -79,7 +86,7 @@ func cleanEnviron(drop ...string) []string { if i := strings.IndexByte(kv, '='); i >= 0 { key = kv[:i] } - if dropped[key] { + if dropped[key] || isCIRepoNamingVar(key) { continue } env = append(env, kv) @@ -87,6 +94,17 @@ func cleanEnviron(drop ...string) []string { return env } +// isCIRepoNamingVar reports whether key is a CI-runner-injected GITHUB_*/RUNNER_* +// var. The whole family names or points at the real repo / the runner workspace +// (GITHUB_WORKSPACE is the proven FO lure) and has no place in a production +// `spacedock claude` environment, so the live-test child env scrubs all of it. The +// credential (ANTHROPIC_API_KEY), the archive path (CLAUDE_CONFIG_DIR — already +// resolved into a literal string before the child env is built), and PATH are NOT +// in this family, so the scrub leaves the launcher's needs intact. +func isCIRepoNamingVar(key string) bool { + return strings.HasPrefix(key, "GITHUB_") || strings.HasPrefix(key, "RUNNER_") +} + // resolveClaudeConfigDir picks the child claude's CLAUDE_CONFIG_DIR. When the // parent env sets it (the CI path: the live job points it at an archivable // $RUNNER_TEMP dir so the upload step grabs projects//*.jsonl after a diff --git a/internal/ensigncycle/streamwatch_test.go b/internal/ensigncycle/streamwatch_test.go index 62643740..31a11c83 100644 --- a/internal/ensigncycle/streamwatch_test.go +++ b/internal/ensigncycle/streamwatch_test.go @@ -533,6 +533,12 @@ type streamToolInput struct { SubagentType string `json:"subagent_type"` Description string `json:"description"` Message json.RawMessage `json:"message"` + // Command is the shell command of a Bash tool_use, read by the wrong-root boot + // detector to spot a `cd` off the fixture root or a boot --workflow-dir outside it. 
+ Command string `json:"command"` + // FilePath is the target of a Read/Write tool_use, used the same way (a workflow + // README read from outside the fixture root is a wander signature). + FilePath string `json:"file_path"` } // toolUseBlock returns the first tool_use block of an assistant entry, or nil — diff --git a/internal/ensigncycle/wrong_root_detect_impl_test.go b/internal/ensigncycle/wrong_root_detect_impl_test.go new file mode 100644 index 00000000..8a74243e --- /dev/null +++ b/internal/ensigncycle/wrong_root_detect_impl_test.go @@ -0,0 +1,128 @@ +// ABOUTME: Pure detector that fails LOUD and EARLY when the live FO booted the +// ABOUTME: wrong root (cd off the fixture / a workflow-dir outside it) — PR #365. +package ensigncycle + +import ( + "encoding/json" + "fmt" + "path/filepath" + "strings" +) + +// detectWrongRootBoot scans a captured FO stream for the wrong-root wander that +// PR #365's opus run hit: a CI env leak (GITHUB_WORKSPACE naming the real repo) +// lured the FO to `cd` into the real checkout and boot its docs/dev workflow +// instead of the test's tmpdir fixture, after which it greeted-and-stopped with +// dispatchable:[] — surfacing only as a confusing pre-TeamCreate timeout. This +// detector turns that silent wander into a legible "FO booted the wrong root" +// failure naming the expected fixture root vs the actual wandered-to path. +// +// It is model-agnostic (it reads the tool-call stream, not any model-specific +// phrasing) and pure (stream + fixtureRoot in, error out), with its own offline +// test. The wander signatures it keys on, all observable in the boot stream: +// +// - a `cd ` whose target escapes the fixture root, +// - a `spacedock status --boot --workflow-dir ` whose PATH escapes it, and +// - a `Read /README.md` (the boot's workflow-README read) outside it. 
+// +// It deliberately does NOT flag the legitimate real-repo paths a correct boot +// touches: the FO Reads its contract skills from the --plugin-dir checkout (the +// real repo) by design, so a contract Read outside the fixture is NOT a wander — +// only the WORKFLOW root (where it boots / cd's / reads the workflow README) must +// stay under the fixture. +func detectWrongRootBoot(stream, fixtureRoot string) error { + clean := filepath.Clean(fixtureRoot) + for _, line := range strings.Split(stream, "\n") { + line = strings.TrimSpace(line) + if line == "" || !strings.HasPrefix(line, "{") { + continue + } + var e streamEntry + if json.Unmarshal([]byte(line), &e) != nil { + continue + } + b := e.toolUseBlock() + if b == nil { + continue + } + switch b.Name { + case "Bash": + if target, ok := wanderTarget(b.Input.Command, clean); ok { + return fmt.Errorf("FO booted the wrong root: expected the fixture root %q, but the boot command %q targets %q (outside the fixture) — a CI env leak likely lured the FO off its launch cwd", + clean, strings.TrimSpace(b.Input.Command), target) + } + case "Read": + // The FO reads {workflow_dir}/README.md at boot (Startup step 4). A + // workflow README read OUTSIDE the fixture means it booted the wrong + // workflow. Contract skills live under {plugin_dir}/skills/...references/, + // never a bare /README.md, so this does not flag a contract read. + if target, ok := wanderWorkflowReadme(b.Input.FilePath, clean); ok { + return fmt.Errorf("FO booted the wrong root: expected the fixture root %q, but it read the workflow README at %q (outside the fixture) — a CI env leak likely lured the FO off its launch cwd", + clean, target) + } + } + } + return nil +} + +// wanderWorkflowReadme returns the off-fixture absolute path of a workflow README +// read, when filePath is an absolute `/README.md` outside fixtureRoot. 
ok is +// false for a relative path, a README under the fixture, or any non-README read +// (a contract-skill Read under {plugin_dir}/skills is not a workflow README). +func wanderWorkflowReadme(filePath, fixtureRoot string) (string, bool) { + if filePath == "" || filepath.Base(filePath) != "README.md" || !filepath.IsAbs(filePath) { + return "", false + } + p := filepath.Clean(filePath) + if isUnder(p, fixtureRoot) { + return "", false + } + return p, true +} + +// wanderTarget returns the off-fixture absolute path a boot command targets, when +// the command is a `cd ` or a `--workflow-dir ` resolving outside +// fixtureRoot. ok is false when the command names no such escaping path (it stays +// under the fixture, uses a relative path, or is an ordinary command). +func wanderTarget(command, fixtureRoot string) (string, bool) { + for _, tok := range bootPathArgs(command) { + if !filepath.IsAbs(tok) { + continue + } + p := filepath.Clean(tok) + if p == fixtureRoot || isUnder(p, fixtureRoot) { + continue + } + return p, true + } + return "", false +} + +// bootPathArgs pulls the path arguments a boot command supplies: the target of a +// leading `cd`, and the value after `--workflow-dir`. It splits on whitespace (the +// boot commands are simple `cd …`, `spacedock status --boot --workflow-dir …` +// forms; quoting is not exercised by the real boot stream). +func bootPathArgs(command string) []string { + fields := strings.Fields(command) + var paths []string + for i, f := range fields { + switch { + case f == "cd" && i+1 < len(fields): + paths = append(paths, fields[i+1]) + case f == "--workflow-dir" && i+1 < len(fields): + paths = append(paths, fields[i+1]) + case strings.HasPrefix(f, "--workflow-dir="): + paths = append(paths, strings.TrimPrefix(f, "--workflow-dir=")) + } + } + return paths +} + +// isUnder reports whether path p is nested under dir (both already cleaned). 
+func isUnder(p, dir string) bool { + rel, err := filepath.Rel(dir, p) + if err != nil { + return false + } + return rel != ".." && !strings.HasPrefix(rel, ".."+string(filepath.Separator)) +} diff --git a/internal/ensigncycle/wrong_root_detect_test.go b/internal/ensigncycle/wrong_root_detect_test.go new file mode 100644 index 00000000..f2546721 --- /dev/null +++ b/internal/ensigncycle/wrong_root_detect_test.go @@ -0,0 +1,125 @@ +// ABOUTME: Offline test for the wrong-root boot detector — proves it reds on a +// ABOUTME: simulated FO wander off the fixture root and passes a fixture-rooted boot. +package ensigncycle + +import ( + "strings" + "testing" +) + +// streamLine builds one assistant-with-Bash-tool_use stream-json line carrying the +// given shell command, the shape detectWrongRootBoot scans. Kept tiny so the cases +// read as "this command in the boot stream" without a fixture file. +func streamLine(command string) string { + return `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Bash","input":{"command":` + + mustJSONString(command) + `}}]}}` +} + +// mustJSONString wraps s in double quotes and backslash-escapes embedded double +// quotes; no other character is escaped. +func mustJSONString(s string) string { + // A minimal JSON string encoder for the test fixtures (the commands here carry + // no control chars or backslashes, which would pass through unescaped), so the + // line is valid stream-json without importing the encoder into the test body. + return `"` + strings.ReplaceAll(s, `"`, `\"`) + `"` +} + +// TestDetectWrongRootBoot covers the pure detector both ways: it reds on a stream +// where the FO `cd`s off the fixture root (the PR #365 opus wander) or boots a +// workflow-dir outside it, and it passes on a fixture-rooted boot whose only +// real-repo paths are the legitimate --plugin-dir contract reads. 
+func TestDetectWrongRootBoot(t *testing.T) { + const fixtureRoot = "/tmp/TestLiveEnsignCycle1166216625/002" + const realRepo = "/home/runner/work/spacedock/spacedock" + + t.Run("cd_away_from_fixture_root_reds", func(t *testing.T) { + stream := strings.Join([]string{ + streamLine(`echo "CLAUDECODE=${CLAUDECODE:-unset}"`), + streamLine(`cd ` + realRepo + ` && spacedock --version`), + streamLine(`spacedock status --discover`), + }, "\n") + + err := detectWrongRootBoot(stream, fixtureRoot) + if err == nil { + t.Fatal("detector passed a stream where the FO cd'd into the real repo — want a wrong-root error") + } + if !strings.Contains(err.Error(), fixtureRoot) || !strings.Contains(err.Error(), realRepo) { + t.Errorf("error must name both expected (%q) and actual (%q): %v", fixtureRoot, realRepo, err) + } + }) + + t.Run("boot_workflow_dir_outside_fixture_reds", func(t *testing.T) { + stream := strings.Join([]string{ + streamLine(`spacedock --version`), + streamLine(`spacedock status --boot --workflow-dir ` + realRepo + `/docs/dev`), + }, "\n") + + err := detectWrongRootBoot(stream, fixtureRoot) + if err == nil { + t.Fatal("detector passed a boot whose --workflow-dir is the real repo — want a wrong-root error") + } + if !strings.Contains(err.Error(), fixtureRoot) { + t.Errorf("error must name the expected fixture root %q: %v", fixtureRoot, err) + } + }) + + t.Run("workflow_readme_outside_fixture_reds", func(t *testing.T) { + // The FO discovered cwd from the real repo and read its docs/dev workflow + // README — the wander even when no explicit --workflow-dir is on the boot cmd. 
+ stream := strings.Join([]string{ + streamLine(`spacedock --version`), + streamLine(`spacedock status --discover`), + `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{"file_path":"` + realRepo + `/docs/dev/README.md"}}]}}`, + }, "\n") + + err := detectWrongRootBoot(stream, fixtureRoot) + if err == nil { + t.Fatal("detector passed a boot that read the real repo's workflow README — want a wrong-root error") + } + if !strings.Contains(err.Error(), fixtureRoot) { + t.Errorf("error must name the expected fixture root %q: %v", fixtureRoot, err) + } + }) + + t.Run("contract_skill_read_outside_fixture_passes", func(t *testing.T) { + // A contract-skill Read from the real-repo --plugin-dir is legitimate, NOT a + // workflow README, so it must not false-red. + stream := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{"file_path":"` + realRepo + `/skills/first-officer/references/claude-first-officer-runtime.md"}}]}}` + if err := detectWrongRootBoot(stream, fixtureRoot); err != nil { + t.Errorf("detector red a legitimate contract-skill Read from the plugin-dir: %v", err) + } + }) + + t.Run("fixture_rooted_boot_passes", func(t *testing.T) { + // A correct boot: the contract skill Reads come from the real-repo + // --plugin-dir (legitimate), but the workflow boot stays under the fixture. 
+ stream := strings.Join([]string{ + `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{"file_path":"` + realRepo + `/skills/first-officer/references/first-officer-shared-core.md"}}]}}`, + streamLine(`spacedock --version`), + streamLine(`git rev-parse --show-toplevel`), + streamLine(`spacedock status --boot --workflow-dir ` + fixtureRoot), + `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{"file_path":"` + fixtureRoot + `/README.md"}}]}}`, + }, "\n") + + if err := detectWrongRootBoot(stream, fixtureRoot); err != nil { + t.Errorf("detector red a fixture-rooted boot (plugin-dir contract reads are legitimate): %v", err) + } + }) + + t.Run("cd_to_fixture_subdir_passes", func(t *testing.T) { + // `cd` INTO the fixture (or a subdir) is not a wander. + stream := strings.Join([]string{ + streamLine(`cd ` + fixtureRoot + ` && spacedock status --discover`), + }, "\n") + + if err := detectWrongRootBoot(stream, fixtureRoot); err != nil { + t.Errorf("detector red a cd into the fixture root itself: %v", err) + } + }) + + t.Run("empty_stream_does_not_false_red", func(t *testing.T) { + // No Bash commands at all (e.g. a launch failure stream) is not a wrong-root + // boot — it is a different failure the caller's own checks surface. + if err := detectWrongRootBoot("", fixtureRoot); err != nil { + t.Errorf("detector red an empty stream: %v", err) + } + }) +}