Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/specs/scenario-testing-principles.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ The first foundation is the host-neutral runtime scenarios already shipped and h
- `rejection-flow` — the FO drives a two-cycle rejection trajectory: route the finding back through implementation, re-implement, and re-validate a second cycle reusing the kept-alive reviewer.
- `feedback-3-cycle-escalation` — on the third consecutive REJECTED validation the FO escalates to the human instead of auto-bouncing a fourth time.
- `merge-hook-guardrail` — the FO cannot bypass a registered merge hook by terminalizing without pr, mod-block, or force.
- `filing` — the FO files a new seed entity via the atomic `spacedock new <slug>` path, not the drift-prone `--next-id` + hand-write pair.
<!-- /seed-scenarios -->

These IDs are the code-backed source of truth. They mirror the `sharedRuntimeScenarios()` table in `internal/ensigncycle`; the seed IDs declared above must equal that table. This block is machine-readable so that a lock test can bind the doc to the code and go red on drift in either direction — adding, dropping, or renaming a scenario on one side without the other. This is what makes the doc the human-readable face of a code-backed truth rather than prose bound to nothing.
Expand Down
22 changes: 22 additions & 0 deletions internal/ensigncycle/claude_live_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func claudeScenarioRunners() map[string]func(*testing.T, claudeLiveRunner, share
"rejection-flow": runClaudeRejectionFlowScenario,
"feedback-3-cycle-escalation": runClaudeFeedback3CycleEscalationScenario,
"merge-hook-guardrail": runClaudeMergeHookGuardrailScenario,
"filing": runClaudeFilingScenario,
}
}

Expand Down Expand Up @@ -203,6 +204,27 @@ func runClaudeMergeHookGuardrailScenario(t *testing.T, runner claudeLiveRunner,
emitClaudeScenarioMetrics(t, scenario, result, runner.model)
}

// runClaudeFilingScenario is the Claude runner for the `filing` scenario. It
// prepares a workflow fixture in a fresh temp dir, launches the run with the
// filing prompt, and then applies two checks in order:
//
//  1. the seed entity file exists at the path the fixture helper returned, and
//  2. the recorded stream passes assertClaudeFilingViaNew for filingSlug.
//
// The file check alone cannot tell the atomic path from a hand-written file, so
// the stream check is what grades HOW the entity was filed. Either failure
// aborts the test with the final message and artifact directory attached;
// metrics are emitted only when both checks pass.
func runClaudeFilingScenario(t *testing.T, runner claudeLiveRunner, scenario sharedRuntimeScenario) {
	t.Helper()

	root := t.TempDir()
	seedPath := writeFilingWorkflow(t, root)
	result := runner.run(t, scenario, root, filingPrompt())

	// The seed must actually exist on disk before the stream is graded.
	_, statErr := os.Stat(seedPath)
	if statErr != nil {
		t.Fatalf("the FO did not land the seed entity at %s: %v\nFinal message:\n%s\nArtifacts: %s", seedPath, statErr, result.finalMessage, result.artifactDir)
	}

	gradeErr := assertClaudeFilingViaNew(result.stream, filingSlug)
	if gradeErr != nil {
		t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", gradeErr, result.finalMessage, result.artifactDir)
	}

	emitClaudeScenarioMetrics(t, scenario, result, runner.model)
}

// run launches the real `spacedock claude` front door for one shared scenario and
// returns the (finalMessage, full stream) the shared assertions consume. The
// launch shape is the spike WINNER: --plugin-dir + --skip-contract-check are the
Expand Down
22 changes: 22 additions & 0 deletions internal/ensigncycle/codex_live_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ func codexScenarioRunners() map[string]func(*testing.T, codexLiveRunner, sharedR
"rejection-flow": runCodexRejectionFlowScenario,
"feedback-3-cycle-escalation": runCodexFeedback3CycleEscalationScenario,
"merge-hook-guardrail": runCodexMergeHookGuardrailScenario,
"filing": runCodexFilingScenario,
}
}

Expand Down Expand Up @@ -214,6 +215,27 @@ func runCodexMergeHookGuardrailScenario(t *testing.T, runner codexLiveRunner, sc
emitCodexScenarioMetrics(t, scenario, result)
}

// runCodexFilingScenario is the Codex runner for the `filing` scenario. It
// prepares a workflow fixture in a fresh temp dir, launches the run with the
// filing prompt, and then applies two checks in order:
//
//  1. the seed entity file exists at the path the fixture helper returned, and
//  2. the recorded JSONL passes assertCodexFilingViaNew for filingSlug.
//
// The file check alone cannot tell the atomic path from a preview-then-write
// flow, so the command-stream check is what grades HOW the entity was filed.
// Either failure aborts the test with the final message and artifact directory
// attached; metrics are emitted only when both checks pass.
func runCodexFilingScenario(t *testing.T, runner codexLiveRunner, scenario sharedRuntimeScenario) {
	t.Helper()

	root := t.TempDir()
	seedPath := writeFilingWorkflow(t, root)
	result := runner.run(t, scenario, root, filingPrompt())

	// The seed must actually exist on disk before the stream is graded.
	_, statErr := os.Stat(seedPath)
	if statErr != nil {
		t.Fatalf("the FO did not land the seed entity at %s: %v\nFinal message:\n%s\nArtifacts: %s", seedPath, statErr, result.finalMessage, result.artifactDir)
	}

	gradeErr := assertCodexFilingViaNew(result.jsonl, filingSlug)
	if gradeErr != nil {
		t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", gradeErr, result.finalMessage, result.artifactDir)
	}

	emitCodexScenarioMetrics(t, scenario, result)
}

// run launches `codex exec --json` for one shared scenario. Liveness is the SAME
// streamWatcher the Claude runner and the live cycle use — one mechanism, no second
// impl. drainToExit runs the process to exit accumulating the full --json
Expand Down
4 changes: 4 additions & 0 deletions internal/ensigncycle/pi_shared_coverage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ func piSharedScenarioCoverageMap() map[string]piSharedScenarioCoverage {
mode: "gap",
reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer merge-hook runner.",
},
"filing": {
mode: "gap",
reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer filing runner.",
},
}
}

Expand Down
98 changes: 98 additions & 0 deletions internal/ensigncycle/shared_filing_negative_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package ensigncycle

import "testing"

// Offline positive + negative cases for the `filing` scenario assertions. They
// build synthetic host streams — a stream that filed via `new` (passes) and the
// SPECIFIC manual-flow streams the assertion guards against (`--next-id` + a
// hand-write, must go red) — so a tautological assertion that only checked "a new
// command appeared" would stay green on the manual flow and these cases fail it.
// Offline (default tag): the assertions are pure functions over the transcript.

// claudeToolUse returns one stream-json line: an assistant message whose content
// holds a single tool_use block for the tool called name. inputJSON is spliced
// in verbatim as the block's "input" value, so it must already be valid JSON;
// name is inserted without escaping.
func claudeToolUse(name, inputJSON string) string {
	const (
		head = `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"`
		mid  = `","input":`
		tail = `}]}}`
	)
	return head + name + mid + inputJSON + tail
}

// codexCommand returns one `codex exec --json` line: an item.completed event
// wrapping a command_execution item whose "command" is the given string. The
// command is inserted without JSON escaping, so callers must pass text that is
// safe inside a JSON string literal (no bare quotes or backslashes).
func codexCommand(command string) string {
	const (
		head = `{"type":"item.completed","item":{"type":"command_execution","command":"`
		tail = `"}}`
	)
	return head + command + tail
}

// TestAssertClaudeFilingViaNew feeds assertClaudeFilingViaNew synthetic Claude
// streams and checks its verdict on each: two atomic-create shapes that must
// pass, three manual-flow shapes that must fail, and one preview-without-write
// shape that must pass.
func TestAssertClaudeFilingViaNew(t *testing.T) {
	target := filingSlug

	// Shared building blocks: the `--next-id` preview Bash call and the
	// hand-written entity file that together make up the manual flow.
	preview := claudeToolUse("Bash", `{"command":"spacedock status --next-id --workflow-dir ."}`)
	handWrite := claudeToolUse("Write", `{"file_path":"001-`+target+`.md","content":"---\nid: 001\n---\n"}`)

	// Pass: the `new` subcommand with the body piped in on stdin.
	atomic := claudeToolUse("Bash", `{"command":"spacedock new `+target+` --workflow-dir . <<'EOF'\n# Wire The Thing\nbody\nEOF"}`)
	err := assertClaudeFilingViaNew(atomic, target)
	if err != nil {
		t.Fatalf("expected a `new`-filed stream to pass: %v", err)
	}

	// Pass: the `--new` flag spelling is accepted as the same atomic path.
	viaFlag := claudeToolUse("Bash", `{"command":"spacedock status --new `+target+` --workflow-dir ."}`)
	err = assertClaudeFilingViaNew(viaFlag, target)
	if err != nil {
		t.Fatalf("expected the `--new` alias to count as atomic filing: %v", err)
	}

	// Fail: only an id preview, never a create call — trips the missing-`new`
	// half of the assertion.
	err = assertClaudeFilingViaNew(preview, target)
	if err == nil {
		t.Fatal("expected a stream with no `new` command to fail")
	}

	// Fail: the manual pair on its own — a preview followed by a hand-written
	// entity file. The resulting file would look the same on disk, which is why
	// the stream is what gets graded.
	manual := preview + "\n" + handWrite
	err = assertClaudeFilingViaNew(manual, target)
	if err == nil {
		t.Fatal("expected the manual `--next-id` + `Write` pair to fail even with no `new` command")
	}

	// Fail: atomic create AND the manual pair in one run. The presence of `new`
	// must not mask the hand-write.
	err = assertClaudeFilingViaNew(atomic+"\n"+manual, target)
	if err == nil {
		t.Fatal("expected `new` plus the manual `--next-id` + `Write` pair to fail on the pair check")
	}

	// Pass: atomic create plus a bare preview. Without the entity Write there is
	// no manual pair, so the preview alone is harmless.
	err = assertClaudeFilingViaNew(atomic+"\n"+preview, target)
	if err != nil {
		t.Fatalf("expected `new` plus a bare `--next-id` preview (no entity Write) to pass: %v", err)
	}
}

// TestAssertCodexFilingViaNew feeds assertCodexFilingViaNew synthetic Codex
// command streams and checks its verdict on each: one atomic-create stream that
// must pass and three streams that must fail (no create call, create plus a
// `--next-id` preview, and a preview alone).
func TestAssertCodexFilingViaNew(t *testing.T) {
	target := filingSlug

	// The `--next-id` preview command is the manual flow's marker on Codex.
	preview := codexCommand("spacedock status --next-id --workflow-dir .")

	// Pass: a `spacedock new <slug>` command_execution.
	atomic := codexCommand("spacedock new " + target + " --workflow-dir .")
	err := assertCodexFilingViaNew(atomic, target)
	if err != nil {
		t.Fatalf("expected a `new`-filed Codex stream to pass: %v", err)
	}

	// Fail: a plain status call, no create — trips the missing-`new` half.
	statusOnly := codexCommand("spacedock status --workflow-dir .")
	err = assertCodexFilingViaNew(statusOnly, target)
	if err == nil {
		t.Fatal("expected a Codex stream with no `new` command to fail")
	}

	// Fail: create plus a `--next-id` preview. Codex has no Write tool to pair
	// with, so the preview command by itself is the discriminator, and it must
	// fail the run even though `new` was also executed.
	err = assertCodexFilingViaNew(atomic+"\n"+preview, target)
	if err == nil {
		t.Fatal("expected a `--next-id` filing command on Codex to fail even alongside `new`")
	}

	// Fail: the preview alone. Both halves are violated; the missing-`new` check
	// is the one that reports first.
	err = assertCodexFilingViaNew(preview, target)
	if err == nil {
		t.Fatal("expected a `--next-id`-only Codex stream to fail")
	}
}
155 changes: 155 additions & 0 deletions internal/ensigncycle/shared_filing_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package ensigncycle

import (
"encoding/json"
"fmt"
"regexp"
"strings"
)

// The host-specific filing assertions for the `filing` scenario. They grade the
// FO's recorded tool-call STREAM — not the end-state file, which looks identical
// whether filed via `spacedock new` or hand-assembled, and not a grep of the
// contract prose. The producer signal is: the FO ran a `spacedock … new <slug>`
// invocation (the atomic-create path) and did NOT fall back to the manual
// `--next-id` + file-write pair. Claude runs commands via the `Bash` tool and
// writes files via the `Write` tool; Codex runs everything (including file
// writes) as `command_execution` items — so the manual-pair shape differs per
// host and the assertions live behind host adapters, like reviewer-reuse. They
// sit under the DEFAULT build tags (stdlib JSON only) so the offline negative
// tests exercise them without spending a model.

// newInvocation matches a spacedock atomic-create invocation in a command string:
// a `spacedock` or `SPACEDOCK_BIN` launcher reference followed, on the SAME line,
// by either the `new` subcommand or the `--new` flag. Both alternatives are
// word-bounded on the right, so a longer flag that merely starts with `--new`
// (for example `--newest`) does not count as a create call. The slug is checked
// separately by commandFilesViaNew.
var newInvocation = regexp.MustCompile(`(?:spacedock|SPACEDOCK_BIN)[^\n]*?(?:\bnew\b|--new\b)`)

// nextIDInvocation matches a `--next-id` flag anywhere in a command string — the
// candidate-id preview that forms the first half of the manual filing flow.
var nextIDInvocation = regexp.MustCompile(`--next-id\b`)

// commandFilesViaNew reports whether command is the atomic-create call for slug:
// it must contain a `new`/`--new` spacedock invocation (per newInvocation) AND
// mention slug somewhere in the command text.
//
// An empty slug never matches. strings.Contains is vacuously true for "", which
// would otherwise let any `new` invocation count as filing the requested entity.
func commandFilesViaNew(command, slug string) bool {
	if slug == "" {
		return false
	}
	return newInvocation.MatchString(command) && strings.Contains(command, slug)
}

// assertClaudeFilingViaNew grades a Claude stream-json transcript for the
// `filing` scenario. It walks the transcript line by line, decodes each line as
// JSON, and inspects every tool_use block in the message content. Lines that
// are blank, fail to decode, or carry no message are skipped.
//
// Three facts are collected:
//   - a Bash command that commandFilesViaNew accepts for slug (atomic create),
//   - a Bash command containing `--next-id` (the id preview),
//   - a Write whose file_path contains slug and ends in ".md" (hand-written
//     entity file).
//
// It returns an error when no atomic-create command was seen, or when both the
// preview and the entity Write were seen (the manual pair) — the latter even if
// an atomic create also ran. Otherwise it returns nil.
func assertClaudeFilingViaNew(stream, slug string) error {
	type toolInput struct {
		Command  string `json:"command"`
		FilePath string `json:"file_path"`
	}
	type contentBlock struct {
		Type  string    `json:"type"`
		Name  string    `json:"name"`
		Input toolInput `json:"input"`
	}
	type streamLine struct {
		Message *struct {
			Content []contentBlock `json:"content"`
		} `json:"message"`
	}

	var usedNew, previewedID, handWroteEntity bool

	for _, raw := range strings.Split(stream, "\n") {
		if raw = strings.TrimSpace(raw); raw == "" {
			continue
		}
		var parsed streamLine
		if json.Unmarshal([]byte(raw), &parsed) != nil || parsed.Message == nil {
			continue
		}
		for _, blk := range parsed.Message.Content {
			if blk.Type != "tool_use" {
				continue
			}
			if blk.Name == "Bash" {
				cmd := blk.Input.Command
				usedNew = usedNew || commandFilesViaNew(cmd, slug)
				previewedID = previewedID || nextIDInvocation.MatchString(cmd)
			} else if blk.Name == "Write" {
				path := blk.Input.FilePath
				if strings.HasSuffix(path, ".md") && strings.Contains(path, slug) {
					handWroteEntity = true
				}
			}
		}
	}

	switch {
	case !usedNew:
		return fmt.Errorf("the FO did not file the seed via a `spacedock … new %s` command — it never used the atomic-create path", slug)
	case previewedID && handWroteEntity:
		return fmt.Errorf("the FO emitted the manual `--next-id` + `Write %s.md` pair — it hand-assembled the entity instead of letting `new` write it atomically", slug)
	}
	return nil
}

// codexCommandItem is the decode target for one line of `codex exec --json`
// output. Only the fields the filing assertion reads are declared; any other
// keys on the line are ignored by encoding/json.
//
// Per the surrounding design notes, Codex runs every shell action — including
// writing a file via heredoc/apply_patch — as a command_execution, so both the
// `new` invocation and any manual file-write land here.
type codexCommandItem struct {
// Type is the top-level event type of the line.
Type string `json:"type"`
// Item is the event payload.
Item struct {
// Type is the item kind; the filing assertion only looks at
// "command_execution" items.
Type string `json:"type"`
// Command is the command text that was executed.
Command string `json:"command"`
} `json:"item"`
}

// assertCodexFilingViaNew grades a `codex exec --json` transcript for the
// `filing` scenario. It walks the transcript line by line, decodes each line
// into a codexCommandItem, and inspects only command_execution items. Lines that
// are blank or fail to decode are skipped.
//
// Two facts are collected:
//   - a command that commandFilesViaNew accepts for slug (atomic create),
//   - a command containing `--next-id` (the manual flow's id preview).
//
// Codex has no separate Write tool to pair the preview with, so the preview
// command alone is treated as evidence of the manual flow. The function returns
// an error when no atomic-create command was seen, or when any `--next-id`
// command was seen — even alongside an atomic create. Otherwise it returns nil.
func assertCodexFilingViaNew(jsonl, slug string) error {
	var usedNew, previewedID bool

	for _, raw := range strings.Split(jsonl, "\n") {
		if raw = strings.TrimSpace(raw); raw == "" {
			continue
		}
		var event codexCommandItem
		if json.Unmarshal([]byte(raw), &event) != nil || event.Item.Type != "command_execution" {
			continue
		}
		cmd := event.Item.Command
		usedNew = usedNew || commandFilesViaNew(cmd, slug)
		previewedID = previewedID || nextIDInvocation.MatchString(cmd)
	}

	switch {
	case !usedNew:
		return fmt.Errorf("the FO did not file the seed via a `spacedock … new %s` command — it never used the atomic-create path", slug)
	case previewedID:
		return fmt.Errorf("the FO ran a `--next-id` filing command — it reached for the manual preview-then-write flow instead of the atomic `new` path")
	}
	return nil
}
47 changes: 47 additions & 0 deletions internal/ensigncycle/shared_fixtures_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,50 @@ func mergeHookGuardPrompt() string {
"Do not edit, archive, approve, force, set mod-block, or retry terminalization. Your final response must include the guard error mentioning merge hooks.",
)
}

// filingSlug is the slug the FO is asked to file in the `filing` scenario. The
// prompt names it, the stream assertions look for it in the
// `spacedock … new <slug>` command, and the fixture helper expects the entity
// file to land on disk as `<filingSlug>.md`.
const filingSlug = "wire-the-thing"

// writeFilingWorkflow lays down the `filing` fixture under root: it writes the
// workflow README, initializes git in root, and returns the path where the seed
// entity is expected to appear. It does NOT create the entity — the FO files it
// during the run, and the runner stats the returned path afterwards.
//
// The returned path is the flat `<slug>.md` form: per the scenario's design,
// `spacedock new <slug>` stamps the minted id into the frontmatter rather than
// the filename.
func writeFilingWorkflow(t *testing.T, root string) string {
	t.Helper()

	readmePath := filepath.Join(root, "README.md")
	writeFile(t, readmePath, filingReadme())
	gitInit(t, root)

	seedFile := filingSlug + ".md"
	return filepath.Join(root, seedFile)
}

// filingReadme returns the README for the `filing` fixture: YAML frontmatter
// declaring a sequential-id task workflow with two states (backlog as the
// initial state, done as the terminal state), followed by a short markdown body
// describing the fixture and each stage.
//
// The frontmatter is nested with two-space indentation so that `worktree` and
// `concurrency` sit under `stages.defaults`, and `initial`/`terminal` stay
// inside their respective `states` list items. Flattening these to a single
// indent level makes the block invalid YAML (a list item appears after a scalar
// mapping entry at the same level).
func filingReadme() string {
	const frontmatter = "---\n" +
		"commissioned-by: spacedock@1\n" +
		"entity-type: task\n" +
		"id-style: sequential\n" +
		"stages:\n" +
		"  defaults:\n" +
		"    worktree: false\n" +
		"    concurrency: 1\n" +
		"  states:\n" +
		"    - name: backlog\n" +
		"      initial: true\n" +
		"    - name: done\n" +
		"      terminal: true\n" +
		"---\n"

	const body = "# Filing Fixture\n\n" +
		"This fixture starts EMPTY: there are no entities yet. The first officer is asked to file one seed task. The id-style is `sequential`, so the manual flow (`status --next-id` then hand-writing the file) is available — the scenario proves the FO instead uses the atomic-create path.\n\n" +
		"### backlog\n\nSeed tasks land here.\n\n- **Outputs:** A filed seed entity.\n\n" +
		"### done\n\nTerminal state.\n"

	return frontmatter + body
}

// filingPrompt returns the operator prompt for the `filing` scenario. It is five
// lines: the skill directive, a blank line, the workflow directory, the filing
// request (naming filingSlug and the title), the instruction to use the
// atomic-create path rather than a preview-then-hand-write, and the limits on
// what the FO may do plus what its final response must confirm.
func filingPrompt() string {
	const (
		skill    = "Use $spacedock:first-officer for this whole run."
		location = "Workflow directory: ."
		method   = "File it using the blessed atomic-create path your contract teaches, not by hand-assembling frontmatter after a candidate-id preview."
		limits   = "Do not dispatch any workers and do not advance the entity past backlog. Your final response must confirm the seed task was filed."
	)
	request := fmt.Sprintf("This workflow is empty. File one new seed task with the slug `%s` and the title `Wire The Thing`, landing it in the initial backlog stage with a one-line description body.", filingSlug)

	return fmt.Sprintf("%s\n\n%s\n%s\n%s\n%s", skill, location, request, method, limits)
}
Loading
Loading