spacedock-dev · clkao · Jun 13, 2026 · Jun 13, 2026
diff --git a/docs/specs/scenario-testing-principles.md b/docs/specs/scenario-testing-principles.md
@@ -59,6 +59,7 @@ The first foundation is the host-neutral runtime scenarios already shipped and h
 - `rejection-flow` — the FO drives a two-cycle rejection trajectory: route the finding back through implementation, re-implement, and re-validate a second cycle reusing the kept-alive reviewer.
 - `feedback-3-cycle-escalation` — on the third consecutive REJECTED validation the FO escalates to the human instead of auto-bouncing a fourth time.
 - `merge-hook-guardrail` — the FO cannot bypass a registered merge hook by terminalizing without pr, mod-block, or force.
+- `filing` — the FO files a new seed entity via the atomic `spacedock new <slug>` path, not the drift-prone `--next-id` + hand-write pair.
 <!-- /seed-scenarios -->
 
 These IDs are the code-backed source of truth. They mirror the `sharedRuntimeScenarios()` table in `internal/ensigncycle`; the seed IDs declared above must equal that table. This block is machine-readable so a lock test can bind the doc to the code and red on drift in either direction — adding, dropping, or renaming a scenario on one side without the other. This is what makes the doc the human-readable face of a code-backed truth rather than prose bound to nothing.

diff --git a/internal/ensigncycle/claude_live_runner_test.go b/internal/ensigncycle/claude_live_runner_test.go
@@ -102,6 +102,7 @@ func claudeScenarioRunners() map[string]func(*testing.T, claudeLiveRunner, share
 		"rejection-flow":              runClaudeRejectionFlowScenario,
 		"feedback-3-cycle-escalation": runClaudeFeedback3CycleEscalationScenario,
 		"merge-hook-guardrail":        runClaudeMergeHookGuardrailScenario,
+		"filing":                      runClaudeFilingScenario,
 	}
 }
 
@@ -203,6 +204,27 @@ func runClaudeMergeHookGuardrailScenario(t *testing.T, runner claudeLiveRunner,
 	emitClaudeScenarioMetrics(t, scenario, result, runner.model)
 }
 
+// runClaudeFilingScenario points the real FO at a workflow that starts with NO
+// entities and asks it to file a single seed. Two things are graded: the seed
+// file must exist on disk after the run (THAT it was filed), and the recorded
+// tool-call stream must show a `spacedock … new <slug>` call rather than the
+// `--next-id` + `Write` pair (HOW it was filed). The stream check is needed
+// because the file left on disk looks the same whichever path produced it.
+func runClaudeFilingScenario(t *testing.T, runner claudeLiveRunner, scenario sharedRuntimeScenario) {
+	t.Helper()
+	root := t.TempDir()
+	seedPath := writeFilingWorkflow(t, root)
+	outcome := runner.run(t, scenario, root, filingPrompt())
+
+	if _, statErr := os.Stat(seedPath); statErr != nil {
+		t.Fatalf("the FO did not land the seed entity at %s: %v\nFinal message:\n%s\nArtifacts: %s", seedPath, statErr, outcome.finalMessage, outcome.artifactDir)
+	}
+	if gradeErr := assertClaudeFilingViaNew(outcome.stream, filingSlug); gradeErr != nil {
+		t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", gradeErr, outcome.finalMessage, outcome.artifactDir)
+	}
+	emitClaudeScenarioMetrics(t, scenario, outcome, runner.model)
+}
+
 // run launches the real `spacedock claude` front door for one shared scenario and
 // returns the (finalMessage, full stream) the shared assertions consume. The
 // launch shape is the spike WINNER: --plugin-dir + --skip-contract-check are the

diff --git a/internal/ensigncycle/codex_live_runner_test.go b/internal/ensigncycle/codex_live_runner_test.go
@@ -75,6 +75,7 @@ func codexScenarioRunners() map[string]func(*testing.T, codexLiveRunner, sharedR
 		"rejection-flow":              runCodexRejectionFlowScenario,
 		"feedback-3-cycle-escalation": runCodexFeedback3CycleEscalationScenario,
 		"merge-hook-guardrail":        runCodexMergeHookGuardrailScenario,
+		"filing":                      runCodexFilingScenario,
 	}
 }
 
@@ -214,6 +215,27 @@ func runCodexMergeHookGuardrailScenario(t *testing.T, runner codexLiveRunner, sc
 	emitCodexScenarioMetrics(t, scenario, result)
 }
 
+// runCodexFilingScenario is the Codex twin of the Claude filing runner: the real
+// FO is pointed at a workflow with NO entities and asked to file a single seed.
+// The seed file must exist on disk after the run (THAT it was filed), and the
+// recorded command stream must show `spacedock … new <slug>` with no `--next-id`
+// preview-then-write (HOW it was filed). The stream check is needed because the
+// file left on disk looks the same whichever path produced it.
+func runCodexFilingScenario(t *testing.T, runner codexLiveRunner, scenario sharedRuntimeScenario) {
+	t.Helper()
+	root := t.TempDir()
+	seedPath := writeFilingWorkflow(t, root)
+	outcome := runner.run(t, scenario, root, filingPrompt())
+
+	if _, statErr := os.Stat(seedPath); statErr != nil {
+		t.Fatalf("the FO did not land the seed entity at %s: %v\nFinal message:\n%s\nArtifacts: %s", seedPath, statErr, outcome.finalMessage, outcome.artifactDir)
+	}
+	if gradeErr := assertCodexFilingViaNew(outcome.jsonl, filingSlug); gradeErr != nil {
+		t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", gradeErr, outcome.finalMessage, outcome.artifactDir)
+	}
+	emitCodexScenarioMetrics(t, scenario, outcome)
+}
+
 // run launches `codex exec --json` for one shared scenario. Liveness is the SAME
 // streamWatcher the Claude runner and the live cycle use — one mechanism, no second
 // impl. drainToExit runs the process to exit accumulating the full --json

diff --git a/internal/ensigncycle/pi_shared_coverage_test.go b/internal/ensigncycle/pi_shared_coverage_test.go
@@ -27,6 +27,10 @@ func piSharedScenarioCoverageMap() map[string]piSharedScenarioCoverage {
 			mode:   "gap",
 			reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer merge-hook runner.",
 		},
+		"filing": {
+			mode:   "gap",
+			reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer filing runner.",
+		},
 	}
 }
 

diff --git a/internal/ensigncycle/shared_filing_negative_test.go b/internal/ensigncycle/shared_filing_negative_test.go
@@ -0,0 +1,98 @@
+package ensigncycle
+
+import "testing"
+
+// Offline positive + negative cases for the `filing` scenario assertions. They
+// feed the assertions synthetic host streams: one that filed via `new` (must
+// pass) and the SPECIFIC manual-flow streams the assertions guard against
+// (`--next-id` + a hand-write, must go red). A tautological assertion that only
+// checked "a `new` command appeared" would stay green on the manual flow, so
+// these cases catch it. Offline (default tag): the assertions are pure functions over the transcript.
+
+// claudeToolUse builds one stream-json assistant line carrying a single tool_use block; name and inputJSON are spliced in verbatim, so inputJSON must already be valid JSON.
+func claudeToolUse(name, inputJSON string) string {
+	return `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"` + name + `","input":` + inputJSON + `}]}}`
+}
+
+// codexCommand builds one `codex exec --json` command_execution item line; command is spliced in unescaped, so it must not contain `"` or `\`.
+func codexCommand(command string) string {
+	return `{"type":"item.completed","item":{"type":"command_execution","command":"` + command + `"}}`
+}
+
+func TestAssertClaudeFilingViaNew(t *testing.T) {
+	slug := filingSlug
+
+	// Positive: one Bash call runs `spacedock new <slug>` with a heredoc body on stdin.
+	filed := claudeToolUse("Bash", `{"command":"spacedock new `+slug+` --workflow-dir . <<'EOF'\n# Wire The Thing\nbody\nEOF"}`)
+	if err := assertClaudeFilingViaNew(filed, slug); err != nil {
+		t.Fatalf("expected a `new`-filed stream to pass: %v", err)
+	}
+
+	// Positive: the `--new` flag alias (here on `status`) also counts as atomic filing.
+	filedAlias := claudeToolUse("Bash", `{"command":"spacedock status --new `+slug+` --workflow-dir ."}`)
+	if err := assertClaudeFilingViaNew(filedAlias, slug); err != nil {
+		t.Fatalf("expected the `--new` alias to count as atomic filing: %v", err)
+	}
+
+	// Negative: no atomic filing at all — the stream holds only a `--next-id`
+	// preview and no create call. Must fail on the missing-`new` half.
+	previewOnly := claudeToolUse("Bash", `{"command":"spacedock status --next-id --workflow-dir ."}`)
+	if err := assertClaudeFilingViaNew(previewOnly, slug); err == nil {
+		t.Fatal("expected a stream with no `new` command to fail")
+	}
+
+	// Negative: the manual pair — a `--next-id` preview THEN a `Write` of the
+	// entity file, with no `new` at all. This is the drift-prone flow `new` replaces.
+	// It trips the missing-`new` check; the pair check is isolated by the next case.
+	manualPair := claudeToolUse("Bash", `{"command":"spacedock status --next-id --workflow-dir ."}`) + "\n" +
+		claudeToolUse("Write", `{"file_path":"001-`+slug+`.md","content":"---\nid: 001\n---\n"}`)
+	if err := assertClaudeFilingViaNew(manualPair, slug); err == nil {
+		t.Fatal("expected the manual `--next-id` + `Write` pair to fail even with no `new` command")
+	}
+
+	// Negative: BOTH `new` AND the manual pair appear. The missing-`new` half is
+	// satisfied here, so only the pair check can reject this stream — proving the
+	// positive half cannot mask a manual write.
+	newPlusManual := filed + "\n" + manualPair
+	if err := assertClaudeFilingViaNew(newPlusManual, slug); err == nil {
+		t.Fatal("expected `new` plus the manual `--next-id` + `Write` pair to fail on the pair check")
+	}
+
+	// Positive: `new` alongside a bare `--next-id` (no entity Write) passes — a
+	// preview alone is not the manual flow; it needs the hand-write to pair with.
+	newWithPreview := filed + "\n" + claudeToolUse("Bash", `{"command":"spacedock status --next-id --workflow-dir ."}`)
+	if err := assertClaudeFilingViaNew(newWithPreview, slug); err != nil {
+		t.Fatalf("expected `new` plus a bare `--next-id` preview (no entity Write) to pass: %v", err)
+	}
+}
+
+func TestAssertCodexFilingViaNew(t *testing.T) {
+	slug := filingSlug
+
+	// Positive: a single command_execution item runs `spacedock new <slug>`.
+	filed := codexCommand("spacedock new " + slug + " --workflow-dir .")
+	if err := assertCodexFilingViaNew(filed, slug); err != nil {
+		t.Fatalf("expected a `new`-filed Codex stream to pass: %v", err)
+	}
+
+	// Negative: a plain `status` call is not a create — fails on the missing-`new` half.
+	none := codexCommand("spacedock status --workflow-dir .")
+	if err := assertCodexFilingViaNew(none, slug); err == nil {
+		t.Fatal("expected a Codex stream with no `new` command to fail")
+	}
+
+	// Negative: `new` ran, but so did a `--next-id` command. The Codex assertion
+	// has no Write tool to pair against, so `--next-id` alone is its discriminator
+	// (stricter than the Claude assertion). Only the `--next-id` check can reject this.
+	newPlusNextID := filed + "\n" + codexCommand("spacedock status --next-id --workflow-dir .")
+	if err := assertCodexFilingViaNew(newPlusNextID, slug); err == nil {
+		t.Fatal("expected a `--next-id` filing command on Codex to fail even alongside `new`")
+	}
+
+	// Negative: only the `--next-id` preview, no `new` — violates both halves; the
+	// assertion reports the missing-`new` failure because it checks that first.
+	nextIDOnly := codexCommand("spacedock status --next-id --workflow-dir .")
+	if err := assertCodexFilingViaNew(nextIDOnly, slug); err == nil {
+		t.Fatal("expected a `--next-id`-only Codex stream to fail")
+	}
+}
diff --git a/internal/ensigncycle/shared_filing_test.go b/internal/ensigncycle/shared_filing_test.go
@@ -0,0 +1,155 @@
+package ensigncycle
+
+import (
+	"encoding/json"
+	"fmt"
+	"regexp"
+	"strings"
+)
+
+// The host-specific filing assertions for the `filing` scenario. They grade the
+// FO's recorded tool-call STREAM — not the end-state file, which looks identical
+// whether filed via `spacedock new` or hand-assembled, and not a grep of the
+// contract prose. The producer signal is: the FO ran a `spacedock … new <slug>`
+// invocation (the atomic-create path) and did NOT fall back to the manual
+// `--next-id` + file-write pair. Claude runs commands via the `Bash` tool and
+// writes files via the `Write` tool; Codex runs everything (including file
+// writes) as `command_execution` items — so the manual-pair shape differs per
+// host and the assertions live behind host adapters, like reviewer-reuse. They
+// sit under the DEFAULT build tags (stdlib JSON only) so the offline negative
+// tests exercise them without spending a model.
+
+// newInvocation matches a spacedock atomic-create invocation in a command string:
+// the `new` subcommand or its `--new` flag alias, as a STANDALONE argument (after
+// a space/tab; before whitespace, `=`, or end of text) on the same line as a
+// `spacedock` / `${SPACEDOCK_BIN…}` launcher — so `./new` or `new-tasks` is not a verb. The slug is matched separately.
+var newInvocation = regexp.MustCompile(`(?:spacedock|SPACEDOCK_BIN)[^\n]*?[ \t](?:--)?new(?:[\s=]|$)`)
+
+// nextIDInvocation matches the `--next-id` flag anywhere in a command string (no
+// launcher is required) — the candidate preview that opens the manual filing pair.
+var nextIDInvocation = regexp.MustCompile(`--next-id\b`)
+
+// commandFilesViaNew is true only when command both mentions slug (as a plain
+// substring, anywhere) and carries an atomic-create verb per newInvocation.
+func commandFilesViaNew(command, slug string) bool {
+	return strings.Contains(command, slug) && newInvocation.MatchString(command)
+}
+
+// assertClaudeFilingViaNew grades a Claude stream-json transcript: the FO must
+// have filed the seed with a `spacedock … new <slug>` Bash call and must NOT have
+// produced the manual `--next-id` + `Write` pair. Both halves are enforced since
+// either one alone would false-pass the manual flow:
+//
+//  1. Some Bash tool_use ran a `spacedock … new <slug>` command (atomic create).
+//  2. The stream does not hold BOTH a `--next-id` Bash command AND a `Write`
+//     tool_use whose file_path names the slug and ends in `.md`. That pair is
+//     the drift-prone flow `new` replaces, so it FAILS even when `new` also ran.
+//     The two tool calls may appear in any order; only their presence is checked.
+func assertClaudeFilingViaNew(stream, slug string) error {
+	var filedViaNew, sawNextID, wroteEntityFile bool
+
+	for _, raw := range strings.Split(stream, "\n") {
+		raw = strings.TrimSpace(raw)
+		if raw == "" {
+			continue
+		}
+		// Decode only the fields the grade reads; everything else is ignored.
+		var entry struct {
+			Message *struct {
+				Content []struct {
+					Type  string `json:"type"`
+					Name  string `json:"name"`
+					Input struct {
+						Command  string `json:"command"`
+						FilePath string `json:"file_path"`
+					} `json:"input"`
+				} `json:"content"`
+			} `json:"message"`
+		}
+		// Lines that are not JSON, or that carry no message, hold no tool calls.
+		if err := json.Unmarshal([]byte(raw), &entry); err != nil || entry.Message == nil {
+			continue
+		}
+		for _, block := range entry.Message.Content {
+			if block.Type != "tool_use" {
+				continue
+			}
+			cmd, target := block.Input.Command, block.Input.FilePath
+			// Each flag latches: once true it stays true for the rest of the stream.
+			switch block.Name {
+			case "Bash":
+				// One command may set both flags (create verb and preview flag).
+				filedViaNew = filedViaNew || commandFilesViaNew(cmd, slug)
+				sawNextID = sawNextID || nextIDInvocation.MatchString(cmd)
+			case "Write":
+				// A hand-written entity file: path names the slug and is markdown.
+				wroteEntityFile = wroteEntityFile || (strings.Contains(target, slug) && strings.HasSuffix(target, ".md"))
+			}
+		}
+	}
+
+	// Half 1: the atomic-create call must be present.
+	if !filedViaNew {
+		return fmt.Errorf("the FO did not file the seed via a `spacedock … new %s` command — it never used the atomic-create path", slug)
+	}
+	// Half 2: the manual pair must be absent, even though `new` was seen.
+	if sawNextID && wroteEntityFile {
+		return fmt.Errorf("the FO emitted the manual `--next-id` + `Write %s.md` pair — it hand-assembled the entity instead of letting `new` write it atomically", slug)
+	}
+	return nil
+}
+
+// codexCommandItem decodes the fields of one `codex exec --json` event that the
+// filing grade reads: the event type plus the nested item's type and command
+// string. The grade assumes Codex reports every shell action — the `new` call and
+// any manual file-write (heredoc/apply_patch) — as a command_execution item.
+type codexCommandItem struct {
+	Type string `json:"type"`
+	Item struct {
+		Type    string `json:"type"`
+		Command string `json:"command"`
+	} `json:"item"`
+}
+
+// assertCodexFilingViaNew grades a `codex exec --json` transcript: the FO must
+// have filed the seed with `spacedock … new <slug>` and must NOT have used the
+// manual flow. Codex has no `Write` tool — a manual filing would be a `--next-id`
+// command followed by a shell file-write — so the `--next-id` preview command is
+// itself the discriminator: the atomic path never needs one. Both halves apply:
+//
+//  1. Some command_execution item ran a `spacedock … new <slug>` command.
+//  2. No command_execution item ran a `--next-id` command (the manual pair's id
+//     source). `new` mints the id itself, so any `--next-id` means the FO
+//     reached for the manual flow — this FAILS even when `new` also ran.
+func assertCodexFilingViaNew(jsonl, slug string) error {
+	var filedViaNew, sawNextID bool
+
+	for _, raw := range strings.Split(jsonl, "\n") {
+		raw = strings.TrimSpace(raw)
+		if raw == "" {
+			continue
+		}
+		var ev codexCommandItem
+		// Lines that are not JSON are skipped rather than treated as failures.
+		if err := json.Unmarshal([]byte(raw), &ev); err != nil {
+			continue
+		}
+		if ev.Item.Type != "command_execution" {
+			continue
+		}
+		// Each flag latches: once true it stays true for the rest of the stream.
+		cmd := ev.Item.Command
+		filedViaNew = filedViaNew || commandFilesViaNew(cmd, slug)
+		sawNextID = sawNextID || nextIDInvocation.MatchString(cmd)
+	}
+
+	// Half 1: the atomic-create call must be present.
+	if !filedViaNew {
+		return fmt.Errorf("the FO did not file the seed via a `spacedock … new %s` command — it never used the atomic-create path", slug)
+	}
+	// Half 2: no candidate-id preview may appear, even though `new` was seen.
+	if sawNextID {
+		return fmt.Errorf("the FO ran a `--next-id` filing command — it reached for the manual preview-then-write flow instead of the atomic `new` path")
+	}
+	return nil
+}
diff --git a/internal/ensigncycle/shared_fixtures_test.go b/internal/ensigncycle/shared_fixtures_test.go
@@ -293,3 +293,50 @@ func mergeHookGuardPrompt() string {
 		"Do not edit, archive, approve, force, set mod-block, or retry terminalization. Your final response must include the guard error mentioning merge hooks.",
 	)
 }
+
+// filingSlug is the slug the filing prompt asks the FO to file. The stream
+// assertions look for it in the `spacedock … new <slug>` command, and the
+// fixture expects the entity file to land on disk as `<filingSlug>.md`.
+const filingSlug = "wire-the-thing"
+
+// writeFilingWorkflow writes the fixture README into root, runs gitInit on it, and returns the path where the seed entity is expected to land.
+func writeFilingWorkflow(t *testing.T, root string) string {
+	t.Helper()
+	writeFile(t, filepath.Join(root, "README.md"), filingReadme())
+	gitInit(t, root)
+	// No entity is created here — the FO files the first seed during the run.
+	// `spacedock new <slug>` writes the flat `<slug>.md` form (the minted id goes
+	// INTO the frontmatter, not the filename); runners stat this path AFTER the run.
+	return filepath.Join(root, filingSlug+".md")
+}
+
+// filingReadme returns the fixture workflow README: frontmatter (sequential ids; backlog initial, done terminal) followed by the stage prose.
+func filingReadme() string {
+	return "---\ncommissioned-by: spacedock@1\n" +
+		"entity-type: task\n" +
+		"id-style: sequential\n" +
+		"stages:\n" +
+		"  defaults:\n" +
+		"    worktree: false\n" +
+		"    concurrency: 1\n" +
+		"  states:\n" +
+		"    - name: backlog\n" +
+		"      initial: true\n" +
+		"    - name: done\n" +
+		"      terminal: true\n" +
+		"---\n" +
+		"# Filing Fixture\n\n" +
+		"This fixture starts EMPTY: there are no entities yet. The first officer is asked to file one seed task. The id-style is `sequential`, so the manual flow (`status --next-id` then hand-writing the file) is available — the scenario proves the FO instead uses the atomic-create path.\n\n" +
+		"### backlog\n\nSeed tasks land here.\n\n- **Outputs:** A filed seed entity.\n\n" +
+		"### done\n\nTerminal state.\n"
+}
+
+// filingPrompt returns the operator prompt for the filing scenario: file one seed named filingSlug in backlog, dispatch nothing, and confirm the filing.
+func filingPrompt() string {
+	return fmt.Sprintf("%s\n\n%s\n%s\n%s\n%s",
+		"Use $spacedock:first-officer for this whole run.", "Workflow directory: .",
+		"This workflow is empty. File one new seed task with the slug `"+filingSlug+"` and the title `Wire The Thing`, landing it in the initial backlog stage with a one-line description body.",
+		"File it using the blessed atomic-create path your contract teaches, not by hand-assembling frontmatter after a candidate-id preview.",
+		"Do not dispatch any workers and do not advance the entity past backlog. Your final response must confirm the seed task was filed.",
+	)
+}