Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions skills/integration/survey_probe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,83 @@ func TestSurveyInstallProbe(t *testing.T) {
}
})
}

// extractScaffoldIncumbentFn reads skills/survey/SKILL.md (resolved under repoRoot) and
// returns the text of the `spacedock_incumbent()` bash function found there, so the test
// EXECUTES the shipped function rather than a copy and the two cannot drift.
//
// Extraction starts at the first line containing `spacedock_incumbent() {` and stops at the
// first subsequent line that, with trailing spaces removed, equals the closing-brace marker.
// The test is failed when the file cannot be read, when no opening line exists (e.g. the
// file-probe was removed), or when the opening line exists but the closing brace is never
// found — without that last check an unterminated block would silently return everything
// up to the end of the file as if it were the function.
func extractScaffoldIncumbentFn(t *testing.T) string {
	t.Helper()
	path := filepath.Join(repoRoot(t), "skills", "survey", "SKILL.md")
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("read SKILL.md %s: %v", path, err)
	}

	var body []string
	inFn, closed := false, false
	for _, line := range strings.Split(string(data), "\n") {
		if !inFn {
			if !strings.Contains(line, "spacedock_incumbent() {") {
				continue
			}
			inFn = true
		}
		body = append(body, line)
		// The function's closing brace, matched on its exact indentation so nested braces
		// at other indents do not end the extraction early.
		// NOTE(review): the original comment called this a two-space-indented brace; confirm
		// the literal's leading whitespace matches the indentation used in SKILL.md.
		if strings.TrimRight(line, " ") == " }" {
			closed = true
			break
		}
	}

	if len(body) == 0 {
		t.Fatalf("expected a runnable spacedock_incumbent() function in SKILL.md step-3 scaffold probe, found none")
	}
	if !closed {
		t.Fatalf("spacedock_incumbent() in SKILL.md %s has no closing brace line; extraction ran to end of file", path)
	}
	return strings.Join(body, "\n")
}

// runScaffoldProbe runs the bash function definition fn followed by a call to
// spacedock_incumbent, with the working directory set to dir, and returns the command's
// stdout with trailing newlines removed. The probe inspects the filesystem under cwd, so
// dir IS the fixture condition.
//
// Only stdout is returned: stderr is captured separately so that anything bash or the probe
// prints there cannot leak into the value the caller compares against. Stderr is still
// reported in the failure message when the command exits with an error. The test is skipped
// when bash is not on PATH and failed when the command returns an error.
func runScaffoldProbe(t *testing.T, fn, dir string) string {
	t.Helper()
	bash, err := exec.LookPath("bash")
	if err != nil {
		t.Skip("bash not on PATH; the scaffold probe is a bash function")
	}
	cmd := exec.Command(bash, "-c", fn+"\nspacedock_incumbent")
	cmd.Dir = dir
	// Keep stderr out of the returned value but available for diagnostics.
	var stderr strings.Builder
	cmd.Stderr = &stderr
	out, err := cmd.Output()
	if err != nil {
		t.Fatalf("run scaffold probe in %s: %v\nstdout:\n%s\nstderr:\n%s", dir, err, out, stderr.String())
	}
	return strings.TrimRight(string(out), "\n")
}

// TestSurveyScaffoldIncumbentProbe (cycle-1 B) is the behavior test for the
// spacedock-incumbent file-probe. It extracts the `spacedock_incumbent` function from
// SKILL.md once and runs it with cwd set to each directory of a committed fixture pair under
// testdata/scaffold:
//
//   - spacedock-on-disk: a repo with a spacedock workflow on disk; the probe must echo
//     "spacedock".
//   - survey-self-only: a repo with no workflow on disk (the kind a `spacedock:survey`
//     self-call leaves); the probe must echo nothing — the false-positive guard.
//
// The oracle is each fixture's on-disk state, never a grep of SKILL.md.
func TestSurveyScaffoldIncumbentProbe(t *testing.T) {
	probe := extractScaffoldIncumbentFn(t)
	fixtures := filepath.Join("testdata", "scaffold")

	cases := []struct {
		name    string // subtest name
		fixture string // directory under testdata/scaffold used as cwd
		want    string // exact trimmed probe output expected
		claim   string // failure-message text preceding the observed value
	}{
		{
			name:    "workflow-on-disk",
			fixture: "spacedock-on-disk",
			want:    "spacedock",
			claim:   "a repo with a spacedock workflow on disk must be named spacedock",
		},
		{
			name:    "survey-self-only",
			fixture: "survey-self-only",
			want:    "",
			claim:   "a survey-self-only repo with no workflow on disk must NOT be named spacedock",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := runScaffoldProbe(t, probe, filepath.Join(fixtures, tc.fixture))
			if got == tc.want {
				return
			}
			t.Errorf("%s, got %q", tc.claim, got)
		})
	}
}
161 changes: 142 additions & 19 deletions skills/integration/survey_queries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ func TestSurveyQuerySmoke(t *testing.T) {
for _, name := range []string{
"scoping", "codex-presence", "codex-scoped", "codex-workstreams", "codex-activity",
"scaffold-usage", "work-by-area", "decision-open", "mode-classification",
"dispatch-fact", "decision-no-followup",
} {
if _, ok := queries[name]; !ok {
t.Fatalf("recommended-SQL reference is missing the %q query (have: %v)", name, sortedQueryNames(queries))
Expand All @@ -161,8 +162,11 @@ func TestSurveyQuerySmoke(t *testing.T) {
// shares ONE `project` key; the cwd-prefix-union does the load-bearing work — it counts
// the in-repo Claude sessions (cwd AT root, subdir, worktree, plus the F/G worktree-shape
// + the mode-classification track sessions, all under the prefix) and EXCLUDES the
// blank-cwd session, the out-of-repo session, and ALL the codex rows. The fixture has 9
// in-repo Claude sessions: A,B,C + WT + issue-feed×2 + landing-copy×2 + mixed-bag.
// blank-cwd session, the out-of-repo session, the dispatched SUBAGENT sessions (file_path
// under %/subagents/%), and ALL the codex rows. The fixture has 17 in-repo non-subagent
// Claude sessions: A,B,C + WT + issue-feed×2 + landing-copy×2 + mixed-bag (9) + the two
// dispatch parents + two decision-no-followup + two knowledge-work `notes-ops` (6) + two
// knowledge-work `client-1on1s` (2, the cycle-2 second knowledge-work track).
t.Run("scoping", func(t *testing.T) {
rows := runQuery(t, db, queries["scoping"])
if len(rows) != 1 {
Expand All @@ -172,8 +176,8 @@ func TestSurveyQuerySmoke(t *testing.T) {
if len(fields) != 3 {
t.Fatalf("scoping row should have 3 fields (sessions|blank_cwd|span) — folded_keys is dropped, got: %q", rows[0])
}
if fields[0] != "9" {
t.Errorf("the cwd-prefix should count 9 in-repo Claude sessions, got sessions=%q", fields[0])
if fields[0] != "17" {
t.Errorf("the cwd-prefix should count 17 in-repo non-subagent Claude sessions, got sessions=%q", fields[0])
}
if fields[1] != "0" {
t.Errorf("the blank-cwd Claude session is outside the prefix and must not count, got blank_cwd=%q", fields[1])
Expand Down Expand Up @@ -299,15 +303,15 @@ func TestSurveyQuerySmoke(t *testing.T) {
})

// no-union (AC-2c): the added Codex rows must NOT inflate the Claude scope. The scoping
// query is asserted to 9 above (the Claude-only in-repo count), proving Codex stays out
// query is asserted to 17 above (the Claude-only in-repo count), proving Codex stays out
// of the Claude `sessions` count — a flagged presence, never a silent project union.
t.Run("codex-not-folded-into-scope", func(t *testing.T) {
rows := runQuery(t, db, queries["scoping"])
if len(rows) != 1 {
t.Fatalf("scoping should return one summary row, got %d: %v", len(rows), rows)
}
if sessions := strings.Split(rows[0], "|")[0]; sessions != "9" {
t.Errorf("the Codex rows must not be folded into the Claude scope; scoping.sessions should stay 9, got %q", sessions)
if sessions := strings.Split(rows[0], "|")[0]; sessions != "17" {
t.Errorf("the Codex rows must not be folded into the Claude scope; scoping.sessions should stay 17, got %q", sessions)
}
})

Expand Down Expand Up @@ -366,8 +370,10 @@ func TestSurveyQuerySmoke(t *testing.T) {
t.Errorf("a worktree edit must NOT bucket as `.worktrees` — the physical prefix must be stripped; got %v", edits)
}
// `.claude/worktrees/<wt>/internal/codex.go` strips to `internal` (the second worktree layout).
if edits["internal"] != "4" {
t.Errorf("internal should count 4 (build.go, parse.go, index.go, the .claude/worktrees-stripped codex.go), got %q in %v", edits["internal"], edits)
// internal counts 6: build.go, parse.go, index.go, the .claude/worktrees-stripped codex.go,
// plus the two decision-no-followup `internal/cache/` edits (warm.go, impl.go).
if edits["internal"] != "6" {
t.Errorf("internal should count 6 (build.go, parse.go, index.go, .claude/worktrees-stripped codex.go, + 2 decision-no-followup cache edits), got %q in %v", edits["internal"], edits)
}
// genuine config demotes to kind=config (still counted), NOT filtered.
for _, c := range []string{".claude", ".beads", "<external>"} {
Expand Down Expand Up @@ -399,14 +405,19 @@ func TestSurveyQuerySmoke(t *testing.T) {
}
})

// mode-classification (#324, G / AC-8a): classify each TRACK (keyed by git_branch) into a
// work MODE from the per-track signal tallies (veto density, gate-pass ratio, loop markers,
// edit-kind). The fixture carries a MECHANICAL track (issue-feed: gate-pass, worktree loop,
// code edits, no veto), an EXPLORATION track (landing-copy: vetoes, a rejected path, .md
// edits), and a NEITHER-DOMINANT track (mixed-bag → unlabeled). The labels DERIVE from the
// mode-classification (#324, G / AC-6#2 vocab + AC-5 archetype): classify each TRACK (keyed
// by git_branch) into a work MODE from the per-track signal tallies (veto density, gate-pass
// ratio, loop markers, edit-kind). The fixture carries a MANUAL track (issue-feed: gate-pass,
// worktree loop, code edits, no veto — the repetitive-but-substantive drive loop, the label
// renamed from `mechanical`), an EXPLORATION track (landing-copy: vetoes, a rejected path,
// .md edits), a KNOWLEDGE-WORK track (notes-ops: intake→process→file→log→close markers +
// content/ops `.md`+`.json` edits + a gate-pass batch confirm + zero veto + no issue→PR
// loop), and a NEITHER-DOMINANT track (mixed-bag → unlabeled). The labels DERIVE from the
// signal rows (the independent oracle), never from SKILL.md text. Non-vacuous: (i) swapping
// the mechanical track's rows to carry high vetoes + a rejected path + prose flips its label
// to exploration; (ii) the neither-dominant track stays unlabeled (no guessed automation).
// the manual track's rows to carry high vetoes + a rejected path + prose flips its label
// to exploration; (ii) the neither-dominant track stays unlabeled; (iii) stripping the
// knowledge-work loop markers drops notes-ops back to unlabeled (the kloop marker is the
// load-bearing gate of the knowledge-work score).
t.Run("mode-classification", func(t *testing.T) {
rows := runQuery(t, db, queries["mode-classification"])
mode := map[string]string{}
Expand All @@ -417,15 +428,31 @@ func TestSurveyQuerySmoke(t *testing.T) {
}
mode[f[0]] = f[1]
}
if mode["issue-feed"] != "mechanical" {
t.Errorf("the gate-pass/worktree-loop/code track should classify mechanical, got %q in %v", mode["issue-feed"], mode)
if mode["issue-feed"] != "manual" {
t.Errorf("the gate-pass/worktree-loop/code track should classify manual (the `mechanical`→`manual` rename), got %q in %v", mode["issue-feed"], mode)
}
if mode["landing-copy"] != "exploration" {
t.Errorf("the high-veto/rejected/prose track should classify exploration, got %q in %v", mode["landing-copy"], mode)
}
if mode["notes-ops"] != "knowledge-work" {
t.Errorf("the intake→process→file→log→close + content/ops-edits track should classify knowledge-work, got %q in %v", mode["notes-ops"], mode)
}
// cycle 2: a SECOND distinct knowledge-work track (client-1on1s) so the render names ≥2
// specific types. Both classify knowledge-work (the CLASS stays); the render qualifies
// each with its own workstream-derived type (notes-ops vs 1-1s & assessment).
if mode["client-1on1s"] != "knowledge-work" {
t.Errorf("the second knowledge-work track (people 1-1s & assessment) should classify knowledge-work, got %q in %v", mode["client-1on1s"], mode)
}
if mode["mixed-bag"] != "unlabeled" {
t.Errorf("a neither-dominant track must stay unlabeled (generic book-keeping, never a guessed automation pitch), got %q in %v", mode["mixed-bag"], mode)
}
// the `mechanical` label must NOT appear — it was renamed to `manual` everywhere the
// classifier emits it (reserved for genuinely-trivial tracks the classifier doesn't detect).
for track, m := range mode {
if m == "mechanical" {
t.Errorf("no track may classify `mechanical` after the rename; track %q is mechanical in %v", track, mode)
}
}
// non-vacuous (i): swap issue-feed's signals (high veto + rejected path + prose) → flips to exploration.
db2 := buildFixtureDB(t)
execSQLite(t, db2, `UPDATE messages SET content='[Request interrupted by user]' WHERE session_id='claude:91111111-1111-1111-1111-111111111111';`)
Expand All @@ -439,11 +466,23 @@ func TestSurveyQuerySmoke(t *testing.T) {
flipped[f[0]] = f[1]
}
if flipped["issue-feed"] != "exploration" {
t.Errorf("swapping the mechanical track's signals to the exploration signature must flip its label, got %q in %v", flipped["issue-feed"], flipped)
t.Errorf("swapping the manual track's signals to the exploration signature must flip its label, got %q in %v", flipped["issue-feed"], flipped)
}
if flipped["mixed-bag"] != "unlabeled" {
t.Errorf("the neither-dominant track must stay unlabeled under the signal swap, got %q in %v", flipped["mixed-bag"], flipped)
}
// non-vacuous (iii): strip the knowledge-work loop markers → notes-ops drops to unlabeled,
// proving the intake→process→file→log→close marker gates the knowledge-work score.
db3 := buildFixtureDB(t)
execSQLite(t, db3, `UPDATE messages SET content='ordinary work' WHERE session_id IN ('claude:c6111111-0000-0000-0000-000000000001','claude:c6222222-0000-0000-0000-000000000002');`)
stripped := map[string]string{}
for _, r := range runQuery(t, db3, queries["mode-classification"]) {
f := strings.Split(r, "|")
stripped[f[0]] = f[1]
}
if stripped["notes-ops"] != "unlabeled" {
t.Errorf("stripping the knowledge-work loop markers must drop notes-ops to unlabeled, got %q in %v", stripped["notes-ops"], stripped)
}
})

// decision-open (#320): the rejected AskUserQuestion is OPEN; the ExitPlanMode
Expand Down Expand Up @@ -476,6 +515,90 @@ func TestSurveyQuerySmoke(t *testing.T) {
t.Errorf("the OPEN frontier must sort first so the LIMIT cannot hide it, got leading row: %q", rows[0])
}
})

// dispatch-fact (#za, AC-3): count orchestration over the body's exact parent scope —
// a subagent (relationship_type='subagent') counts only when its PARENT is an in-repo,
// non-subagent Claude session. The fixture seeds two in-repo parents (P1→2 subagents,
// P2→1 ⇒ distinct=2, total=3) and one subagent of the OUT-of-repo session E (must NOT
// count). The expected 2|3 derives from the seeded rows. Non-vacuous: re-pointing the
// out-of-repo subagent's parent to an in-repo parent flips the counts 2|3→3|4, proving
// the parent-scope filter is load-bearing, not a constant.
t.Run("dispatch-fact", func(t *testing.T) {
rows := runQuery(t, db, queries["dispatch-fact"])
if len(rows) != 1 {
t.Fatalf("dispatch-fact should return one summary row, got %d: %v", len(rows), rows)
}
fields := strings.Split(rows[0], "|")
if len(fields) != 2 {
t.Fatalf("dispatch-fact row should have 2 fields (sessions_that_orchestrated|subagents_dispatched), got: %q", rows[0])
}
if fields[0] != "2" {
t.Errorf("dispatch-fact should count 2 distinct in-repo orchestrating parents (E's out-of-repo parent excluded), got %q", fields[0])
}
if fields[1] != "3" {
t.Errorf("dispatch-fact should count 3 in-repo-parented subagents (E's subagent excluded), got %q", fields[1])
}
// non-vacuous: re-point the out-of-repo subagent's parent to an in-repo parent → 2|3 becomes 3|4.
db2 := buildFixtureDB(t)
execSQLite(t, db2, `UPDATE sessions SET parent_session_id='claude:aaaaaaaa-1111-2222-3333-444444444444' WHERE id='claude:e4444444-0000-0000-0000-000000000004';`)
flipped := strings.Split(runQuery(t, db2, queries["dispatch-fact"])[0], "|")
if flipped[0] != "3" || flipped[1] != "4" {
t.Errorf("re-pointing the out-of-repo subagent to an in-repo parent must flip dispatch-fact 2|3→3|4 (parent-scope is load-bearing), got %s|%s", flipped[0], flipped[1])
}
})

// decision-no-followup (#9h, AC-1b): count `done` decisions with NO Edit/Write at a higher
// message ordinal in the same session. "Later" is the REAL chronological order via
// tool_calls.message_id → messages.id → messages.ordinal — NOT tool_calls.id insertion
// order. The fixture seeds NF1 (a done decision at ordinal 2 with an Edit at ordinal 1
// BEFORE it → counts) and NF2 (a done decision at ordinal 2 with a Write at ordinal 3
// AFTER it → does not). The existing decision tool_calls carry NULL message_id so they do
// not join — the oracle is exactly 1. Non-vacuous: inserting an Edit at a HIGHER ordinal
// than NF1's decision decrements the count 1→0, proving the ordinal compare (not insertion
// order: NF1's qualifying Edit has the LOWER tool_calls.id) is load-bearing.
t.Run("decision-no-followup", func(t *testing.T) {
rows := runQuery(t, db, queries["decision-no-followup"])
if len(rows) != 1 {
t.Fatalf("decision-no-followup should return one count row, got %d: %v", len(rows), rows)
}
if rows[0] != "1" {
t.Errorf("decision-no-followup should count 1 (NF1 has no later edit; NF2 does), got %q", rows[0])
}
// non-vacuous: insert an Edit at ordinal 3 in NF1 (higher than its decision's ordinal 2)
// → the count drops 1→0. The Edit's tool_calls.id is the highest, so a wrong insertion-order
// join would behave differently — proving the message_id→ordinal chronological join is load-bearing.
db2 := buildFixtureDB(t)
execSQLite(t, db2, `INSERT INTO messages (id,session_id,ordinal,role,content) VALUES (199,'claude:f0111111-0000-0000-0000-000000000001',3,'assistant','late edit');`)
execSQLite(t, db2, `INSERT INTO tool_calls (id,session_id,message_id,tool_name,input_json) VALUES (199,'claude:f0111111-0000-0000-0000-000000000001','199','Edit','{"file_path":"/repo/proj/internal/cache/late.go"}');`)
flipped := runQuery(t, db2, queries["decision-no-followup"])
if flipped[0] != "0" {
t.Errorf("inserting an Edit at a higher ordinal than NF1's decision must drop decision-no-followup 1→0 (chronological join is load-bearing), got %q", flipped[0])
}
})

// codex-workstreams all-unlabeled collapse (#h5, AC-4a query side): when every Codex-scoped
// session is unclassifiable, codex-workstreams returns ONLY `(unlabeled)` rows — the shape
// the render then collapses to a single honest "N sessions, unclassified" line instead of a
// (unlabeled)-only breakdown. The BASE fixture has ≥2 NAMED clusters (the conditional
// contrast: the render KEEPS the breakdown), already asserted in the codex-workstreams
// sub-test above. Here we mutate the named first_messages to encouragement/meta and assert
// the query collapses to a single all-(unlabeled) row — proving the collapse is conditional
// on the all-unlabeled shape, not a blanket removal.
t.Run("codex-workstreams-all-unlabeled", func(t *testing.T) {
db2 := buildFixtureDB(t)
execSQLite(t, db2, `UPDATE sessions SET first_message='You totally got this. Keep going, friend.' WHERE id IN ('codex:ffffffff-8888-9999-aaaa-bbbbbbbbbbbb','codex:f2f2f2f2-0000-1111-2222-333333333333','codex:f4f4f4f4-8888-9999-aaaa-bbbbbbbbbbbb');`)
rows := runQuery(t, db2, queries["codex-workstreams"])
if len(rows) != 1 {
t.Fatalf("an all-unlabeled Codex set must collapse to ONE codex-workstreams row, got %d: %v", len(rows), rows)
}
f := strings.Split(rows[0], "|")
if f[0] != "(unlabeled)" {
t.Errorf("the single collapsed row must be (unlabeled), got %q", f[0])
}
if f[1] != "4" {
t.Errorf("all 4 codex-scoped sessions should cluster into the single (unlabeled) row, got count %q", f[1])
}
})
}

// sortedQueryNames returns the labeled query names for a diagnostic message.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
id: sampleentity
title: A sample entity
status: ideation
---

Body.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
commissioned-by: spacedock
stages:
- ideation
- implementation
- validation
---

# Dev workflow

A commissioned spacedock workflow.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Some project

An ordinary repo. Someone ran `spacedock:survey` here once, but never commissioned a workflow.
Loading
Loading