diff --git a/.claude/hooks/safety-guardrails.py b/.claude/hooks/safety-guardrails.py
index 04fd888..48e671c 100755
--- a/.claude/hooks/safety-guardrails.py
+++ b/.claude/hooks/safety-guardrails.py
@@ -38,7 +38,12 @@
# Dangerous bash command patterns
_DEFAULT_DANGEROUS_COMMANDS = [
- r"rm\s+-rf\s+/", # rm -rf /
+ # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+ # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/<x>,
+ # /private/tmp/<x>, /var/folders/<x>, /var/tmp/<x>) — legitimate
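+ # scratch cleanup. The negative lookahead requires a trailing slash, so the
+ # bare temp root (`rm -rf /tmp`) stays blocked. NOTE: the lookahead only
+ # inspects the path prefix, so `rm -rf /tmp/` (trailing slash, no child) and
+ # traversal such as `rm -rf /tmp/../etc` are NOT blocked by this pattern.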
+ r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)", # rm -rf / (non-temp)
r"rm\s+-rf\s+\*", # rm -rf *
r"rm\s+-rf\s+\.\.", # rm -rf ..
r"git\s+push.*--force.*main",
diff --git a/.claude/rules/learned/architecture-patterns.md b/.claude/rules/learned/architecture-patterns.md
index d55654f..d1d8bc4 100644
--- a/.claude/rules/learned/architecture-patterns.md
+++ b/.claude/rules/learned/architecture-patterns.md
@@ -161,3 +161,19 @@
# CORRECT: templates_src is fence-free; copier injects exactly once:
wrapped = f"# map:start\n{rendered}\n# map:end\n" if fenced else rendered
```
+
+- **Spike-First Gating: High-Risk Binding Decisions Require a Docs-Only Artifact Before Implementation** (2026-06-04): When a subtask's answer would bind downstream implementation (which channel carries a value, which API call is idempotent, what schema a subprocess emits), run it FIRST as a docs-only spike that writes an artifact naming the empirical answer + the binding strategy, and commits ZERO production code. Downstream subtasks reference the artifact by name and consume it, not assumptions. A wrong assumption that is not spiked propagates into every component built on it and forces a rewrite cascade. In this workflow a research-agent wrongly claimed skill-activation wasn't recoverable from `claude -p`; the ST-001 spike empirically corrected it before any dispatcher code existed. The spike artifact MUST contain a named "binding strategy" section, not just findings (Monitor hard-stopped once for a missing strategy section). [workflow: map-efficient]
+
+- **Producer-Owns-Parse: The Component That Owns the Subprocess Owns All Derived Fields; Consumers Read the Typed Result** (2026-06-04): When component A launches a subprocess (or owns a raw source) and component B consumes the result, ALL parsing/derivation (transcript reads, field extraction, signal combination) lives in A; B reads only the typed result struct and never re-implements parsing. Two payoffs: (1) a single parse site that a Mock producer can supply directly, so consumer tests need no subprocess/transcript fixture; (2) when the raw output schema changes, only A changes. Putting any parse in B re-couples the modules through the raw format. Extends "Contract-First Inter-Component JSON Schemas": the contract is A's typed struct, and the parse-to-struct boundary is A's responsibility exclusively. [workflow: map-efficient]
+ ```python
+ # WRONG — runner re-parses a transcript it does not own (couples to raw format)
+ result = dispatcher.dispatch(cell) # raw proc output
+ skill = extract_skill_from_transcript(read_jsonl(result.session_id))
+
+ # CORRECT — dispatcher parses once into a typed field; runner just reads it
+ @dataclass
+ class DispatchResult:
+ triggered_skill: str | None # parsed by dispatcher, NOT by runner
+ token_usage: TokenUsage | None
+ # tests inject MockDispatcher(triggered_skill="map-plan") — no subprocess needed
+ ```
diff --git a/.claude/rules/learned/implementation-patterns.md b/.claude/rules/learned/implementation-patterns.md
index 657b145..626da22 100644
--- a/.claude/rules/learned/implementation-patterns.md
+++ b/.claude/rules/learned/implementation-patterns.md
@@ -128,3 +128,42 @@ paths:
dest.chmod(dest.stat().st_mode | 0o755)
# test guard: assert os.access(installed_hook, os.X_OK)
```
+
+- **`claude -p` Output Has Two Channels: Envelope for Tokens, Transcript JSONL for Skill Name** (2026-06-04): When shelling `claude -p --output-format json` as a subprocess, two distinct output channels carry different information — do not confuse them. The JSON result envelope (stdout) carries `.result` (response text), `.usage` (input/output/cache tokens), and `.session_id`. The name of the skill/slash-surface that actually fired is NOT in the envelope — it is only in Claude Code's native transcript JSONL (located by session_id) as a `tool_use` block with `name=="Skill"` and `input.skill`. Deriving this from the framework's own scratch/digest schema rather than the native transcript yields a wrong claim. Verify empirically by reading the real transcript after a spike call; never infer from internal schema files. [workflow: map-efficient]
+ ```python
+ env = json.loads(proc.stdout) # .result, .usage, .session_id
+ tokens = env["usage"] # CORRECT — tokens are in the envelope
+ # env.get("skill") -> None # WRONG — fired-skill is NOT in the envelope
+ for line in transcript_jsonl(env["session_id"]).read_text().splitlines():
+ m = json.loads(line)
+ if m.get("type") == "tool_use" and m.get("name") == "Skill":
+ triggered = m["input"]["skill"]; break
+ ```
+
+- **Scoped Config-Flag Mutation: Seed a Throwaway Temp Copy; Never Modify the Production Source of Truth** (2026-06-04): When a tool/test needs a shipped config flag to behave differently from its production default (e.g. stripping `disable-model-invocation: true` so an eval can auto-select skills), mutate the flag ONLY in a throwaway temp dir seeded with a copy of the production config, discarded after the subprocess exits. Never patch the source repo or `templates_src`. A blanket production flip is a footgun: it silently changes behavior for every other user of the flag and may be committed accidentally. Scope of mutation must match scope of need: one subprocess call → one throwaway dir, always cleaned up in `finally`. [workflow: map-efficient]
+ ```python
+ tmp = Path(tempfile.mkdtemp())
+ shutil.copytree(REPO / ".claude", tmp / ".claude") # seed from production
+ strip_flag(tmp / ".claude" / "skills") # mutate throwaway ONLY
+ try:
+ subprocess.run(["claude", "-p", prompt, "--output-format", "json"], cwd=tmp)
+ finally:
+ shutil.rmtree(tmp) # production never touched
+ ```
+
+- **Clock-Free Core with Caller-Supplied Path: Inject Timestamps at the CLI Boundary, Not Inside the Worker** (2026-06-04): When a worker writes durable output (a timestamped JSONL, a run artifact), do NOT call `datetime.now()` inside the worker. Have the CLI/outermost caller generate the timestamped path and pass it as an explicit `out_path: Path` the worker treats as opaque. Benefits: (1) tests pass `tmp_path / "results.jsonl"` with zero clock monkeypatching; (2) the worker is deterministic given the same inputs+path; (3) resume keys on the path the CLI owns. Refines "Long-Running Operations Need Durable State by Default" by fixing WHERE path/timestamp generation lives — at the boundary, not the core. [workflow: map-efficient]
+ ```python
+ # CORRECT: worker takes out_path; CLI owns the timestamp
+ def run_eval(*, entries, dispatcher, runs, out_path: Path, resume=False) -> list: ...
+ # CLI: out = default_run_path(root, skill, datetime.now(tz).strftime("%Y%m%dT%H%M%SZ"))
+ # Test: run_eval(..., out_path=tmp_path / "r.jsonl") # no time mocking
+ ```
+
+- **Concurrent Durable Append: threading.Lock for Line Integrity + Stable cell_id Resume Key** (2026-06-04): When parallel workers append JSONL lines to a shared durable file, two invariants must BOTH hold: (1) no interleaved partial lines — guard each `f.write(line + "\n")` with a threading.Lock; (2) resume is idempotent regardless of write order — key on a stable id present in every record (cell_id), never on line number/position. Nondeterministic write order is fine as long as resume dedups by id. Each worker subprocess also runs in its own temp cwd so concurrent subprocesses never share a working dir. Complements "Long-Running Operations Need Durable State" (process-restart durability) with within-process concurrency safety. [workflow: map-efficient]
+ ```python
+ with self._lock: # atomic per-line append
+ with out_path.open("a", encoding="utf-8") as f:
+ f.write(json.dumps(record) + "\n")
+ done = {json.loads(l)["cell_id"] for l in out_path.read_text().splitlines() if l.strip()}
+ pending = [c for c in cells if make_cell_id(...) not in done] # order-independent resume
+ ```
diff --git a/.claude/rules/learned/testing-strategies.md b/.claude/rules/learned/testing-strategies.md
index 3d327b4..83a91ba 100644
--- a/.claude/rules/learned/testing-strategies.md
+++ b/.claude/rules/learned/testing-strategies.md
@@ -139,3 +139,12 @@ paths:
# 3. git restore -> confirm GREEN
# 4. commit file + test together
```
+
+- **Blueprint-Named Test Functions Are a Monitor Contract: Author Them in the Same Subtask as the Code** (2026-06-04): When a subtask blueprint's `test_strategy` names specific pytest function names (e.g. `test_vc3_resume_skips_present_cell_ids`), Monitor treats those names as a HARD completeness contract: a subtask whose logic is correct but whose blueprint-named functions do not yet exist gets `valid=false` (hard stop). The completeness unit is code + named-test-functions-together, not code alone — the blueprint author chose the names to specify observable behavior, so an absent name means the behavior is unverified. Never stub a named test with `pass`/`# TODO` and call the subtask done; the stub satisfies the import but not the contract. In this workflow ST-005's runner code was correct but Monitor hard-stopped until the four named VC tests were authored with real assertions. [workflow: map-efficient]
+
+- **Final Verification Must Check Shipped Docs Against Actual Behavior, Then Grep for the Same Drift Class** (2026-06-04): After code+tests are green, a dedicated final-verification pass must validate that user-facing docs (SKILL.md, README, CLI `--help`) match actual behavior: default values, accepted schema formats, flag names, output field names. Prose drift is invisible to pytest/ruff/mypy. When the first drift instance is found, immediately grep the WHOLE doc for the same class of claim (every `--flag default`, every schema example, every accepted file-format mention) before moving on — drift clusters because the doc was written once from a design doc, not from running code. Here the final-verifier caught a `--max-concurrency` default of 4 (actual 1); grepping the same file then surfaced a fictional YAML eval-set schema block + `.yaml` examples that the JSON-only loader could never parse. [workflow: map-efficient]
+ ```bash
+ # one drift found -> grep the whole doc for the drift class before marking done
+ mapify skill-eval --help | grep -i max-concurrency # actual default
+ grep -nE 'default|yaml|schema|--[a-z-]+' docs/SKILL.md # reconcile every claim
+ ```
diff --git a/.claude/skills/map-efficient/SKILL.md b/.claude/skills/map-efficient/SKILL.md
index b986b52..2045905 100644
--- a/.claude/skills/map-efficient/SKILL.md
+++ b/.claude/skills/map-efficient/SKILL.md
@@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
Snapshots pre-existing failures so later subtasks distinguish
"introduced regression" from "was broken pre-plan". Auto-detects
-Make/pytest/go test/cargo. Overrides + narrow-target guidance:
-[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
+Make/pytest/go test/cargo. It captures the test run internally and prints a
+single compact JSON report at the end — read that JSON directly; do NOT pipe it
+through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target
+guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
### Wave Computation (after INIT_STATE) - REQUIRED
diff --git a/.claude/skills/map-efficient/efficient-reference.md b/.claude/skills/map-efficient/efficient-reference.md
index 802d11c..6734cfc 100644
--- a/.claude/skills/map-efficient/efficient-reference.md
+++ b/.claude/skills/map-efficient/efficient-reference.md
@@ -203,6 +203,11 @@ fix or defer.
python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
```
+It captures the test run internally and prints a single compact JSON report at
+the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the
+repo bash guidelines); the output is one small object, not a stream, so
+truncating it only hides fields.
+
Auto-detects from project markers:
- `Makefile` with `test:` target → `make test`
- `pyproject.toml` / `pytest.ini` → `pytest`
diff --git a/.claude/skills/map-skill-eval/SKILL.md b/.claude/skills/map-skill-eval/SKILL.md
new file mode 100644
index 0000000..567ac04
--- /dev/null
+++ b/.claude/skills/map-skill-eval/SKILL.md
@@ -0,0 +1,94 @@
+---
+name: map-skill-eval
+description: |
+ Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient.
+effort: medium
+disable-model-invocation: true
+argument-hint: "[skill] [--eval-set PATH]"
+---
+# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation
+
+Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill.
+
+Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`.
+
+## Invocation
+
+```bash
+mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N]
+```
+
+- `<skill>` — the skill name to evaluate (e.g. `map-plan`).
+- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions.
+- `--dry-run` — validate the eval-set and print the planned run count without spending any quota.
+- `--resume` — continue an interrupted run from the last durable checkpoint.
+- `--max-concurrency N` — max parallel `claude -p` workers (default: 1).
+
+## What It Does
+
+1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases.
+2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger).
+3. **Deterministic assertions** — each eval case may specify one or more assertion types:
+ - `contains` / `not_contains` — substring presence in the response.
+ - `regex` — pattern match against the response.
+ - `valid_json` — response parses as JSON.
+ - `trigger` / `not_trigger` — skill fired / did not fire.
+4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`.
+5. **Summary report** — after all cases complete, prints pass-rate (passed/total), token mean/stddev (over the runs that reported usage), duration mean/stddev, and the path to the run-log artifact.
+
+## Eval-Set Format
+
+A JSON object with an `entries` array. Each entry has a `prompt`, optional
+`should_trigger` / `should_not_trigger` skill names (the runner turns these into
+`trigger` / `not_trigger` assertions), and an optional `assertions` array.
+Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`,
+`not_trigger`.
+
+```json
+{
+ "entries": [
+ {
+ "prompt": "Decompose this feature into subtasks",
+ "should_trigger": "map-plan",
+ "assertions": [
+ { "type": "contains", "value": "subtask" }
+ ]
+ },
+ {
+ "prompt": "Run quality gates",
+ "should_not_trigger": "map-plan",
+ "assertions": []
+ }
+ ]
+}
+```
+
+## --dry-run
+
+`--dry-run` validates the eval-set and prints the planned invocation count; it spends 0 quota and does not require the `claude` CLI. No `claude -p` calls are made; no `.jsonl` is written.
+
+## Examples
+
+```bash
+# Validate eval-set without spending quota
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run
+
+# Run full eval with up to 8 parallel workers
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8
+
+# Resume an interrupted run
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume
+```
+
+## Troubleshooting
+
+- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill.
+- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with a non-empty `entries` array, that each entry has a `prompt`, and that every item in an entry's optional `assertions` array has a valid `type`.
+- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl`. If no prior run exists, a fresh run log is started automatically, so the flag is safe to pass on a first run.
+- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd.
+
+## Related Commands
+
+- `/map-plan` — plan and decompose tasks.
+- `/map-efficient` — full MAP workflow execution.
+- `/map-check` — run quality gates and verify MAP workflow completion.
diff --git a/.claude/skills/skill-rules.json b/.claude/skills/skill-rules.json
index bbe32ab..d5a9606 100644
--- a/.claude/skills/skill-rules.json
+++ b/.claude/skills/skill-rules.json
@@ -239,6 +239,18 @@
]
}
},
+ "map-skill-eval": {
+ "type": "manual",
+ "skillClass": "task",
+ "enforcement": "manual",
+ "priority": "medium",
+ "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).",
+ "requires-cmd": ["claude"],
+ "promptTriggers": {
+ "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"],
+ "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"]
+ }
+ },
"map-task": {
"type": "manual",
"skillClass": "task",
diff --git a/CLAUDE.md b/CLAUDE.md
index 46dd045..7b5db00 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -65,6 +65,7 @@ Validation:
- "Not in the CI gate" is NOT a valid reason to skip. The error is real if any tool reported it.
- "Static-analysis noise" is NOT a category. Either the type system is correct and the code is wrong, or the annotation needs fixing — pick one and fix it.
- Only legitimate skip: the user explicitly approves deferral in the current conversation. Document the deferral in writing.
+- **Any error encountered while operating the MAP Framework itself must be fixed immediately, in the same change.** This covers the framework's own runtime — a hook that crashes or false-positives, a `.map/scripts/` runner or gate that errors or mis-reports, a `mapify` CLI traceback, a render/validator/blueprint failure, a broken `Task`/agent dispatch. When you hit one mid-task: STOP, find the root cause, and fix it before continuing the original work. Do NOT work around it, do NOT defer it as "unrelated", do NOT note-and-move-on past a broken tool. If the fix is genuinely out of scope or risky, stop and ask the user — never silently continue past a malfunctioning framework component. (Errors raised by an external plugin/hook NOT shipped by this repo are out of scope here; say so and route them to the user.)
## Bash Command Guidelines
diff --git a/src/mapify_cli/__init__.py b/src/mapify_cli/__init__.py
index de885c2..cf89218 100644
--- a/src/mapify_cli/__init__.py
+++ b/src/mapify_cli/__init__.py
@@ -140,6 +140,12 @@ def create_ssl_context():
app.add_typer(validate_app, name="validate")
+# Typer sub-application that groups the `mapify skill-eval ...` commands.
+skill_eval_app = typer.Typer(
+    help="Evaluate a skill's trigger accuracy + cost",
+    name="skill-eval",
+)
+
+# Mount the sub-application on the root CLI under the `skill-eval` command name.
+app.add_typer(skill_eval_app, name="skill-eval")
+
def version_callback(value: bool):
"""Callback to show version and exit."""
@@ -1361,6 +1367,127 @@ def upgrade():
)
+# Skill-eval commands
+
+
+@skill_eval_app.command("run")
+def skill_eval_run(
+    skill: str = typer.Argument(..., help="Skill under test, e.g. map-debug"),
+    eval_set: Optional[Path] = typer.Option(
+        None, "--eval-set", help="Path to eval-set JSON"
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Validate eval-set + print planned count; spend nothing"
+    ),
+    resume: bool = typer.Option(
+        False, "--resume", help="Resume a partial run, skipping completed cells"
+    ),
+    max_concurrency: int = typer.Option(
+        1, "--max-concurrency", min=1, help="Bounded parallel dispatch (default 1)"
+    ),
+) -> None:
+    """Run a skill evaluation matrix.
+
+    Exit codes:
+    0 - Success (or dry-run completed)
+    1 - Runtime error (claude not found, or unexpected failure)
+    2 - Validation error (missing --eval-set or malformed eval-set file)
+    """
+    # NOTE: the docstring above is surfaced by Typer as the command's --help
+    # text, so implementation notes live in comments rather than in it.
+
+    # Intent: lazy import to keep top-level import time low and avoid import cycles.
+    import json
+    from datetime import timezone
+
+    import mapify_cli.skills_eval.runner as _runner
+    import mapify_cli.skills_eval.aggregator as _aggregator
+    from mapify_cli.skills_eval.dispatcher import ClaudeSubprocessDispatcher
+    from mapify_cli.skills_eval.eval_schema import EvalResultRecord
+
+    # SC-2: --eval-set is required.
+    if eval_set is None:
+        console.print(
+            "[bold red]Error:[/bold red] provide --eval-set PATH"
+        )
+        raise typer.Exit(2)
+
+    # SC-2: load and validate the eval-set; malformed/empty → Exit(2), NO invocations.
+    try:
+        entries = _runner.load_eval_set(eval_set)
+    except ValueError as exc:
+        console.print(f"[bold red]Error:[/bold red] {exc}")
+        # Chain the cause so the validation error stays attached to the exit.
+        raise typer.Exit(2) from exc
+
+    # Dry-run path: zero quota, NO dispatcher construction, NO claude required.
+    if dry_run:
+        # D10: variant_id fixed = 1, runs = 1.
+        planned = len(entries) * 1 * 1
+        console.print(
+            f"[bold]Dry-run:[/bold] planned [cyan]{planned}[/cyan] invocation(s) "
+            f"for skill [bold]{skill}[/bold] — spends 0 quota"
+        )
+        raise typer.Exit(0)
+
+    # HC-6: require claude BEFORE any invocation.
+    if shutil.which("claude") is None:
+        console.print(
+            "[bold red]Error:[/bold red] requires-cmd: claude — "
+            "install the claude CLI and ensure it is on PATH"
+        )
+        raise typer.Exit(1)
+
+    # Resolve output path: reuse the latest run log when resuming, otherwise
+    # (or when no prior run exists) mint a fresh timestamped path here at the
+    # CLI boundary so the worker never touches the clock.
+    root = Path.cwd()
+    out_path = _runner.latest_run_path(root, skill) if resume else None
+    if out_path is None:
+        stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+        out_path = _runner.default_run_path(root, skill, stamp)
+
+    # Run the evaluation matrix.
+    disp = ClaudeSubprocessDispatcher()
+    _aggregator.bounded_run(
+        skill=skill,
+        entries=entries,
+        dispatcher=disp,
+        runs=1,
+        out_path=out_path,
+        resume=resume,
+        max_concurrency=max_concurrency,
+    )
+
+    # Read all records from the output file, aggregate, and print summary.
+    # Unreadable lines are still skipped (best effort), but they are counted
+    # and reported so a damaged log cannot silently shrink the totals.
+    records: List[EvalResultRecord] = []
+    skipped_lines = 0
+    if out_path.exists():
+        for raw_line in out_path.read_text(encoding="utf-8").splitlines():
+            line = raw_line.strip()
+            if not line:
+                continue
+            try:
+                records.append(EvalResultRecord.from_dict(json.loads(line)))
+            except (ValueError, KeyError):
+                skipped_lines += 1
+    if skipped_lines:
+        console.print(
+            f"[yellow]Warning:[/yellow] skipped {skipped_lines} unreadable "
+            f"line(s) in {out_path}"
+        )
+
+    summary = _aggregator.aggregate(records)
+    console.print(
+        f"\n[bold]Eval complete:[/bold] skill=[bold]{skill}[/bold] "
+        f"pass_rate=[cyan]{summary.pass_rate:.1%}[/cyan] "
+        f"({summary.passed_cells}/{summary.total_cells} cells passed)"
+    )
+    if summary.tokens_mean is not None:
+        console.print(
+            f" tokens mean={summary.tokens_mean:.1f} "
+            f"stddev={summary.tokens_stddev or 0.0:.1f} "
+            f"(n={summary.token_sample_size})"
+        )
+    if summary.duration_mean is not None:
+        console.print(
+            f" duration mean={summary.duration_mean:.2f}s "
+            f"stddev={summary.duration_stddev or 0.0:.2f}s"
+        )
+    console.print(f" artifact: [cyan]{out_path}[/cyan]")
+
+
# Validate commands
diff --git a/src/mapify_cli/skills_eval/__init__.py b/src/mapify_cli/skills_eval/__init__.py
new file mode 100644
index 0000000..df7042f
--- /dev/null
+++ b/src/mapify_cli/skills_eval/__init__.py
@@ -0,0 +1,57 @@
+"""skills_eval — skill trigger evaluation data contracts and dispatchers.
+
+Exports the shared types used by every eval component (dispatcher, assertions,
+runner, aggregator) and the concrete dispatcher implementations.
+"""
+
+from __future__ import annotations
+
+from mapify_cli.skills_eval.assertions import (
+ AssertionResult,
+ run_assertion,
+ run_assertions,
+)
+from mapify_cli.skills_eval.dispatcher import (
+ ClaudeSubprocessDispatcher,
+ MockDispatcher,
+ VariantDispatcher,
+)
+from mapify_cli.skills_eval.eval_schema import (
+ DispatchResult,
+ EvalResultRecord,
+ EvalSetEntry,
+ make_cell_id,
+)
+from mapify_cli.skills_eval.runner import (
+ default_run_path,
+ evaluate_cell,
+ latest_run_path,
+ load_eval_set,
+ run_eval,
+)
+from mapify_cli.skills_eval.aggregator import (
+ AggregateSummary,
+ aggregate,
+ bounded_run,
+)
+
+__all__ = [
+ "AggregateSummary",
+ "AssertionResult",
+ "ClaudeSubprocessDispatcher",
+ "DispatchResult",
+ "EvalResultRecord",
+ "EvalSetEntry",
+ "MockDispatcher",
+ "VariantDispatcher",
+ "aggregate",
+ "bounded_run",
+ "default_run_path",
+ "evaluate_cell",
+ "latest_run_path",
+ "load_eval_set",
+ "make_cell_id",
+ "run_assertion",
+ "run_assertions",
+ "run_eval",
+]
diff --git a/src/mapify_cli/skills_eval/aggregator.py b/src/mapify_cli/skills_eval/aggregator.py
new file mode 100644
index 0000000..1e55e27
--- /dev/null
+++ b/src/mapify_cli/skills_eval/aggregator.py
@@ -0,0 +1,300 @@
+"""Aggregation and bounded-concurrency runner for skills_eval.
+
+Public API:
+- ``AggregateSummary`` -- frozen dataclass summarising a completed eval run.
+- ``aggregate(records)`` -- compute summary stats from a list of EvalResultRecord.
+- ``bounded_run(...)`` -- parallel cell dispatch with serialised durable writes.
+
+Design invariants respected:
+- INV-3: no ``import anthropic``, no ANTHROPIC_API_KEY access.
+- INV-5: ClaudeSubprocessDispatcher isolation is automatic (each dispatch creates
+ its own mkdtemp cwd); no extra isolation code is needed here.
+- VC1: pass_rate = passed_cells / total_cells (0.0 when total==0, never divide-by-zero).
+- VC2: token mean/stddev use statistics.mean/stdev; n<2 → stddev 0.0; n==0 → None.
+- VC3: bounded_run serialises writes under a threading.Lock (no .jsonl corruption).
+- VC4: aggregate never raises on empty list or all-null token_usage records.
+- SC-1: max_concurrency controls ThreadPoolExecutor workers; default 1 (sequential).
+"""
+
+from __future__ import annotations
+
+import concurrent.futures
+import dataclasses
+import logging
+import statistics
+import threading
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, TypeAlias
+
+from mapify_cli.skills_eval.eval_schema import EvalResultRecord
+from mapify_cli.skills_eval.eval_schema import EvalSetEntry
+from mapify_cli.skills_eval.dispatcher import VariantDispatcher
+from mapify_cli.skills_eval.runner import (
+ _append_record,
+ _read_present_cell_ids,
+ evaluate_cell,
+ make_cell_id,
+)
+
+logger = logging.getLogger(__name__)
+
+# Intent: fixed variant_id per D10 -- matches the constant in runner.py.
+_VARIANT_ID: int = 1
+
+# NOTE: make_cell_id (imported from runner above) is importable from this
+# module, but it is not listed in __all__, so star-imports do not re-export it.
+__all__ = ["AggregateSummary", "aggregate", "bounded_run"]
+
+# Intent: module-level TypeAlias so pyright can resolve it in function annotations.
+_WorkItem: TypeAlias = tuple[int, int, EvalSetEntry]
+
+
+# ---------------------------------------------------------------------------
+# AggregateSummary
+# ---------------------------------------------------------------------------
+
+
+@dataclasses.dataclass(frozen=True)
+class AggregateSummary:
+    """Immutable roll-up of the results of one eval run.
+
+    Call ``to_dict()`` for a JSON-serialisable view. Statistics that cannot
+    be computed over an empty sample are ``None`` rather than a number.
+
+    Attributes
+    ----------
+    total_cells:
+        How many ``EvalResultRecord`` objects were summarised.
+    passed_cells:
+        How many of those records have an EMPTY ``assertions_failed`` list.
+    pass_rate:
+        ``passed_cells / total_cells``, or 0.0 if ``total_cells == 0``.
+    token_sample_size:
+        How many records carry a non-None ``token_usage``.
+    tokens_mean:
+        Mean of ``token_usage.total`` across the token sample; ``None`` if
+        ``token_sample_size == 0``.
+    tokens_stddev:
+        Sample standard deviation of ``token_usage.total``; 0.0 if
+        ``token_sample_size < 2``; ``None`` if ``token_sample_size == 0``.
+    duration_mean:
+        Mean of ``record.duration_s`` across every record; ``None`` if
+        ``total_cells == 0``.
+    duration_stddev:
+        Sample standard deviation of ``duration_s``; 0.0 if
+        ``total_cells < 2``; ``None`` if ``total_cells == 0``.
+    """
+
+    total_cells: int
+    passed_cells: int
+    pass_rate: float
+    token_sample_size: int
+    tokens_mean: float | None
+    tokens_stddev: float | None
+    duration_mean: float | None
+    duration_stddev: float | None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert this summary into a plain, JSON-serialisable dict."""
+        payload: dict[str, Any] = dataclasses.asdict(self)
+        return payload
+
+
+# ---------------------------------------------------------------------------
+# _safe_stddev (n<2 guard, shared by token and duration paths)
+# ---------------------------------------------------------------------------
+
+
+def _safe_stddev(xs: list[float]) -> float:
+    """Sample standard deviation of *xs* that tolerates tiny samples.
+
+    ``statistics.stdev`` raises ``StatisticsError`` when given fewer than two
+    data points. Such a sample has no measurable spread, so 0.0 is returned
+    for it instead of raising.
+    """
+    has_spread = len(xs) >= 2
+    return statistics.stdev(xs) if has_spread else 0.0
+
+
+# ---------------------------------------------------------------------------
+# aggregate
+# ---------------------------------------------------------------------------
+
+
+def aggregate(records: list[EvalResultRecord]) -> AggregateSummary:
+    """Summarise *records* into an ``AggregateSummary``.
+
+    Safe to call with an empty list and with records whose ``token_usage``
+    is ``None`` everywhere; neither case raises.
+
+    Parameters
+    ----------
+    records:
+        ``EvalResultRecord`` objects from a finished or partial run; the
+        list may be empty.
+
+    Returns
+    -------
+    AggregateSummary
+        The computed summary. For an empty *records* list every count is 0,
+        ``pass_rate`` is 0.0, and all token/duration statistics are ``None``.
+    """
+    n_cells = len(records)
+
+    # VC1: a cell "passes" when its assertions_failed list is empty.
+    n_passed = sum(len(rec.assertions_failed) == 0 for rec in records)
+    # Guard the division so an empty run reports 0.0 instead of raising.
+    rate = 0.0 if n_cells == 0 else n_passed / n_cells
+
+    # VC2/VC4: token statistics only consider records that reported usage.
+    token_totals: list[float] = []
+    for rec in records:
+        usage = rec.token_usage
+        if usage is not None:
+            token_totals.append(float(usage.total))
+    n_token_samples = len(token_totals)
+
+    tok_mean: float | None = None
+    tok_stddev: float | None = None
+    if n_token_samples > 0:
+        tok_mean = statistics.mean(token_totals)
+        tok_stddev = _safe_stddev(token_totals)
+
+    # Every record carries duration_s, so only the empty run yields None here.
+    dur_mean: float | None = None
+    dur_stddev: float | None = None
+    if n_cells > 0:
+        durations: list[float] = [rec.duration_s for rec in records]
+        dur_mean = statistics.mean(durations)
+        dur_stddev = _safe_stddev(durations)
+
+    return AggregateSummary(
+        total_cells=n_cells,
+        passed_cells=n_passed,
+        pass_rate=rate,
+        token_sample_size=n_token_samples,
+        tokens_mean=tok_mean,
+        tokens_stddev=tok_stddev,
+        duration_mean=dur_mean,
+        duration_stddev=dur_stddev,
+    )
+
+
+# ---------------------------------------------------------------------------
+# bounded_run
+# ---------------------------------------------------------------------------
+
+
def bounded_run(
    *,
    skill: str,
    entries: list[EvalSetEntry],
    dispatcher: VariantDispatcher,
    runs: int,
    out_path: Path,
    resume: bool = False,
    max_concurrency: int = 1,
) -> list[EvalResultRecord]:
    """Run the prompts x runs matrix with bounded parallel dispatch.

    Cells are evaluated in a ``ThreadPoolExecutor`` with up to
    *max_concurrency* worker threads. Every ``.jsonl`` append happens under a
    single ``threading.Lock`` so concurrent workers never interleave writes
    (VC3).

    Parameters
    ----------
    skill:
        Skill name, forwarded to ``evaluate_cell`` and used in log lines.
    entries:
        Eval-set rows (``EvalSetEntry`` objects).
    dispatcher:
        Dispatcher instance shared by all worker threads; it is passed to
        every ``evaluate_cell`` call.
    runs:
        Number of runs per prompt. Values <= 0 produce no work.
    out_path:
        Path to the ``.jsonl`` output file. Its parent directory is created
        if missing.
    resume:
        If True and *out_path* exists, cells whose cell_id is already present
        in the file are skipped.
    max_concurrency:
        Maximum number of worker threads. Values below 1 are clamped to 1,
        which makes the run effectively sequential while sharing the same
        code path as parallel execution.

    Returns
    -------
    list[EvalResultRecord]
        Records produced during THIS call (skipped cells excluded), in the
        order their writes completed — nondeterministic at concurrency > 1.

    Raises
    ------
    Exception
        Whatever a worker raised. On the first failure all not-yet-started
        cells are cancelled so the error surfaces promptly instead of after
        the whole remaining matrix has been dispatched. Cells already written
        stay in *out_path* and can be skipped on a later ``resume=True`` run.
    """
    # Determine the set of cells to skip (resume mode only).
    present_cell_ids: set[str] = set()
    if resume and out_path.exists():
        present_cell_ids = _read_present_cell_ids(out_path)
        logger.info(
            "bounded_run: resume mode -- %d cells already present in %s",
            len(present_cell_ids),
            out_path,
        )

    # Ensure the output directory exists before any worker touches the file.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Build the work list: (prompt_index, run_number, entry) for missing cells.
    work_items: list[_WorkItem] = []
    for prompt_index, entry in enumerate(entries):
        for run_number in range(runs):
            cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
            if cell_id in present_cell_ids:
                logger.debug(
                    "bounded_run: skipping cell %s (already present in %s)",
                    cell_id,
                    out_path,
                )
                continue
            work_items.append((prompt_index, run_number, entry))

    # Serialised-write lock: only one thread may append to the .jsonl (and to
    # ``collected``) at a time, preventing interleaved writes (VC3).
    write_lock = threading.Lock()
    collected: list[EvalResultRecord] = []

    def _dispatch_and_record(item: _WorkItem) -> EvalResultRecord:
        """Worker: evaluate one cell, then append it under the write lock."""
        prompt_idx, run_num, cell_entry = item
        record = evaluate_cell(
            skill=skill,
            entry=cell_entry,
            prompt_index=prompt_idx,
            run_number=run_num,
            dispatcher=dispatcher,
        )
        with write_lock:
            # INV-4: durable per-cell append, serialised.
            _append_record(out_path, record)
            collected.append(record)
        return record

    workers = max(1, max_concurrency)
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(_dispatch_and_record, item) for item in work_items]
        try:
            # Surface the first unexpected worker exception to the caller.
            for future in concurrent.futures.as_completed(futures):
                future.result()
        except BaseException:
            # Without this, the executor's __exit__ would still run every
            # queued cell before the exception could propagate. cancel() is a
            # no-op for futures that are already running or finished.
            for pending in futures:
                pending.cancel()
            raise

    logger.info(
        "bounded_run: finished skill=%s entries=%d runs=%d cells_written=%d out=%s",
        skill,
        len(entries),
        runs,
        len(collected),
        out_path,
    )

    return collected
diff --git a/src/mapify_cli/skills_eval/assertions.py b/src/mapify_cli/skills_eval/assertions.py
new file mode 100644
index 0000000..2f7141d
--- /dev/null
+++ b/src/mapify_cli/skills_eval/assertions.py
@@ -0,0 +1,284 @@
+"""Pure, deterministic assertion runner for skill eval cells.
+
+No LLM, no subprocess, no file I/O, no network. Same (spec, result)
+always produces the same verdict (INV-3: no ``import anthropic``,
+no ANTHROPIC_API_KEY).
+
+Assertion types
+---------------
+- contains – value in raw_output
+- not_contains – value not in raw_output
+- regex – re.search(pattern, raw_output) is not None
+- valid_json – raw_output.strip() parses via json.loads
+- trigger – triggered_skill == skill
+- not_trigger – triggered_skill != skill (None-safe: SC-3)
+
+Robustness
+----------
+ - Unknown type → FAIL, detail "unknown assertion type: <type>"
+- Missing key → FAIL, clear detail, no KeyError
+- Invalid regex → FAIL, detail includes re.error message
+- run_assertion never raises
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+
+from mapify_cli.skills_eval.eval_schema import DispatchResult
+
+
+# ---------------------------------------------------------------------------
+# AssertionResult
+# ---------------------------------------------------------------------------
+
+
@dataclass(frozen=True)
class AssertionResult:
    """Immutable result of a single assertion evaluation.

    Instances are produced by the ``_assert_*`` handlers and by
    ``run_assertion``; being frozen, they cannot be mutated after creation.
    """

    # Verdict: True when the assertion held.
    passed: bool
    # Assertion type label, e.g. "contains" or "regex".
    type: str
    # Human-readable explanation of the verdict.
    detail: str
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers — one per assertion type
+# ---------------------------------------------------------------------------
+
+
def _assert_contains(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Check that ``spec["value"]`` occurs somewhere in ``result.raw_output``.

    A missing or non-string ``value`` yields a FAIL result instead of raising.
    """
    needle = spec.get("value")
    if isinstance(needle, str):
        found = needle in result.raw_output
        outcome = "PASS" if found else "FAIL"
        location = "found in" if found else "not found in"
        return AssertionResult(
            passed=found,
            type="contains",
            detail=f"contains {needle!r} -> {outcome} ({location} raw_output)",
        )

    # Malformed spec: report what we actually received.
    received = type(needle).__name__
    return AssertionResult(
        passed=False,
        type="contains",
        detail=f"contains: missing or non-string 'value' key (got {received!r})",
    )
+
+
def _assert_not_contains(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Check that ``spec["value"]`` does NOT occur in ``result.raw_output``.

    A missing or non-string ``value`` yields a FAIL result instead of raising.
    """
    forbidden = spec.get("value")
    if not isinstance(forbidden, str):
        received = type(forbidden).__name__
        return AssertionResult(
            passed=False,
            type="not_contains",
            detail=f"not_contains: missing or non-string 'value' key (got {received!r})",
        )

    absent = forbidden not in result.raw_output
    if absent:
        outcome, location = "PASS", "absent from"
    else:
        outcome, location = "FAIL", "found in"
    return AssertionResult(
        passed=absent,
        type="not_contains",
        detail=f"not_contains {forbidden!r} -> {outcome} ({location} raw_output)",
    )
+
+
def _assert_regex(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Check that ``spec["pattern"]`` matches somewhere in ``result.raw_output``.

    Uses ``re.search`` (unanchored). A missing/non-string pattern or a pattern
    that fails to compile yields a FAIL result; the compile error text is
    included in ``detail``.
    """
    pattern = spec.get("pattern")
    if not isinstance(pattern, str):
        received = type(pattern).__name__
        return AssertionResult(
            passed=False,
            type="regex",
            detail=f"regex: missing or non-string 'pattern' key (got {received!r})",
        )

    try:
        hit = re.search(pattern, result.raw_output) is not None
    except re.error as exc:
        return AssertionResult(
            passed=False,
            type="regex",
            detail=f"regex {pattern!r} -> FAIL (invalid pattern: {exc})",
        )

    outcome = "PASS" if hit else "FAIL"
    note = "match found" if hit else "no match"
    return AssertionResult(
        passed=hit,
        type="regex",
        detail=f"regex {pattern!r} -> {outcome} ({note} in raw_output)",
    )
+
+
def _assert_valid_json(
    _spec: dict[str, object], result: DispatchResult
) -> AssertionResult:
    """PASS iff ``result.raw_output.strip()`` parses via ``json.loads``.

    The spec carries no type-specific keys, so it is ignored. Never raises:
    malformed JSON (``ValueError``, which ``json.JSONDecodeError`` subclasses)
    and pathologically deep nesting (``RecursionError`` from the decoder) are
    both reported as FAIL.
    """
    try:
        json.loads(result.raw_output.strip())
    except (ValueError, RecursionError) as exc:
        return AssertionResult(
            passed=False,
            type="valid_json",
            detail=f"valid_json -> FAIL (JSON parse error: {exc})",
        )
    return AssertionResult(
        passed=True,
        type="valid_json",
        detail="valid_json -> PASS (raw_output is well-formed JSON)",
    )
+
+
def _assert_trigger(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Check that the skill that fired equals ``spec["skill"]``.

    A missing or non-string ``skill`` yields a FAIL result instead of raising.
    """
    expected = spec.get("skill")
    if not isinstance(expected, str):
        received = type(expected).__name__
        return AssertionResult(
            passed=False,
            type="trigger",
            detail=f"trigger: missing or non-string 'skill' key (got {received!r})",
        )

    fired = result.triggered_skill
    same = fired == expected
    outcome = "PASS" if same else "FAIL"
    return AssertionResult(
        passed=same,
        type="trigger",
        detail=f"trigger {expected!r} -> {outcome} (triggered_skill={fired!r})",
    )
+
+
def _assert_not_trigger(
    spec: dict[str, object], result: DispatchResult
) -> AssertionResult:
    """Check that the skill that fired differs from ``spec["skill"]``.

    SC-3: when ``triggered_skill`` is ``None`` (no skill fired) the assertion
    PASSES, because ``None`` is never equal to a string skill name.
    A missing or non-string ``skill`` yields a FAIL result instead of raising.
    """
    unwanted = spec.get("skill")
    if not isinstance(unwanted, str):
        received = type(unwanted).__name__
        return AssertionResult(
            passed=False,
            type="not_trigger",
            detail=f"not_trigger: missing or non-string 'skill' key (got {received!r})",
        )

    fired = result.triggered_skill
    differs = fired != unwanted  # also True when fired is None (SC-3)
    outcome = "PASS" if differs else "FAIL"
    return AssertionResult(
        passed=differs,
        type="not_trigger",
        detail=f"not_trigger {unwanted!r} -> {outcome} (triggered_skill={fired!r})",
    )
+
+
+# ---------------------------------------------------------------------------
+# Dispatcher table
+# ---------------------------------------------------------------------------
+
# Maps each supported assertion ``type`` string to its handler function.
# ``run_assertion`` looks the type up here; a type missing from this table is
# reported as "unknown assertion type". Supporting a new type means adding one
# entry rather than extending an if/elif chain.
_ASSERTION_HANDLERS = {
    "contains": _assert_contains,
    "not_contains": _assert_not_contains,
    "regex": _assert_regex,
    "valid_json": _assert_valid_json,
    "trigger": _assert_trigger,
    "not_trigger": _assert_not_trigger,
}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
def run_assertion(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Evaluate a single assertion spec against a DispatchResult.

    Never raises. Every malformed input is converted into a FAIL result with a
    human-debuggable ``detail`` string:

    - *spec* is not a dict (eval-set JSON is external, and list elements are
      not validated upstream);
    - ``"type"`` is missing, not a string, or not a known assertion type;
    - the selected handler raises unexpectedly (for example because *result*
      carries a non-string ``raw_output``).

    Parameters
    ----------
    spec:
        Dict with at least a ``"type"`` key and any type-specific keys.
    result:
        The DispatchResult produced by the dispatcher.

    Returns
    -------
    AssertionResult
        Frozen dataclass; ``passed`` is the verdict, ``detail`` explains why.
    """
    if not isinstance(spec, dict):
        return AssertionResult(
            passed=False,
            type="unknown",
            detail=(
                f"invalid assertion spec: expected dict, "
                f"got {type(spec).__name__!r}"
            ),
        )

    assertion_type = spec.get("type")
    if not isinstance(assertion_type, str):
        return AssertionResult(
            passed=False,
            type=str(assertion_type),
            detail=(
                f"unknown assertion type: {assertion_type!r} "
                f"(must be str, got {type(assertion_type).__name__!r})"
            ),
        )

    handler = _ASSERTION_HANDLERS.get(assertion_type)
    if handler is None:
        return AssertionResult(
            passed=False,
            type=assertion_type,
            detail=f"unknown assertion type: {assertion_type!r}",
        )

    try:
        return handler(spec, result)
    except Exception as exc:  # noqa: BLE001 - boundary that must never raise
        return AssertionResult(
            passed=False,
            type=assertion_type,
            detail=f"{assertion_type}: handler raised {type(exc).__name__}: {exc}",
        )
+
+
def run_assertions(
    specs: list[dict[str, object]],
    result: DispatchResult,
) -> tuple[list[str], list[str]]:
    """Evaluate every spec in *specs* against *result* and split the details.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(passed_details, failed_details)``: the ``detail`` strings of the
        assertions that passed and of those that failed, each in spec order.
        Suitable for ``EvalResultRecord.assertions_passed`` /
        ``EvalResultRecord.assertions_failed``.
    """
    outcomes = [run_assertion(spec, result) for spec in specs]
    passed_details: list[str] = [o.detail for o in outcomes if o.passed]
    failed_details: list[str] = [o.detail for o in outcomes if not o.passed]
    return passed_details, failed_details
diff --git a/src/mapify_cli/skills_eval/dispatcher.py b/src/mapify_cli/skills_eval/dispatcher.py
new file mode 100644
index 0000000..e87c406
--- /dev/null
+++ b/src/mapify_cli/skills_eval/dispatcher.py
@@ -0,0 +1,540 @@
+"""Variant dispatcher for the skills_eval package.
+
+Provides the ABC ``VariantDispatcher`` and two concrete implementations:
+- ``MockDispatcher``: zero-subprocess, caller-controlled output for CI tests (INV-2).
+- ``ClaudeSubprocessDispatcher``: real ``claude -p`` invocation in a seeded
+ throwaway temp cwd with the TEMP-FLIP applied.
+
+Hard constraints (INV-2, INV-3, INV-5)
+---------------------------------------
+- Uses only stdlib; no Anthropic SDK imports (INV-3).
+- Does not read cloud credentials from the environment (INV-3).
+- Production ``.claude/`` and ``.map/`` trees are NEVER modified (INV-5).
+ The TEMP-FLIP touches only the throwaway seeded copy.
+- ``MockDispatcher.dispatch`` NEVER calls subprocess (INV-2).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import random
+import shutil
+import subprocess
+import tempfile
+import time
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+from mapify_cli.skills_eval.eval_schema import DispatchResult
+from mapify_cli.token_budget import TokenUsage
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Abstract base
+# ---------------------------------------------------------------------------
+
+
class VariantDispatcher(ABC):
    """Abstract dispatcher: given a prompt, produce a ``DispatchResult``.

    Concrete subclasses must implement ``dispatch``; the class cannot be
    instantiated directly because ``dispatch`` is abstract.
    """

    @abstractmethod
    def dispatch(self, prompt: str) -> DispatchResult:
        """Run ``prompt`` and return a fully-populated ``DispatchResult``.

        Implementations MUST NOT raise — transient failures are captured in
        ``DispatchResult.error``.
        """
+
+
+# ---------------------------------------------------------------------------
+# MockDispatcher — CI / unit-test use only (INV-2: zero subprocess)
+# ---------------------------------------------------------------------------
+
+
class MockDispatcher(VariantDispatcher):
    """Canned-response dispatcher for CI and unit tests (INV-2).

    The constructor takes the exact field values that every ``dispatch()``
    call will hand back. No subprocess is ever started, so tests can exercise
    the eval pipeline without a real ``claude`` invocation.
    """

    def __init__(
        self,
        *,
        triggered_skill: str | None = None,
        raw_output: str = "",
        token_usage: TokenUsage | None = None,
        duration_s: float = 0.0,
        error: str | None = None,
    ) -> None:
        # Stored verbatim; dispatch() replays them into a DispatchResult.
        self._raw_output = raw_output
        self._triggered_skill = triggered_skill
        self._token_usage = token_usage
        self._duration_s = duration_s
        self._error = error

    def dispatch(self, prompt: str) -> DispatchResult:
        """Build a fresh ``DispatchResult`` from the configured values.

        ``prompt`` is deliberately ignored: the mock's answer does not depend
        on its input. Only attribute reads happen here (INV-2).
        """
        del prompt  # intentionally unused; mock returns caller-set values
        canned = {
            "raw_output": self._raw_output,
            "triggered_skill": self._triggered_skill,
            "token_usage": self._token_usage,
            "duration_s": self._duration_s,
            "error": self._error,
        }
        return DispatchResult(**canned)
+
+
+# ---------------------------------------------------------------------------
+# Seeding helpers (ClaudeSubprocessDispatcher internals)
+# ---------------------------------------------------------------------------
+
+
def _seed_temp_cwd(source_claude_dir: Path) -> Path:
    """Create a throwaway temp directory seeded with a copy of ``.claude/``.

    Steps:
    1. ``tempfile.mkdtemp()`` — fresh isolated dir ``<tmp>``.
    2. ``shutil.copytree(source_claude_dir, <tmp>/.claude)`` — full copy, or
       an empty ``<tmp>/.claude`` (with a warning) when the source is missing.
    3. ``<tmp>/.map`` — fresh empty directory (no production state).
    4. TEMP-FLIP: rewrite ``disable-model-invocation: true`` →
       ``disable-model-invocation: false`` in the seeded SKILL.md files.

    Returns the tmp dir ``Path``. On success the caller owns the directory
    and is responsible for ``shutil.rmtree(tmp, ignore_errors=True)``.

    Raises
    ------
    Exception
        Whatever the copy / mkdir / flip step raised. In that case the
        partially seeded directory is removed before re-raising, because the
        caller never receives its path and could not clean it up.
    """
    tmp = Path(tempfile.mkdtemp(prefix="mapeval-"))
    try:
        # Copy the .claude/ tree (or seed an empty one if the source is absent).
        seeded_claude = tmp / ".claude"
        if source_claude_dir.is_dir():
            shutil.copytree(source_claude_dir, seeded_claude)
        else:
            seeded_claude.mkdir(parents=True)
            logger.warning(
                "seed_temp_cwd: source_claude_dir %s does not exist — seeding empty .claude/",
                source_claude_dir,
            )

        # Empty .map/ — prevents accidental reads of production workflow state.
        (tmp / ".map").mkdir(parents=True)

        # TEMP-FLIP: make every skill model-selectable for the eval (spike VC3).
        # Skills without the field are left untouched.
        _apply_temp_flip(seeded_claude)
    except BaseException:
        # Do not leak the temp dir when seeding fails part-way through.
        shutil.rmtree(tmp, ignore_errors=True)
        raise

    return tmp
+
+
def _apply_temp_flip(seeded_claude_dir: Path) -> None:
    """Rewrite ``disable-model-invocation: true`` → ``false`` in seeded SKILL.md files.

    Visits every ``skills/*/SKILL.md`` directly under *seeded_claude_dir* and
    writes the file back only when the flip changed its content.

    Best-effort: a file that cannot be read, decoded as UTF-8, or written is
    logged at WARNING level and skipped, so one bad skill file never aborts
    seeding for the others. Intended for the throwaway seeded copy only.
    """
    for skill_file in seeded_claude_dir.glob("skills/*/SKILL.md"):
        try:
            original = skill_file.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as exc:
            # UnicodeError covers non-UTF-8 content, which is not an OSError.
            logger.warning("temp_flip: could not read %s: %s", skill_file, exc)
            continue

        flipped = _flip_disable_invocation_line(original)
        if flipped == original:
            continue

        try:
            skill_file.write_text(flipped, encoding="utf-8")
        except (OSError, UnicodeError) as exc:
            logger.warning("temp_flip: could not write %s: %s", skill_file, exc)
+
+
def _flip_disable_invocation_line(content: str) -> str:
    """Turn every ``disable-model-invocation: true`` line into ``... false``.

    Works line by line. A line matches when the text before its first ``:``
    is exactly the key (ignoring surrounding whitespace) and the text after
    it is the bare boolean ``true`` in any letter case (ignoring surrounding
    whitespace). Extra spaces after the colon and ``True`` / ``TRUE`` are
    therefore handled as well as the canonical spelling.

    Indentation, spacing and line endings of a flipped line are preserved.
    Lines carrying anything else after the value (e.g. a trailing comment)
    are left untouched, as are all non-matching lines. Returns *content*
    unchanged when no line matches.
    """
    rewritten: list[str] = []
    for line in content.splitlines(keepends=True):
        key, colon, value = line.partition(":")
        token = value.strip()
        if colon and key.strip() == "disable-model-invocation" and token.lower() == "true":
            # Replace only the boolean token; whitespace and newline stay put.
            rewritten.append(f"{key}{colon}{value.replace(token, 'false', 1)}")
        else:
            rewritten.append(line)
    return "".join(rewritten)
+
+
+# ---------------------------------------------------------------------------
+# Transcript helpers
+# ---------------------------------------------------------------------------
+
+
def _derive_triggered_skill(session_id: str, cwd: Path) -> str | None:
    """Return the first skill recorded in the session's JSONL transcript.

    The transcript is located via ``_locate_transcript(session_id, cwd)`` and
    scanned by ``_parse_transcript_for_skill``.

    Returns ``None`` (never raises for these cases) when *session_id* is
    empty, when no transcript file can be found, or when the transcript
    contains no ``Skill`` tool_use block.
    """
    if session_id:
        located = _locate_transcript(session_id, cwd)
        if located is not None and located.exists():
            return _parse_transcript_for_skill(located)
        logger.debug(
            "transcript not found for session_id=%s cwd=%s", session_id, cwd
        )
    return None
+
+
def _locate_transcript(session_id: str, cwd: Path) -> Path | None:
    """Return the path to the session's JSONL transcript, or ``None``.

    Lookup order under ``~/.claude/projects``:

    1. Glob ``*/<session_id>.jsonl``. When several project directories hold
       a file with that name, the lexicographically first path is returned
       so the choice is deterministic.
    2. Fallback: ``<slug>/<session_id>.jsonl`` where ``<slug>`` is *cwd* with
       every ``/`` and ``.`` replaced by ``-``.

    Returns ``None`` when *session_id* is empty, when the home directory
    cannot be determined, or when neither lookup finds a file.
    """
    if not session_id:
        # An empty id would otherwise probe a bogus "<slug>/.jsonl" path.
        return None

    try:
        projects_dir = Path.home() / ".claude" / "projects"
    except RuntimeError:
        # Path.home() raises when the home directory cannot be resolved.
        return None

    # Primary: session-id glob — independent of how the cwd slug is encoded.
    matches = sorted(projects_dir.glob(f"*/{session_id}.jsonl"))
    if matches:
        return matches[0]

    # Fallback: reconstruct the slug from cwd (``/`` and ``.`` → ``-``).
    cwd_slug = str(cwd).replace("/", "-").replace(".", "-")
    fallback = projects_dir / cwd_slug / f"{session_id}.jsonl"
    if fallback.exists():
        return fallback

    return None
+
+
def _parse_transcript_for_skill(path: Path) -> str | None:
    """Return the first ``Skill`` tool_use ``input.skill`` value, or ``None``.

    Reads *path* as JSONL: blank lines and lines that are not valid JSON are
    skipped; each parsed entry is handed to ``_extract_skill_from_entry`` and
    the first non-``None`` answer wins.

    Never raises for unreadable input: an ``OSError`` is logged at WARNING
    level and yields ``None``, and bytes that are not valid UTF-8 are decoded
    with replacement characters rather than raising ``UnicodeDecodeError``.
    """
    try:
        # errors="replace": a stray invalid byte must not abort the scan.
        with path.open(encoding="utf-8", errors="replace") as fh:
            for raw_line in fh:
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                try:
                    entry = json.loads(raw_line)
                except json.JSONDecodeError:
                    continue

                skill = _extract_skill_from_entry(entry)
                if skill is not None:
                    return skill
    except OSError as exc:
        logger.warning("parse_transcript: could not read %s: %s", path, exc)

    return None
+
+
def _extract_skill_from_entry(entry: Any) -> str | None:
    """Pull the skill name out of one transcript entry, if it invokes a skill.

    Looks at ``entry["message"]["content"]`` (which must be a list) and returns
    ``input["skill"]`` of the first block that has ``type == "tool_use"``,
    ``name == "Skill"`` and a non-empty string skill name. Blocks that do not
    qualify are skipped. Returns ``None`` when the entry has a different shape
    or no block qualifies.
    """
    message = entry.get("message") if isinstance(entry, dict) else None
    blocks = message.get("content") if isinstance(message, dict) else None
    if not isinstance(blocks, list):
        return None

    for block in blocks:
        is_skill_call = (
            isinstance(block, dict)
            and block.get("type") == "tool_use"
            and block.get("name") == "Skill"
        )
        if not is_skill_call:
            continue
        payload = block.get("input")
        if not isinstance(payload, dict):
            continue
        candidate = payload.get("skill")
        if isinstance(candidate, str) and candidate:
            return candidate

    return None
+
+
+# ---------------------------------------------------------------------------
+# Envelope parsing
+# ---------------------------------------------------------------------------
+
+
def _coerce_token_count(value: object) -> int:
    """Best-effort conversion of a usage field to ``int``; 0 when not numeric."""
    try:
        return int(value or 0)  # type: ignore[call-overload]
    except (TypeError, ValueError, OverflowError):
        # e.g. a non-numeric string, a list, NaN or Infinity.
        return 0


def _parse_envelope(stdout: str) -> tuple[str, TokenUsage | None, str]:
    """Parse the ``claude -p --output-format json`` result envelope defensively.

    Returns ``(raw_output, token_usage, session_id)``.

    - *stdout* is not JSON, or its top level is not an object →
      ``(stdout, None, "")``.
    - ``result`` missing or ``null`` → ``raw_output == ""`` (a ``null`` is NOT
      rendered as the text ``"None"``, which would pollute ``contains``
      assertions); any other value is converted with ``str``.
    - ``session_id`` missing or falsy → ``""``.
    - ``usage`` not an object → ``token_usage is None``. Otherwise the three
      token counters are read; a missing, falsy or non-numeric counter
      counts as 0 instead of raising.
    """
    try:
        parsed = json.loads(stdout)
    except (json.JSONDecodeError, ValueError):
        return stdout, None, ""

    if not isinstance(parsed, dict):
        return stdout, None, ""

    result_field = parsed.get("result")
    raw_output = "" if result_field is None else str(result_field)
    session_id = str(parsed.get("session_id") or "")

    usage_raw = parsed.get("usage")
    token_usage: TokenUsage | None = None
    if isinstance(usage_raw, dict):
        token_usage = TokenUsage(
            input_tokens=_coerce_token_count(usage_raw.get("input_tokens")),
            cache_read_input_tokens=_coerce_token_count(
                usage_raw.get("cache_read_input_tokens")
            ),
            cache_creation_input_tokens=_coerce_token_count(
                usage_raw.get("cache_creation_input_tokens")
            ),
        )

    return raw_output, token_usage, session_id
+
+
+# ---------------------------------------------------------------------------
+# ClaudeSubprocessDispatcher
+# ---------------------------------------------------------------------------
+
# Upper bound (seconds) of the uniform random jitter added to each retry's
# backoff sleep: the retry loop draws ``random.uniform(0, _JITTER_MAX)``.
_JITTER_MAX: float = 2.0
+
+
class ClaudeSubprocessDispatcher(VariantDispatcher):
    """Real ``claude -p`` dispatcher for production/manual eval runs.

    Seeding and cleanup
    -------------------
    Each ``dispatch()`` call:
    1. Creates a fresh temp cwd via ``_seed_temp_cwd(source_claude_dir)``.
    2. Runs ``claude -p <prompt> --output-format json`` in that temp cwd.
    3. Removes the temp dir in a ``finally`` block.

    Retry policy (VC4)
    ------------------
    ``subprocess.TimeoutExpired``, ``OSError``, any other exception from
    ``subprocess.run`` and a non-zero ``returncode`` are treated as transient.
    Up to ``max_retries`` additional attempts are made with jittered
    exponential backoff. After exhaustion the last error is recorded in
    ``DispatchResult.error``; no exception escapes ``dispatch()``.

    Thread safety
    -------------
    One instance may be shared by several worker threads (as ``bounded_run``
    does). The only mutable per-call state, ``_last_error``, is stored
    per thread, so a failure message recorded by one worker can never be
    reported for another worker's cell.

    INV-3 compliance
    ----------------
    No Anthropic SDK import. The parent environment is passed through to the
    child unchanged apart from the added ``MAP_INVOKED_BY`` marker.

    INV-5 compliance
    ----------------
    ``cwd`` of the subprocess is always the throwaway temp dir.
    """

    def __init__(
        self,
        *,
        source_claude_dir: Path | None = None,
        timeout: float = 120.0,
        max_retries: int = 2,
        backoff_base: float = 2.0,
    ) -> None:
        """Initialise the dispatcher.

        Parameters
        ----------
        source_claude_dir:
            Path to the ``.claude/`` directory to seed from. Defaults to
            ``Path.cwd() / ".claude"`` evaluated at construction time.
        timeout:
            Per-attempt timeout in seconds passed to ``subprocess.run``.
        max_retries:
            Number of *additional* attempts after the first failure.
            Total attempts = 1 + max_retries; negative values are treated
            as 0 so at least one attempt is always made.
        backoff_base:
            Base for exponential backoff (seconds). The first attempt runs
            immediately; retry ``k`` (1-based) is preceded by a sleep of
            ``backoff_base * 2**(k - 1) + jitter``.
        """
        # Function-scope import: the module's import block does not provide
        # ``threading`` and this class is its only user.
        import threading

        self._source_claude_dir: Path = (
            source_claude_dir if source_claude_dir is not None else Path.cwd() / ".claude"
        )
        self._timeout = timeout
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        # Per-thread storage behind the ``_last_error`` property below.
        self._error_state = threading.local()
        self._last_error = ""

    @property
    def _last_error(self) -> str:
        """Error message of the calling thread's latest failed ``_run_once``."""
        return getattr(self._error_state, "message", "")

    @_last_error.setter
    def _last_error(self, message: str) -> None:
        self._error_state.message = message

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def dispatch(self, prompt: str) -> DispatchResult:
        """Dispatch ``prompt`` via ``claude -p``, with backoff retry on failure.

        Always returns a ``DispatchResult`` — never raises. The temp cwd is
        removed whether the call succeeds or fails.
        """
        t_total_start = time.monotonic()
        tmp: Path | None = None

        try:
            tmp = _seed_temp_cwd(self._source_claude_dir)
            return self._dispatch_with_retry(prompt, tmp, t_total_start)
        except Exception as exc:  # noqa: BLE001
            # Last-resort guard for the "never raises" contract. It is
            # expected to fire for seeding failures, but it also catches any
            # unexpected error raised after seeding; all are reported with
            # the same "seeding error" label.
            duration_s = time.monotonic() - t_total_start
            logger.warning("dispatch: unexpected error during seeding: %s", exc)
            return DispatchResult(
                raw_output="",
                triggered_skill=None,
                token_usage=None,
                duration_s=duration_s,
                error=f"seeding error: {exc}",
            )
        finally:
            if tmp is not None:
                shutil.rmtree(tmp, ignore_errors=True)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _dispatch_with_retry(
        self,
        prompt: str,
        tmp: Path,
        t_total_start: float,
    ) -> DispatchResult:
        """Run the subprocess with jittered exponential backoff.

        ``max_retries=2`` means up to 3 total attempts (attempt 0, 1, 2).
        The first successful attempt is parsed and returned. After all
        attempts are exhausted, returns an error ``DispatchResult`` carrying
        the last failure message.
        """
        argv = ["claude", "-p", prompt, "--output-format", "json"]
        last_error: str = ""
        # Clamp so a negative max_retries still yields one attempt.
        total_attempts = max(0, self._max_retries) + 1

        for attempt in range(total_attempts):
            if attempt > 0:
                sleep_s = self._backoff_base * (2 ** (attempt - 1)) + random.uniform(
                    0, _JITTER_MAX
                )
                logger.debug(
                    "dispatch: retry attempt %d/%d — sleeping %.2fs",
                    attempt,
                    self._max_retries,
                    sleep_s,
                )
                time.sleep(sleep_s)

            result = self._run_once(argv, tmp)
            if result is not None:
                # Successful subprocess run — parse and return.
                return self._build_result(result, tmp, t_total_start)

            # _run_once returned None => transient failure; it recorded the
            # reason in this thread's _last_error.
            last_error = self._last_error

        duration_s = time.monotonic() - t_total_start
        return DispatchResult(
            raw_output="",
            triggered_skill=None,
            token_usage=None,
            duration_s=duration_s,
            error=last_error or "dispatch failed after retries",
        )

    def _run_once(
        self,
        argv: list[str],
        cwd: Path,
    ) -> subprocess.CompletedProcess[str] | None:
        """Run ``argv`` once; return ``CompletedProcess`` on success, ``None`` on failure.

        Failure means: timeout, ``OSError``, any other exception from
        ``subprocess.run``, or a non-zero return code.

        Side-effect: sets ``self._last_error`` (per thread) on failure and
        logs a warning. For a non-zero return code the message includes the
        first 200 characters of stderr.
        """
        try:
            proc = subprocess.run(
                argv,
                capture_output=True,
                text=True,
                timeout=self._timeout,
                cwd=cwd,
                # Parent environment plus a marker identifying the eval run.
                env={**os.environ, "MAP_INVOKED_BY": "skills-eval"},
            )
        except subprocess.TimeoutExpired as exc:
            self._last_error = f"timeout after {self._timeout}s: {exc}"
            logger.warning("dispatch: subprocess timed out: %s", exc)
            return None
        except OSError as exc:
            self._last_error = f"OSError: {exc}"
            logger.warning("dispatch: OSError running claude: %s", exc)
            return None
        except Exception as exc:  # noqa: BLE001
            self._last_error = f"unexpected subprocess error: {exc}"
            logger.warning("dispatch: unexpected subprocess error: %s", exc)
            return None

        if proc.returncode != 0:
            stderr_excerpt = (proc.stderr or "")[:200].strip()
            self._last_error = (
                f"non-zero returncode {proc.returncode}: "
                f"{stderr_excerpt}"
            )
            logger.warning(
                "dispatch: claude returned returncode=%d stderr=%s",
                proc.returncode,
                stderr_excerpt,
            )
            return None

        return proc

    def _build_result(
        self,
        proc: subprocess.CompletedProcess[str],
        tmp: Path,
        t_start: float,
    ) -> DispatchResult:
        """Parse the envelope from a successful subprocess run.

        ``duration_s`` is measured from *t_start* up to the end of envelope
        parsing, i.e. it covers seeding, all attempts and backoff sleeps, but
        not the transcript lookup that follows.
        """
        stdout = proc.stdout or ""
        raw_output, token_usage, session_id = _parse_envelope(stdout)
        duration_s = time.monotonic() - t_start
        triggered_skill = _derive_triggered_skill(session_id, tmp)

        return DispatchResult(
            raw_output=raw_output,
            triggered_skill=triggered_skill,
            token_usage=token_usage,
            duration_s=duration_s,
            error=None,
        )
diff --git a/src/mapify_cli/skills_eval/eval_schema.py b/src/mapify_cli/skills_eval/eval_schema.py
new file mode 100644
index 0000000..a50766e
--- /dev/null
+++ b/src/mapify_cli/skills_eval/eval_schema.py
@@ -0,0 +1,180 @@
+"""Shared data contracts for the skills_eval package.
+
+All structures are defined EXACTLY ONCE here and imported by every eval
+component (dispatcher, assertions, runner, aggregator). This module is a
+pure data layer — no dispatch logic, transcript parsing, assertion execution,
+or I/O of any kind.
+
+INV-3: No ``import anthropic`` and no ANTHROPIC_API_KEY access anywhere.
+INV-6: Contract-first — producer and consumer both import from this module.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+from dataclasses import dataclass, field
+from typing import Any
+
+from mapify_cli.token_budget import TokenUsage
+
+
+# ---------------------------------------------------------------------------
+# EvalSetEntry
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class EvalSetEntry:
+    """A single eval-set row as loaded from a JSON eval-set file.
+
+    The values come from external JSON, so ``__post_init__`` checks their
+    runtime types explicitly; the annotations alone enforce nothing.
+    """
+
+    prompt: str
+    should_trigger: str | None
+    should_not_trigger: str | None
+    assertions: list[dict]  # type: ignore[type-arg]
+
+    def __post_init__(self) -> None:
+        """Raise ``ValueError`` if any field has the wrong runtime type.
+
+        Checks run in field order: ``prompt``, ``should_trigger``,
+        ``should_not_trigger``, ``assertions``. Only the container type of
+        ``assertions`` is checked, not its items.
+        """
+        if not isinstance(self.prompt, str):
+            raise ValueError(
+                f"EvalSetEntry.prompt must be str, got {type(self.prompt).__name__!r}"
+            )
+
+        # Both optional skill-name fields share the same "str or None" rule.
+        for attr in ("should_trigger", "should_not_trigger"):
+            value = getattr(self, attr)
+            if not (value is None or isinstance(value, str)):
+                raise ValueError(
+                    f"EvalSetEntry.{attr} must be str or None, "
+                    f"got {type(value).__name__!r}"
+                )
+
+        if not isinstance(self.assertions, list):
+            raise ValueError(
+                "EvalSetEntry.assertions must be list, "
+                f"got {type(self.assertions).__name__!r}"
+            )
+
+
+# ---------------------------------------------------------------------------
+# DispatchResult
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DispatchResult:
+    """Result returned by the skill dispatcher for a single prompt.
+
+    ``token_usage`` and ``error`` are optional — the dispatcher sets ``error``
+    when the dispatch fails and ``token_usage`` may be absent on failure.
+    ``TokenUsage`` is imported from ``mapify_cli.token_budget``; it is NOT
+    redefined here (INV-6).
+    """
+
+    # Output text extracted from the dispatched run.
+    raw_output: str
+    # Name of the skill the dispatcher detected, or None when none was detected.
+    triggered_skill: str | None
+    # Token accounting for the run; None when unavailable.
+    token_usage: TokenUsage | None
+    # Elapsed time for the dispatch, in seconds.
+    duration_s: float
+    # Failure description; None means the dispatch succeeded.
+    error: str | None = None
+
+
+# ---------------------------------------------------------------------------
+# EvalResultRecord (append-only .jsonl row)
+# ---------------------------------------------------------------------------
+
+# Sentinel that lets ``from_dict`` tell «key absent» apart from «key present but
+# None». Both cases currently resolve to ``token_usage=None``.
+_MISSING: object = object()
+
+
+@dataclass
+class EvalResultRecord:
+    """A finished eval cell that round-trips through a plain JSON object.
+
+    One instance corresponds to one row of the append-only ``.jsonl`` result
+    file. ``to_dict`` produces the JSON-ready form and ``from_dict`` rebuilds
+    the record. ``token_usage`` is serialised with ``dataclasses.asdict`` and
+    reconstructed from its three integer fields.
+    """
+
+    cell_id: str
+    prompt: str
+    triggered_skill: str | None
+    token_usage: TokenUsage | None
+    duration_s: float
+    assertions_passed: list[str] = field(default_factory=list)
+    assertions_failed: list[str] = field(default_factory=list)
+    raw_output: str = ""
+
+    # ------------------------------------------------------------------
+    # Serialisation helpers
+    # ------------------------------------------------------------------
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return this record as a JSON-serialisable dict.
+
+        ``token_usage`` becomes a nested dict, or ``None`` when unset. The two
+        assertion lists are copied so the result does not alias the record.
+        """
+        usage = self.token_usage
+        return {
+            "cell_id": self.cell_id,
+            "prompt": self.prompt,
+            "triggered_skill": self.triggered_skill,
+            "token_usage": None if usage is None else dataclasses.asdict(usage),
+            "duration_s": self.duration_s,
+            "assertions_passed": list(self.assertions_passed),
+            "assertions_failed": list(self.assertions_failed),
+            "raw_output": self.raw_output,
+        }
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> "EvalResultRecord":
+        """Build an ``EvalResultRecord`` from a parsed JSON object.
+
+        ``cell_id``, ``prompt`` and ``duration_s`` are required (``KeyError``
+        when absent). ``token_usage`` may be missing or ``None``;
+        ``triggered_skill``, ``assertions_passed``, ``assertions_failed`` and
+        ``raw_output`` fall back to ``None`` / ``[]`` / ``[]`` / ``""`` when
+        missing, so older rows still load.
+        """
+        usage_payload = d.get("token_usage", _MISSING)
+        token_usage: TokenUsage | None = None
+        if usage_payload is not _MISSING and usage_payload is not None:
+            # Counters missing from the payload default to 0.
+            counters = {
+                name: int(usage_payload.get(name, 0))
+                for name in (
+                    "input_tokens",
+                    "cache_read_input_tokens",
+                    "cache_creation_input_tokens",
+                )
+            }
+            token_usage = TokenUsage(**counters)
+
+        return cls(
+            cell_id=d["cell_id"],
+            prompt=d["prompt"],
+            triggered_skill=d.get("triggered_skill"),
+            token_usage=token_usage,
+            duration_s=float(d["duration_s"]),
+            assertions_passed=list(d.get("assertions_passed", [])),
+            assertions_failed=list(d.get("assertions_failed", [])),
+            raw_output=d.get("raw_output", ""),
+        )
+
+
+# ---------------------------------------------------------------------------
+# make_cell_id
+# ---------------------------------------------------------------------------
+
+
+def make_cell_id(prompt_index: int, variant_id: int, run_number: int) -> str:
+    """Build the deterministic identifier for one eval cell.
+
+    The id depends only on the three arguments (no randomness, no clock), so
+    the same cell gets the same id on every run.
+
+    Example: ``make_cell_id(0, 1, 2)`` → ``"p0-v1-r2"``
+    """
+    parts = (f"p{prompt_index}", f"v{variant_id}", f"r{run_number}")
+    return "-".join(parts)
diff --git a/src/mapify_cli/skills_eval/runner.py b/src/mapify_cli/skills_eval/runner.py
new file mode 100644
index 0000000..610d5d6
--- /dev/null
+++ b/src/mapify_cli/skills_eval/runner.py
@@ -0,0 +1,425 @@
+"""Matrix runner for skill eval: prompts x runs -> durable resumable .jsonl.
+
+Public API (plain functions; no Typer -- CLI wiring is ST-007):
+- ``load_eval_set(path)`` -- parse a JSON eval-set file.
+- ``run_eval(...)`` -- execute the p x r matrix, append results.
+- ``default_run_path(root, skill, timestamp)`` -- canonical .jsonl path helper.
+- ``latest_run_path(root, skill)`` -- find most-recent .jsonl for --resume.
+
+Design invariants respected:
+- INV-3: no ``import anthropic``, no ANTHROPIC_API_KEY access.
+- INV-7: ``triggered_skill`` is consumed from ``DispatchResult.triggered_skill``
+ (the dispatcher is the SINGLE source of trigger detection). The runner
+ does NOT parse transcripts.
+- D10: variant_id is always 1 (no variants loop).
+- INV-4: each cell is flushed to disk immediately (durable per-cell append).
+- VC3: resume reads existing cell_ids, skips already-written cells, appends only
+ missing ones to the SAME file.
+- VC4: a per-cell dispatch error is recorded (not raised); matrix continues.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from mapify_cli.skills_eval.assertions import run_assertions
+from mapify_cli.skills_eval.dispatcher import VariantDispatcher
+from mapify_cli.skills_eval.eval_schema import (
+ DispatchResult,
+ EvalResultRecord,
+ EvalSetEntry,
+ make_cell_id,
+)
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Intent: fixed variant_id per D10 -- never enter a variants loop.
+# Passed as the variant component to ``make_cell_id`` for every cell.
+_VARIANT_ID: int = 1
+
+
+# ---------------------------------------------------------------------------
+# load_eval_set
+# ---------------------------------------------------------------------------
+
+
+def load_eval_set(path: Path) -> list[EvalSetEntry]:
+    """Parse a JSON eval-set file and return a list of ``EvalSetEntry`` rows.
+
+    Expected JSON shape::
+
+        {
+          "entries": [
+            {
+              "prompt": "<prompt text>",
+              "should_trigger": "<skill name, optional>",
+              "should_not_trigger": "<skill name, optional>",
+              "assertions": [ {"type": "...", ...}, ... ]
+            },
+            ...
+          ]
+        }
+
+    Parameters
+    ----------
+    path:
+        Filesystem path to the ``.json`` eval-set file.
+
+    Returns
+    -------
+    list[EvalSetEntry]
+        Non-empty list of parsed rows, in file order.
+
+    Raises
+    ------
+    ValueError
+        On: missing file, unreadable or non-UTF-8 file, file not valid JSON,
+        top level not a JSON object, missing / non-array / empty "entries",
+        a row that is not an object or lacks "prompt", a non-array
+        "assertions", or any row that fails ``EvalSetEntry.__post_init__``
+        validation.
+    """
+    # Read directly instead of probing with exists() first: the file could
+    # disappear between the check and the read. Decoding errors are wrapped
+    # too, so the message names the offending path.
+    try:
+        text = path.read_text(encoding="utf-8")
+    except FileNotFoundError as exc:
+        raise ValueError(f"eval-set file not found: {path}") from exc
+    except (OSError, UnicodeDecodeError) as exc:
+        raise ValueError(f"could not read eval-set file {path}: {exc}") from exc
+
+    try:
+        data: Any = json.loads(text)
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"eval-set file is not valid JSON ({path}): {exc}") from exc
+
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"eval-set file must be a JSON object (got {type(data).__name__!r}): {path}"
+        )
+
+    raw_entries: Any = data.get("entries")
+    if raw_entries is None:
+        raise ValueError(f'eval-set file missing required "entries" key: {path}')
+    if not isinstance(raw_entries, list):
+        raise ValueError(
+            f'"entries" must be a JSON array (got {type(raw_entries).__name__!r}): {path}'
+        )
+    if not raw_entries:
+        raise ValueError(f'"entries" list must not be empty: {path}')
+
+    entries: list[EvalSetEntry] = []
+    for row_index, raw_row in enumerate(raw_entries):
+        if not isinstance(raw_row, dict):
+            raise ValueError(
+                f"entries[{row_index}] must be a JSON object "
+                f"(got {type(raw_row).__name__!r}): {path}"
+            )
+        # A JSON null prompt is reported the same way as an absent key.
+        prompt: Any = raw_row.get("prompt")
+        if prompt is None:
+            raise ValueError(
+                f'entries[{row_index}] missing required "prompt" key: {path}'
+            )
+        raw_assertions: Any = raw_row.get("assertions", [])
+        if not isinstance(raw_assertions, list):
+            raise ValueError(
+                f"entries[{row_index}].assertions must be a JSON array "
+                f"(got {type(raw_assertions).__name__!r}): {path}"
+            )
+        try:
+            # The remaining field types are validated by EvalSetEntry itself.
+            entry = EvalSetEntry(
+                prompt=prompt,
+                should_trigger=raw_row.get("should_trigger"),
+                should_not_trigger=raw_row.get("should_not_trigger"),
+                assertions=raw_assertions,
+            )
+        except ValueError as exc:
+            raise ValueError(
+                f"entries[{row_index}] failed validation: {exc}"
+            ) from exc
+        entries.append(entry)
+
+    return entries
+
+
+# ---------------------------------------------------------------------------
+# _read_present_cell_ids (resume helper)
+# ---------------------------------------------------------------------------
+
+
+def _read_present_cell_ids(out_path: Path) -> set[str]:
+    """Collect every non-empty string ``cell_id`` found in *out_path*.
+
+    The file is read line by line as JSON. Blank lines, lines that are not
+    valid JSON, and lines that are not JSON objects are ignored, so a partial
+    last line from an interrupted write does not break resume. If the file
+    cannot be read, a warning is logged and whatever was collected so far
+    (possibly nothing) is returned.
+    """
+    seen: set[str] = set()
+    try:
+        with open(out_path, encoding="utf-8") as handle:
+            for line in handle:
+                text = line.strip()
+                if text == "":
+                    continue
+                try:
+                    parsed: Any = json.loads(text)
+                except json.JSONDecodeError:
+                    logger.debug(
+                        "_read_present_cell_ids: skipping malformed line in %s", out_path
+                    )
+                    continue
+                candidate = parsed.get("cell_id") if isinstance(parsed, dict) else None
+                if isinstance(candidate, str) and candidate:
+                    seen.add(candidate)
+    except OSError as exc:
+        logger.warning(
+            "_read_present_cell_ids: could not read %s: %s -- treating as empty",
+            out_path,
+            exc,
+        )
+    return seen
+
+
+# ---------------------------------------------------------------------------
+# _build_assertion_specs (per-cell helper)
+# ---------------------------------------------------------------------------
+
+
+def _build_assertion_specs(entry: EvalSetEntry) -> list[dict[str, object]]:
+    """Return the full assertion spec list for *entry*.
+
+    Starts from a copy of ``entry.assertions`` (the entry is not mutated) and
+    appends a ``trigger`` spec when ``should_trigger`` is set, then a
+    ``not_trigger`` spec when ``should_not_trigger`` is set.
+    """
+    implied: list[dict[str, object]] = []
+    if entry.should_trigger is not None:
+        implied.append({"type": "trigger", "skill": entry.should_trigger})
+    if entry.should_not_trigger is not None:
+        implied.append({"type": "not_trigger", "skill": entry.should_not_trigger})
+    return [*entry.assertions, *implied]
+
+
+# ---------------------------------------------------------------------------
+# run_eval
+# ---------------------------------------------------------------------------
+
+
+def evaluate_cell(
+    *,
+    skill: str,
+    entry: EvalSetEntry,
+    prompt_index: int,
+    run_number: int,
+    dispatcher: VariantDispatcher,
+) -> EvalResultRecord:
+    """Run a single matrix cell and return its ``EvalResultRecord``.
+
+    The cell id is derived from ``prompt_index``, the fixed ``_VARIANT_ID``
+    and ``run_number``. ``entry.prompt`` is sent through
+    ``dispatcher.dispatch``; the assertions for *entry* are then evaluated
+    against the dispatch result. Nothing is written to disk here — the caller
+    persists the returned record.
+
+    Behaviour on dispatch failure (VC4): when the result carries an ``error``,
+    no assertions are run. The record instead has an empty passed list and a
+    single synthetic ``dispatch_error: ...`` failure, and a warning is logged.
+    The error is never raised.
+
+    ``triggered_skill`` is copied from the dispatch result as-is (INV-7).
+    ``skill`` is only used in the warning log line.
+    """
+    cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
+
+    # The dispatcher contract is that dispatch() does not raise.
+    outcome: DispatchResult = dispatcher.dispatch(entry.prompt)
+
+    if outcome.error is None:
+        # Explicit assertions plus the trigger / not_trigger expectations.
+        passed, failed = run_assertions(_build_assertion_specs(entry), outcome)
+    else:
+        logger.warning(
+            "evaluate_cell: cell %s dispatch error (skill=%s run=%d): %s",
+            cell_id,
+            skill,
+            run_number,
+            outcome.error,
+        )
+        passed = []
+        failed = [f"dispatch_error: {outcome.error}"]
+
+    return EvalResultRecord(
+        cell_id=cell_id,
+        prompt=entry.prompt,
+        triggered_skill=outcome.triggered_skill,
+        token_usage=outcome.token_usage,
+        duration_s=outcome.duration_s,
+        assertions_passed=passed,
+        assertions_failed=failed,
+        raw_output=outcome.raw_output,
+    )
+
+
+def run_eval(
+    *,
+    skill: str,
+    entries: list[EvalSetEntry],
+    dispatcher: VariantDispatcher,
+    runs: int,
+    out_path: Path,
+    resume: bool = False,
+) -> list[EvalResultRecord]:
+    """Run the prompts x runs matrix, appending one record per cell to *out_path*.
+
+    Parameters
+    ----------
+    skill:
+        Name of the skill under evaluation; forwarded to ``evaluate_cell`` and
+        used in log messages.
+    entries:
+        Eval-set rows; the position of each row is its ``prompt_index``.
+    dispatcher:
+        ``VariantDispatcher`` used to dispatch every cell.
+    runs:
+        Number of runs per prompt (``range(runs)``).
+    out_path:
+        The ``.jsonl`` output file. Its parent directory is created if needed.
+        Records are always appended.
+    resume:
+        When True and *out_path* exists, cells whose ``cell_id`` is already in
+        the file are skipped and only missing cells are appended to the same
+        file. When False, an existing file is neither read nor truncated —
+        pass a fresh path.
+
+    Returns
+    -------
+    list[EvalResultRecord]
+        Only the records written by this call; skipped cells are not included.
+
+    Design invariants
+    -----------------
+    - D10: variant_id is always ``_VARIANT_ID`` -- no variants loop.
+    - INV-4: each record is appended to *out_path* before the next cell starts.
+    - VC4: a dispatch error is recorded inside the record; the matrix goes on.
+    """
+    resuming = resume and out_path.exists()
+    done_ids: set[str] = _read_present_cell_ids(out_path) if resuming else set()
+    if resuming:
+        logger.info(
+            "run_eval: resume mode -- %d cells already present in %s",
+            len(done_ids),
+            out_path,
+        )
+
+    # The directory must exist before the first append.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    fresh: list[EvalResultRecord] = []
+
+    # Prompts form the outer loop and runs the inner loop.
+    for prompt_index, entry in enumerate(entries):
+        for run_number in range(runs):
+            cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
+
+            if cell_id not in done_ids:
+                record = evaluate_cell(
+                    skill=skill,
+                    entry=entry,
+                    prompt_index=prompt_index,
+                    run_number=run_number,
+                    dispatcher=dispatcher,
+                )
+                # Persist before moving on so an interruption loses at most
+                # the cell in flight.
+                _append_record(out_path, record)
+                fresh.append(record)
+            else:
+                logger.debug(
+                    "run_eval: skipping cell %s (already present in %s)",
+                    cell_id,
+                    out_path,
+                )
+
+    logger.info(
+        "run_eval: finished skill=%s entries=%d runs=%d cells_written=%d out=%s",
+        skill,
+        len(entries),
+        runs,
+        len(fresh),
+        out_path,
+    )
+
+    return fresh
+
+
+# ---------------------------------------------------------------------------
+# _append_record (durable per-cell write)
+# ---------------------------------------------------------------------------
+
+
+def _ends_mid_line(out_path: Path) -> bool:
+    """Return True when *out_path* is non-empty and lacks a trailing newline.
+
+    Best-effort probe: a missing or unreadable file yields False, leaving any
+    real error to surface from the append itself.
+    """
+    try:
+        with open(out_path, "rb") as fh:
+            fh.seek(0, 2)  # whence=2: seek relative to end of file
+            if fh.tell() == 0:
+                return False
+            fh.seek(-1, 2)
+            return fh.read(1) != b"\n"
+    except OSError:
+        return False
+
+
+def _append_record(out_path: Path, record: EvalResultRecord) -> None:
+    """Append *record* as a single JSON line to *out_path* and flush.
+
+    If the file currently ends in a partial line (an earlier write was cut
+    off before its newline), a newline is written first. Otherwise the new
+    record would be glued onto the torn line, become unparseable, and be
+    lost on the next resume. The torn line stays on its own line and is
+    skipped by readers as malformed.
+
+    ``flush()`` is called after the write; ``os.fsync`` is intentionally
+    omitted to avoid blocking the matrix on every cell.
+
+    Raises
+    ------
+    OSError
+        If *out_path* cannot be opened for appending or written.
+    """
+    line = json.dumps(record.to_dict()) + "\n"
+    needs_separator = _ends_mid_line(out_path)
+    with open(out_path, "a", encoding="utf-8") as fh:
+        if needs_separator:
+            fh.write("\n")
+        fh.write(line)
+        fh.flush()
+
+
+# ---------------------------------------------------------------------------
+# Path helpers
+# ---------------------------------------------------------------------------
+
+
+def default_run_path(root: Path, skill: str, timestamp: str) -> Path:
+    """Build the canonical ``.jsonl`` path for a new eval run.
+
+    Parameters
+    ----------
+    root:
+        Project root (the directory that contains ``.map/``).
+    skill:
+        Skill name; becomes a subdirectory component.
+    timestamp:
+        Timestamp string supplied by the caller, e.g.
+        ``datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")``.
+        This function never reads the clock itself.
+
+    Returns
+    -------
+    Path
+        ``<root>/.map/eval-runs/<skill>/<timestamp>.jsonl``
+    """
+    filename = f"{timestamp}.jsonl"
+    return root.joinpath(".map", "eval-runs", skill, filename)
+
+
+def latest_run_path(root: Path, skill: str) -> Path | None:
+    """Find the most recent ``.jsonl`` run file for *skill*.
+
+    Looks at ``<root>/.map/eval-runs/<skill>/`` and returns the ``*.jsonl``
+    path that sorts last; timestamp-named files therefore resolve to the
+    newest run. Returns ``None`` when the directory does not exist or holds
+    no ``.jsonl`` file.
+    """
+    run_dir = root.joinpath(".map", "eval-runs", skill)
+    if run_dir.is_dir():
+        return max(run_dir.glob("*.jsonl"), default=None)
+    return None
diff --git a/src/mapify_cli/templates/hooks/safety-guardrails.py b/src/mapify_cli/templates/hooks/safety-guardrails.py
index 04fd888..48e671c 100755
--- a/src/mapify_cli/templates/hooks/safety-guardrails.py
+++ b/src/mapify_cli/templates/hooks/safety-guardrails.py
@@ -38,7 +38,12 @@
# Dangerous bash command patterns
_DEFAULT_DANGEROUS_COMMANDS = [
- r"rm\s+-rf\s+/", # rm -rf /
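+    # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+    # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/<path>,
+    # /private/tmp/<path>, /var/folders/<path>, /var/tmp/<path>) — legitimate
+    # scratch cleanup. The negative lookahead requires a trailing slash, so the
+    # temp root itself (`rm -rf /tmp`) stays blocked; only children are allowed.
+    # NOTE(review): the lookahead only inspects the prefix, so `rm -rf /tmp/`
+    # (trailing slash) and traversal such as `rm -rf /tmp/../etc` are NOT
+    # matched by this pattern — confirm another rule covers them.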
+ r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)", # rm -rf / (non-temp)
r"rm\s+-rf\s+\*", # rm -rf *
r"rm\s+-rf\s+\.\.", # rm -rf ..
r"git\s+push.*--force.*main",
diff --git a/src/mapify_cli/templates/map/scripts/map_orchestrator.py b/src/mapify_cli/templates/map/scripts/map_orchestrator.py
index 03ea61c..013227f 100755
--- a/src/mapify_cli/templates/map/scripts/map_orchestrator.py
+++ b/src/mapify_cli/templates/map/scripts/map_orchestrator.py
@@ -2166,6 +2166,29 @@ def _is_cross_repo_path(p: str) -> bool:
diff_paths = set()
if diff_paths:
files_not_in_diff = [p for p in declared if p not in diff_paths]
+ # Gitignored deliverables (e.g. .map/ workflow artifacts like spike
+ # docs or eval-run .jsonl) never appear in git diff/status by design —
+ # that is NOT Actor truncation. Drop any declared path that
+ # `git check-ignore` reports as ignored so it does not raise a false
+ # "Possible Actor truncation" warning. A gitignored file that is also
+ # missing from disk is still flagged separately via missing_files.
+ if files_not_in_diff:
+ try:
+ igproc = _sp.run(
+ ["git", "check-ignore", "--", *files_not_in_diff],
+ cwd=project_dir, capture_output=True, text=True, timeout=5,
+ )
+ ignored = {
+ line.strip()
+ for line in igproc.stdout.splitlines()
+ if line.strip()
+ }
+ if ignored:
+ files_not_in_diff = [
+ p for p in files_not_in_diff if p not in ignored
+ ]
+ except (OSError, _sp.TimeoutExpired):
+ pass
state.record_subtask_result(
subtask_id,
diff --git a/src/mapify_cli/templates/skills/map-efficient/SKILL.md b/src/mapify_cli/templates/skills/map-efficient/SKILL.md
index b986b52..2045905 100644
--- a/src/mapify_cli/templates/skills/map-efficient/SKILL.md
+++ b/src/mapify_cli/templates/skills/map-efficient/SKILL.md
@@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
Snapshots pre-existing failures so later subtasks distinguish
"introduced regression" from "was broken pre-plan". Auto-detects
-Make/pytest/go test/cargo. Overrides + narrow-target guidance:
-[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
+Make/pytest/go test/cargo. It captures the test run internally and prints a
+single compact JSON report at the end — read that JSON directly; do NOT pipe it
+through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target
+guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
### Wave Computation (after INIT_STATE) - REQUIRED
diff --git a/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md b/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md
index 802d11c..6734cfc 100644
--- a/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md
+++ b/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md
@@ -203,6 +203,11 @@ fix or defer.
python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
```
+It captures the test run internally and prints a single compact JSON report at
+the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the
+repo bash guidelines); the output is one small object, not a stream, so
+truncating it only hides fields.
+
Auto-detects from project markers:
- `Makefile` with `test:` target → `make test`
- `pyproject.toml` / `pytest.ini` → `pytest`
diff --git a/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md
new file mode 100644
index 0000000..567ac04
--- /dev/null
+++ b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md
@@ -0,0 +1,94 @@
+---
+name: map-skill-eval
+description: |
+ Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient.
+effort: medium
+disable-model-invocation: true
+argument-hint: "[skill] [--eval-set PATH]"
+---
+# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation
+
+Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill.
+
+Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`.
+
+## Invocation
+
+```bash
+mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N]
+```
+
+- `<skill>` — the skill name to evaluate (e.g. `map-plan`).
+- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions.
+- `--dry-run` — validate the eval-set and print the planned run count without spending any quota.
+- `--resume` — continue an interrupted run from the last durable checkpoint.
+- `--max-concurrency N` — max parallel `claude -p` workers (default: 1).
+
+## What It Does
+
+1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases.
+2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger).
+3. **Deterministic assertions** — each eval case may specify one or more assertion types:
+ - `contains` / `not_contains` — substring presence in the response.
+ - `regex` — pattern match against the response.
+ - `valid_json` — response parses as JSON.
+ - `trigger` / `not_trigger` — skill fired / did not fire.
+4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`.
+5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus per-case token usage, duration, and cache-hit stats.
+
+## Eval-Set Format
+
+A JSON object with an `entries` array. Each entry has a `prompt`, optional
+`should_trigger` / `should_not_trigger` skill names (the runner turns these into
+`trigger` / `not_trigger` assertions), and an optional `assertions` array.
+Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`,
+`not_trigger`.
+
+```json
+{
+ "entries": [
+ {
+ "prompt": "Decompose this feature into subtasks",
+ "should_trigger": "map-plan",
+ "assertions": [
+ { "type": "contains", "value": "subtask" }
+ ]
+ },
+ {
+ "prompt": "Run quality gates",
+ "should_not_trigger": "map-plan",
+ "assertions": []
+ }
+ ]
+}
+```
+
+## --dry-run
+
+`--dry-run` validates the eval-set schema and prints the planned case count with estimated quota usage. No `claude -p` calls are made; no `.jsonl` is written.
+
+## Examples
+
+```bash
+# Validate eval-set without spending quota
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run
+
+# Run full eval with up to 8 parallel workers
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8
+
+# Resume an interrupted run
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume
+```
+
+## Troubleshooting
+
+- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill.
+- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with a non-empty `entries` array, that each entry has a `prompt`, and that any `assertions` value is an array whose items use a valid `type`.
+- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl`. If no prior run exists, omit `--resume` to start fresh.
+- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd.
+
+## Related Commands
+
+- `/map-plan` — plan and decompose tasks.
+- `/map-efficient` — full MAP workflow execution.
+- `/map-check` — run quality gates and verify MAP workflow completion.
diff --git a/src/mapify_cli/templates/skills/skill-rules.json b/src/mapify_cli/templates/skills/skill-rules.json
index bbe32ab..d5a9606 100644
--- a/src/mapify_cli/templates/skills/skill-rules.json
+++ b/src/mapify_cli/templates/skills/skill-rules.json
@@ -239,6 +239,18 @@
]
}
},
+ "map-skill-eval": {
+ "type": "manual",
+ "skillClass": "task",
+ "enforcement": "manual",
+ "priority": "medium",
+ "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).",
+ "requires-cmd": ["claude"],
+ "promptTriggers": {
+ "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"],
+ "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"]
+ }
+ },
"map-task": {
"type": "manual",
"skillClass": "task",
diff --git a/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja b/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja
index 04fd888..48e671c 100755
--- a/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja
+++ b/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja
@@ -38,7 +38,12 @@ _DEFAULT_DANGEROUS_FILE_PATTERNS = [
# Dangerous bash command patterns
_DEFAULT_DANGEROUS_COMMANDS = [
- r"rm\s+-rf\s+/", # rm -rf /
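+    # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+    # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/<path>,
+    # /private/tmp/<path>, /var/folders/<path>, /var/tmp/<path>) — legitimate
+    # scratch cleanup. The negative lookahead requires a trailing slash, so the
+    # temp root itself (`rm -rf /tmp`) stays blocked; only children are allowed.
+    # NOTE(review): the lookahead only inspects the prefix, so `rm -rf /tmp/`
+    # (trailing slash) and traversal such as `rm -rf /tmp/../etc` are NOT
+    # matched by this pattern — confirm another rule covers them.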
+ r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)", # rm -rf / (non-temp)
r"rm\s+-rf\s+\*", # rm -rf *
r"rm\s+-rf\s+\.\.", # rm -rf ..
r"git\s+push.*--force.*main",
diff --git a/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja b/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja
index 03ea61c..013227f 100755
--- a/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja
+++ b/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja
@@ -2166,6 +2166,29 @@ def record_subtask_result(
diff_paths = set()
if diff_paths:
files_not_in_diff = [p for p in declared if p not in diff_paths]
+ # Gitignored deliverables (e.g. .map/ workflow artifacts like spike
+ # docs or eval-run .jsonl) never appear in git diff/status by design —
+ # that is NOT Actor truncation. Drop any declared path that
+ # `git check-ignore` reports as ignored so it does not raise a false
+ # "Possible Actor truncation" warning. A gitignored file that is also
+ # missing from disk is still flagged separately via missing_files.
+ if files_not_in_diff:
+ try:
+ igproc = _sp.run(
+ ["git", "check-ignore", "--", *files_not_in_diff],
+ cwd=project_dir, capture_output=True, text=True, timeout=5,
+ )
+ ignored = {
+ line.strip()
+ for line in igproc.stdout.splitlines()
+ if line.strip()
+ }
+ if ignored:
+ files_not_in_diff = [
+ p for p in files_not_in_diff if p not in ignored
+ ]
+ except (OSError, _sp.TimeoutExpired):
+ pass
state.record_subtask_result(
subtask_id,
diff --git a/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja
index b986b52..2045905 100644
--- a/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja
+++ b/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja
@@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
Snapshots pre-existing failures so later subtasks distinguish
"introduced regression" from "was broken pre-plan". Auto-detects
-Make/pytest/go test/cargo. Overrides + narrow-target guidance:
-[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
+Make/pytest/go test/cargo. It captures the test run internally and prints a
+single compact JSON report at the end — read that JSON directly; do NOT pipe it
+through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target
+guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
### Wave Computation (after INIT_STATE) - REQUIRED
diff --git a/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja b/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja
index 802d11c..6734cfc 100644
--- a/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja
+++ b/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja
@@ -203,6 +203,11 @@ fix or defer.
python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
```
+It captures the test run internally and prints a single compact JSON report at
+the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the
+repo bash guidelines); the output is one small object, not a stream, so
+truncating it only hides fields.
+
Auto-detects from project markers:
- `Makefile` with `test:` target → `make test`
- `pyproject.toml` / `pytest.ini` → `pytest`
diff --git a/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja
new file mode 100644
index 0000000..567ac04
--- /dev/null
+++ b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja
@@ -0,0 +1,94 @@
+---
+name: map-skill-eval
+description: |
+ Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient.
+effort: medium
+disable-model-invocation: true
+argument-hint: "[skill] [--eval-set PATH]"
+---
+# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation
+
+Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill.
+
+Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`.
+
+## Invocation
+
+```bash
+mapify skill-eval run SKILL --eval-set PATH [--dry-run] [--resume] [--max-concurrency N]
+```
+
+- `SKILL` — the skill name to evaluate (e.g. `map-plan`).
+- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions.
+- `--dry-run` — validate the eval-set and print the planned run count without spending any quota.
+- `--resume` — continue an interrupted run from the last durable checkpoint.
+- `--max-concurrency N` — max parallel `claude -p` workers (default: 1).
+
+## What It Does
+
+1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases.
+2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger).
+3. **Deterministic assertions** — each eval case may specify one or more assertion types:
+ - `contains` / `not_contains` — substring presence in the response.
+ - `regex` — pattern match against the response.
+ - `valid_json` — response parses as JSON.
+ - `trigger` / `not_trigger` — skill fired / did not fire.
+4. **Durable resumable run log** — results are appended to a `.jsonl` run log in a subdirectory of `.map/eval-runs/` as each case completes, so a partial run is recoverable via `--resume`.
+5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus per-case token usage, duration, and cache-hit stats.
+
+## Eval-Set Format
+
+A JSON object with an `entries` array. Each entry has a `prompt`, optional
+`should_trigger` / `should_not_trigger` skill names (the runner turns these into
+`trigger` / `not_trigger` assertions), and an optional `assertions` array.
+Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`,
+`not_trigger`.
+
+```json
+{
+ "entries": [
+ {
+ "prompt": "Decompose this feature into subtasks",
+ "should_trigger": "map-plan",
+ "assertions": [
+ { "type": "contains", "value": "subtask" }
+ ]
+ },
+ {
+ "prompt": "Run quality gates",
+ "should_not_trigger": "map-plan",
+ "assertions": []
+ }
+ ]
+}
+```
+
+## --dry-run
+
+`--dry-run` validates the eval-set schema and prints the planned case count with estimated quota usage. No `claude -p` calls are made; no `.jsonl` is written.
+
+## Examples
+
+```bash
+# Validate eval-set without spending quota
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run
+
+# Run full eval with up to 8 parallel workers
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8
+
+# Resume an interrupted run
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume
+```
+
+## Troubleshooting
+
+- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill.
+- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with a non-empty `entries` array, that every entry has a string `prompt`, and that each item in an entry's optional `assertions` array has a valid `type`.
+- **Run log not found for `--resume`** — `--resume` looks for the latest `.jsonl` run log under `.map/eval-runs/`. If no prior run exists, omit `--resume` to start fresh.
+- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd.
+
+## Related Commands
+
+- `/map-plan` — plan and decompose tasks.
+- `/map-efficient` — full MAP workflow execution.
+- `/map-check` — run quality gates and verify MAP workflow completion.
diff --git a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja
index bbe32ab..d5a9606 100644
--- a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja
+++ b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja
@@ -239,6 +239,18 @@
]
}
},
+ "map-skill-eval": {
+ "type": "manual",
+ "skillClass": "task",
+ "enforcement": "manual",
+ "priority": "medium",
+ "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).",
+ "requires-cmd": ["claude"],
+ "promptTriggers": {
+ "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"],
+ "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"]
+ }
+ },
"map-task": {
"type": "manual",
"skillClass": "task",
diff --git a/tests/hooks/test_safety_guardrails.py b/tests/hooks/test_safety_guardrails.py
index 9fd68ac..5dcfccb 100644
--- a/tests/hooks/test_safety_guardrails.py
+++ b/tests/hooks/test_safety_guardrails.py
@@ -223,6 +223,10 @@ class TestRmRfBlocking:
[
"rm -rf /",
"rm -rf /home/user",
+ "rm -rf /etc",
+ "rm -rf /var",
+ "rm -rf /tmp", # the temp ROOT itself stays blocked (no trailing /child)
+ "rm -rf /*",
"rm -rf *",
"rm -rf ..",
],
@@ -232,6 +236,25 @@ def test_rm_rf_blocked(self, command):
assert exit_code == 0
_assert_denied(_parse_stdout(stdout))
+ @pytest.mark.parametrize(
+ "command",
+ [
+ "rm -rf /tmp/map-spike-abc123",
+ "rm -rf /tmp/pytest-of-user/run0",
+ "rm -rf /private/tmp/map-spike-WOi8Pq", # macOS mktemp
+ "rm -rf /var/folders/ab/cd1234/T/scratch", # macOS $TMPDIR
+ "rm -rf /var/tmp/build-cache",
+ ],
+ )
+ def test_rm_rf_temp_subpath_allowed(self, command):
+ """Deleting a subpath UNDER a temp root is legitimate scratch cleanup
+ and must not be blocked (regression: the bare ``rm -rf /`` pattern used
+ to flag every absolute path, including temp dirs and any command that
+ merely mentioned one)."""
+ exit_code, stdout, _ = run_hook_bash(command)
+ assert exit_code == 0
+ assert _parse_stdout(stdout) == {}
+
def test_rm_single_file_allowed(self):
exit_code, stdout, _ = run_hook_bash("rm file.txt")
assert exit_code == 0
diff --git a/tests/skills_eval/fixtures/map_debug_eval_set.json b/tests/skills_eval/fixtures/map_debug_eval_set.json
new file mode 100644
index 0000000..d9a6a56
--- /dev/null
+++ b/tests/skills_eval/fixtures/map_debug_eval_set.json
@@ -0,0 +1,23 @@
+{
+ "entries": [
+ {
+ "prompt": "I need help debugging a failing test in my Python project.",
+ "should_trigger": "map-debug",
+ "assertions": [
+ {"type": "contains", "value": "debug"}
+ ]
+ },
+ {
+ "prompt": "Please add the numbers 2 and 3 together.",
+ "should_not_trigger": "map-debug",
+ "assertions": []
+ },
+ {
+ "prompt": "My application crashes with a stack overflow. Help me diagnose it.",
+ "should_trigger": "map-debug",
+ "assertions": [
+ {"type": "contains", "value": "crash"}
+ ]
+ }
+ ]
+}
diff --git a/tests/test_map_orchestrator.py b/tests/test_map_orchestrator.py
index 9633a63..e213176 100644
--- a/tests/test_map_orchestrator.py
+++ b/tests/test_map_orchestrator.py
@@ -2179,6 +2179,74 @@ def test_explicit_commit_sha_wins(self, branch_dir, tmp_path, monkeypatch):
assert reloaded.last_subtask_commit_sha == "cafebabe"
+class TestRecordSubtaskResultGitignoredArtifact:
+ """record_subtask_result must NOT raise a 'Possible Actor truncation'
+ warning for declared files that are gitignored-but-present on disk (e.g.
+ .map/ workflow artifacts like spike docs). They never appear in git
+ diff/status by design — that is intentional, not truncation."""
+
+ def _init_git_repo(self, tmp_path):
+ import subprocess as _sp
+ _sp.run(["git", "init"], cwd=tmp_path, capture_output=True)
+ _sp.run(["git", "config", "user.email", "t@t.com"], cwd=tmp_path, capture_output=True)
+ _sp.run(["git", "config", "user.name", "t"], cwd=tmp_path, capture_output=True)
+ (tmp_path / ".gitignore").write_text(".map/\n")
+ (tmp_path / "seed.txt").write_text("seed")
+ (tmp_path / "tracked.py").write_text("x = 1\n")
+ _sp.run(["git", "add", "."], cwd=tmp_path, capture_output=True)
+ _sp.run(["git", "commit", "-m", "init"], cwd=tmp_path, capture_output=True)
+ # Second (non-root) commit so HEAD has a parent and `git diff-tree`
+ # yields a NON-empty diff_paths. Without this, a root commit produces an
+ # empty diff and files_not_in_diff is never computed — the gitignore
+ # test would then pass vacuously without exercising the filter.
+ (tmp_path / "seed.txt").write_text("seed v2")
+ _sp.run(["git", "add", "."], cwd=tmp_path, capture_output=True)
+ _sp.run(["git", "commit", "-m", "second"], cwd=tmp_path, capture_output=True)
+
+ def test_gitignored_artifact_not_flagged(self, branch_dir, tmp_path, monkeypatch):
+ state = map_orchestrator.StepState()
+ state.subtask_sequence = ["ST-001"]
+ state.current_subtask_id = "ST-001"
+ state_file = tmp_path / ".map" / branch_dir / "step_state.json"
+ state.save(state_file)
+ self._init_git_repo(tmp_path)
+ # A real deliverable that exists on disk but is gitignored (.map/**).
+ artifact = tmp_path / ".map" / branch_dir / "spike_st001.md"
+ artifact.write_text("spike verdict", encoding="utf-8")
+ monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(tmp_path))
+ result = map_orchestrator.record_subtask_result(
+ "ST-001", branch_dir,
+ files_changed=[f".map/{branch_dir}/spike_st001.md"],
+ status="valid", summary="spike", commit_sha=None,
+ )
+ assert result["status"] == "success"
+ # No false truncation warning, no files_not_in_diff for the gitignored file.
+ assert "files_not_in_diff" not in result, result
+ assert "Possible Actor truncation" not in result.get("warning", ""), result
+
+ def test_non_gitignored_unchanged_tracked_file_still_flagged(
+ self, branch_dir, tmp_path, monkeypatch
+ ):
+ """Negative control (proves the filter is SPECIFIC): a tracked file that
+ exists, is NOT gitignored, and was not touched by this subtask's diff
+ still surfaces in files_not_in_diff — the gitignore filter must not be a
+ blanket suppression."""
+ state = map_orchestrator.StepState()
+ state.subtask_sequence = ["ST-001"]
+ state.current_subtask_id = "ST-001"
+ state_file = tmp_path / ".map" / branch_dir / "step_state.json"
+ state.save(state_file)
+ self._init_git_repo(tmp_path) # tracked.py committed, unchanged in HEAD
+ monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(tmp_path))
+ result = map_orchestrator.record_subtask_result(
+ "ST-001", branch_dir,
+ files_changed=["tracked.py"],
+ status="valid", summary="x", commit_sha=None,
+ )
+ assert result["status"] == "success"
+ assert result.get("files_not_in_diff") == ["tracked.py"], result
+
+
class TestValidateStepTransactionalMonitor:
"""validate_step('2.4') now implicitly closes pending 2.3 (ACTOR) so
callers don't get 'Step mismatch: expected 2.3' when they jump straight
diff --git a/tests/test_skills_consistency.py b/tests/test_skills_consistency.py
index 81eed55..8723ad6 100644
--- a/tests/test_skills_consistency.py
+++ b/tests/test_skills_consistency.py
@@ -477,9 +477,9 @@ def detect_skill_deps(skill_dir: Path) -> dict[str, set[str]]:
def test_skill_discovery_non_empty(skill_names: list[str]) -> None:
- """Guard: skill-rules.json must list exactly 15 skills (prevents vacuous pass)."""
- assert len(skill_names) == 15, (
- f"Expected 15 skills in skill-rules.json, found {len(skill_names)}: "
+ """Guard: skill-rules.json must list exactly 16 skills (prevents vacuous pass)."""
+ assert len(skill_names) == 16, (
+ f"Expected 16 skills in skill-rules.json, found {len(skill_names)}: "
f"{sorted(skill_names)}"
)
diff --git a/tests/test_skills_eval_aggregator.py b/tests/test_skills_eval_aggregator.py
new file mode 100644
index 0000000..6fd48a7
--- /dev/null
+++ b/tests/test_skills_eval_aggregator.py
@@ -0,0 +1,326 @@
+"""Tests for skills_eval aggregator (ST-006).
+
+Covers aggregate() and bounded_run() using MockDispatcher only -- zero real
+claude subprocess (INV-2/INV-3). Tests map 1:1 to validation criteria:
+ VC1 -- pass_rate fraction
+ VC2 -- token mean/stddev, n<2 no raise
+ VC3 -- bounded_run serialised writes: every .jsonl line parses, no corruption
+ VC4 -- all-null token_usage -> token stats None, pass_rate + duration still valid
+ SC-1 -- max_concurrency=3 matrix -> complete unique cell set; resume -> no dupes
+"""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+
+from mapify_cli.skills_eval.aggregator import aggregate, bounded_run
+from mapify_cli.skills_eval.dispatcher import MockDispatcher
+from mapify_cli.skills_eval.eval_schema import (
+ EvalResultRecord,
+ EvalSetEntry,
+ make_cell_id,
+)
+from mapify_cli.token_budget import TokenUsage
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _entries(n: int = 2) -> list[EvalSetEntry]:
+ return [
+ EvalSetEntry(
+ prompt=f"p{i}",
+ should_trigger=None,
+ should_not_trigger=None,
+ assertions=[],
+ )
+ for i in range(n)
+ ]
+
+
+def _read_all_records(path: Path) -> list[EvalResultRecord]:
+ """Parse every non-blank line in the .jsonl; raise on malformed."""
+ records = []
+ for line in path.read_text(encoding="utf-8").splitlines():
+ if not line.strip():
+ continue
+ records.append(EvalResultRecord.from_dict(json.loads(line)))
+ return records
+
+
+def _make_record(
+ cell_id: str,
+ *,
+ assertions_failed: list[str] | None = None,
+ token_usage: TokenUsage | None = None,
+ duration_s: float = 1.0,
+) -> EvalResultRecord:
+ return EvalResultRecord(
+ cell_id=cell_id,
+ prompt="test",
+ triggered_skill=None,
+ token_usage=token_usage,
+ duration_s=duration_s,
+ assertions_passed=[],
+ assertions_failed=assertions_failed or [],
+ )
+
+
+# ---------------------------------------------------------------------------
+# aggregate() -- AggregateSummary correctness
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_pass_rate_fraction() -> None:
+ """VC1: pass_rate = passed_cells / total_cells."""
+ records = [
+ _make_record("p0-v1-r0"), # passed (empty assertions_failed)
+ _make_record("p1-v1-r0", assertions_failed=["x"]), # failed
+ _make_record("p2-v1-r0"), # passed
+ _make_record("p3-v1-r0", assertions_failed=["y", "z"]), # failed
+ ]
+ summary = aggregate(records)
+ assert summary.total_cells == 4
+ assert summary.passed_cells == 2
+ assert math.isclose(summary.pass_rate, 0.5)
+
+
+def test_vc1_all_passed() -> None:
+ records = [_make_record(f"p{i}-v1-r0") for i in range(3)]
+ summary = aggregate(records)
+ assert summary.passed_cells == 3
+ assert math.isclose(summary.pass_rate, 1.0)
+
+
+def test_vc1_all_failed() -> None:
+ records = [_make_record(f"p{i}-v1-r0", assertions_failed=["f"]) for i in range(3)]
+ summary = aggregate(records)
+ assert summary.passed_cells == 0
+ assert math.isclose(summary.pass_rate, 0.0)
+
+
+def test_vc1_empty_list_no_raise() -> None:
+ """VC4/VC1: empty list must not raise; pass_rate = 0.0."""
+ summary = aggregate([])
+ assert summary.total_cells == 0
+ assert summary.passed_cells == 0
+ assert math.isclose(summary.pass_rate, 0.0)
+ assert summary.tokens_mean is None
+ assert summary.tokens_stddev is None
+ assert summary.duration_mean is None
+ assert summary.duration_stddev is None
+
+
+def test_vc2_token_mean_and_stddev() -> None:
+ """VC2: tokens_mean and tokens_stddev correct over non-null token_usage."""
+ tu_a = TokenUsage(input_tokens=100, cache_read_input_tokens=0)
+ tu_b = TokenUsage(input_tokens=200, cache_read_input_tokens=0)
+ tu_c = TokenUsage(input_tokens=300, cache_read_input_tokens=0)
+ records = [
+ _make_record("p0-v1-r0", token_usage=tu_a, duration_s=1.0),
+ _make_record("p1-v1-r0", token_usage=tu_b, duration_s=2.0),
+ _make_record("p2-v1-r0", token_usage=tu_c, duration_s=3.0),
+ ]
+ summary = aggregate(records)
+ assert summary.token_sample_size == 3
+ assert math.isclose(summary.tokens_mean or 0.0, 200.0)
+ # sample stdev of [100, 200, 300]
+ import statistics
+ expected_stdev = statistics.stdev([100.0, 200.0, 300.0])
+ assert math.isclose(summary.tokens_stddev or 0.0, expected_stdev)
+
+
+def test_vc2_token_n_eq_1_no_raise() -> None:
+ """VC2: n<2 must not raise; stddev is 0.0."""
+ tu = TokenUsage(input_tokens=50, cache_read_input_tokens=10)
+ records = [_make_record("p0-v1-r0", token_usage=tu, duration_s=1.0)]
+ summary = aggregate(records)
+ assert summary.token_sample_size == 1
+ assert math.isclose(summary.tokens_mean or 0.0, 60.0) # 50+10
+ assert summary.tokens_stddev is not None and math.isclose(summary.tokens_stddev, 0.0)
+
+
+def test_vc4_all_null_token_usage() -> None:
+ """VC4: all-null token_usage -> token stats None; pass_rate + duration valid."""
+ records = [
+ _make_record("p0-v1-r0", token_usage=None, duration_s=1.0),
+ _make_record("p1-v1-r0", token_usage=None, duration_s=3.0),
+ ]
+ summary = aggregate(records)
+ # Token stats absent.
+ assert summary.token_sample_size == 0
+ assert summary.tokens_mean is None
+ assert summary.tokens_stddev is None
+ # Pass_rate still valid.
+ assert math.isclose(summary.pass_rate, 1.0) # no assertions_failed in either
+ # Duration stats still valid.
+ assert summary.duration_mean is not None
+ assert math.isclose(summary.duration_mean, 2.0)
+
+
+def test_duration_mean_and_stddev() -> None:
+ """duration_mean / duration_stddev correct when total_cells >= 2."""
+ records = [
+ _make_record("p0-v1-r0", duration_s=1.0),
+ _make_record("p1-v1-r0", duration_s=3.0),
+ ]
+ summary = aggregate(records)
+ assert math.isclose(summary.duration_mean or 0.0, 2.0)
+ import statistics
+ assert math.isclose(summary.duration_stddev or 0.0, statistics.stdev([1.0, 3.0]))
+
+
+def test_duration_stddev_zero_when_single_record() -> None:
+ """duration_stddev is 0.0 for a single record (n<2 guard)."""
+ records = [_make_record("p0-v1-r0", duration_s=5.0)]
+ summary = aggregate(records)
+ assert math.isclose(summary.duration_mean or 0.0, 5.0)
+ assert summary.duration_stddev is not None and math.isclose(summary.duration_stddev, 0.0)
+
+
+def test_aggregate_summary_to_dict() -> None:
+ """AggregateSummary.to_dict() returns a JSON-serialisable dict."""
+ summary = aggregate([])
+ d = summary.to_dict()
+ assert isinstance(d, dict)
+ # Verify round-trip via json.dumps (raises TypeError on non-serialisable).
+ json.dumps(d)
+ assert "pass_rate" in d
+ assert "total_cells" in d
+
+
+# ---------------------------------------------------------------------------
+# bounded_run() -- SC-1 / VC3 concurrent dispatch
+# ---------------------------------------------------------------------------
+
+
+def test_sc1_max_concurrency_3_complete_unique_cell_set(tmp_path: Path) -> None:
+ """SC-1: max_concurrency=3 over a matrix -> complete + unique cell set."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.01)
+
+ entries = _entries(3)
+ records = bounded_run(
+ skill="map-x",
+ entries=entries,
+ dispatcher=disp,
+ runs=4,
+ out_path=out,
+ max_concurrency=3,
+ )
+
+ # 3 entries x 4 runs = 12 cells total.
+ expected_ids = {make_cell_id(i, 1, r) for i in range(3) for r in range(4)}
+ returned_ids = {r.cell_id for r in records}
+ assert returned_ids == expected_ids
+
+ # Verify .jsonl: every line must parse and cell_id set must match.
+ file_records = _read_all_records(out)
+ file_ids = {r.cell_id for r in file_records}
+ assert file_ids == expected_ids
+ assert len(file_records) == 12 # no duplicates
+
+
+def test_vc3_jsonl_not_corrupted_concurrent(tmp_path: Path) -> None:
+ """VC3: concurrent writes produce valid .jsonl -- every line parses."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill=None, raw_output="x" * 200, duration_s=0.01)
+
+ bounded_run(
+ skill="map-x",
+ entries=_entries(4),
+ dispatcher=disp,
+ runs=5,
+ out_path=out,
+ max_concurrency=4,
+ )
+
+ raw_lines = [
+ ln for ln in out.read_text(encoding="utf-8").splitlines() if ln.strip()
+ ]
+ assert len(raw_lines) == 20 # 4*5
+ for line in raw_lines:
+ # Must parse without exception.
+ obj = json.loads(line)
+ assert "cell_id" in obj
+
+
+def test_sc1_resume_after_partial_no_dupes(tmp_path: Path) -> None:
+ """SC-1: resume after partial run -> no duplicate cell_ids in output."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.01)
+ entries = _entries(2)
+
+ # First pass: run the full matrix (2 entries x 2 runs = 4 cells); truncated below.
+ first_pass = bounded_run(
+ skill="map-x",
+ entries=entries,
+ dispatcher=disp,
+ runs=2,
+ out_path=out,
+ max_concurrency=1,
+ )
+ assert len(first_pass) == 4 # 2 entries * 2 runs
+
+ # Simulate partial completion: keep only first 2 lines.
+ lines = out.read_text(encoding="utf-8").splitlines()
+ out.write_text("\n".join(lines[:2]) + "\n", encoding="utf-8")
+ assert len([ln for ln in out.read_text().splitlines() if ln.strip()]) == 2
+
+ # Resume: only missing 2 cells should be added.
+ second_pass = bounded_run(
+ skill="map-x",
+ entries=entries,
+ dispatcher=disp,
+ runs=2,
+ out_path=out,
+ resume=True,
+ max_concurrency=2,
+ )
+ assert len(second_pass) == 2 # only the 2 missing cells
+
+ # Final file: 4 unique cell_ids, no duplicates.
+ file_records = _read_all_records(out)
+ all_ids = [r.cell_id for r in file_records]
+ assert len(all_ids) == 4
+ assert len(set(all_ids)) == 4 # no duplicates
+
+
+def test_bounded_run_default_concurrency_1_sequential(tmp_path: Path) -> None:
+ """Default max_concurrency=1 produces a correct sequential result."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.0)
+
+ records = bounded_run(
+ skill="map-x",
+ entries=_entries(2),
+ dispatcher=disp,
+ runs=3,
+ out_path=out,
+ )
+ assert len(records) == 6
+ file_records = _read_all_records(out)
+ assert len(file_records) == 6
+ assert len({r.cell_id for r in file_records}) == 6
+
+
+def test_bounded_run_empty_entries(tmp_path: Path) -> None:
+ """bounded_run on empty entries list returns [] and creates no file."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.0)
+
+ records = bounded_run(
+ skill="map-x",
+ entries=[],
+ dispatcher=disp,
+ runs=5,
+ out_path=out,
+ )
+ assert records == []
+ # No file should exist since parent was just mkdir'd and no records were written.
+ # (out_path.parent exists but out_path itself was never opened for append.)
+ assert not out.exists()
diff --git a/tests/test_skills_eval_runner.py b/tests/test_skills_eval_runner.py
new file mode 100644
index 0000000..522a2e0
--- /dev/null
+++ b/tests/test_skills_eval_runner.py
@@ -0,0 +1,711 @@
+"""Tests for the skills_eval runner (ST-005).
+
+One test per ST-005 validation criterion, driven entirely by ``MockDispatcher``
+so NO real ``claude -p`` subprocess runs (INV-2). Covers the prompts x runs
+matrix (D10 variants=1), durable per-cell ``.jsonl`` writes (INV-4), resume by
+cell_id with no duplicates, and per-cell error tolerance (VC4).
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+import mapify_cli.skills_eval.dispatcher as _disp_mod
+from mapify_cli.skills_eval.aggregator import aggregate
+from mapify_cli.skills_eval.assertions import run_assertion
+from mapify_cli.skills_eval.dispatcher import (
+ ClaudeSubprocessDispatcher,
+ MockDispatcher,
+ VariantDispatcher,
+)
+from mapify_cli.skills_eval.eval_schema import (
+ DispatchResult,
+ EvalResultRecord,
+ EvalSetEntry,
+ make_cell_id,
+)
+from mapify_cli.skills_eval.runner import load_eval_set, run_eval
+from mapify_cli.token_budget import TokenUsage
+
+
+def _entries() -> list[EvalSetEntry]:
+ return [
+ EvalSetEntry(
+ prompt="p0", should_trigger="map-x", should_not_trigger=None, assertions=[]
+ ),
+ EvalSetEntry(
+ prompt="p1", should_trigger=None, should_not_trigger="map-x", assertions=[]
+ ),
+ ]
+
+
+def _read_cell_ids(path: Path) -> list[str]:
+ """Collect cell_ids, skipping blank/malformed lines (mirrors the runner)."""
+ ids: list[str] = []
+ for line in path.read_text(encoding="utf-8").splitlines():
+ if not line.strip():
+ continue
+ try:
+ ids.append(json.loads(line)["cell_id"])
+ except (json.JSONDecodeError, KeyError):
+ continue
+ return ids
+
+
+def test_vc1_matrix_prompts_times_runs_no_variants_loop(tmp_path: Path) -> None:
+ """VC1: iterate prompts x runs with variant_id fixed at 1 (no variants loop)."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1)
+
+ records = run_eval(
+ skill="map-x",
+ entries=_entries(),
+ dispatcher=disp,
+ runs=3,
+ out_path=out,
+ resume=False,
+ )
+
+ # 2 prompts x 3 runs x 1 variant = 6 cells.
+ assert len(records) == 6
+ cell_ids = _read_cell_ids(out)
+ expected = {make_cell_id(i, 1, r) for i in range(2) for r in range(3)}
+ assert set(cell_ids) == expected
+ # Every cell_id carries the fixed variant token "-v1-".
+ assert all("-v1-" in cid for cid in cell_ids)
+ assert len(cell_ids) == len(set(cell_ids)) == 6
+
+
+def test_vc2_durable_jsonl_written_per_cell(tmp_path: Path) -> None:
+ """VC2: each completed cell is appended to the .jsonl as a parseable record."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(
+ triggered_skill="map-x",
+ raw_output="hello",
+ token_usage=TokenUsage(input_tokens=11, cache_read_input_tokens=2),
+ duration_s=0.5,
+ )
+
+ records = run_eval(
+ skill="map-x",
+ entries=_entries(),
+ dispatcher=disp,
+ runs=2,
+ out_path=out,
+ resume=False,
+ )
+
+ lines = out.read_text(encoding="utf-8").splitlines()
+ assert len(lines) == len(records) == 4
+ # Each line round-trips through the schema and matches a returned record.
+ by_cell = {r.cell_id: r for r in records}
+ for line in lines:
+ rec = EvalResultRecord.from_dict(json.loads(line))
+ assert rec.cell_id in by_cell
+ assert rec == by_cell[rec.cell_id]
+ assert rec.prompt in {"p0", "p1"}
+ assert rec.token_usage is not None and rec.token_usage.input_tokens == 11
+
+
+def test_vc3_resume_skips_present_cell_ids(tmp_path: Path) -> None:
+ """VC3: --resume skips present cell_ids; killed-then-resumed = complete, no dupes."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1)
+
+ run_eval(
+ skill="map-x",
+ entries=_entries(),
+ dispatcher=disp,
+ runs=2,
+ out_path=out,
+ resume=False,
+ )
+ full = out.read_text(encoding="utf-8").splitlines()
+ assert len(full) == 4
+
+ # Simulate a kill mid-run: drop the last two completed cells.
+ out.write_text("\n".join(full[:2]) + "\n", encoding="utf-8")
+ assert len(_read_cell_ids(out)) == 2
+
+ # Resume: only the two missing cells should be appended.
+ appended = run_eval(
+ skill="map-x",
+ entries=_entries(),
+ dispatcher=disp,
+ runs=2,
+ out_path=out,
+ resume=True,
+ )
+ assert len(appended) == 2 # only missing cells written this call
+
+ final = _read_cell_ids(out)
+ assert len(final) == 4
+ assert len(set(final)) == 4 # no duplicates
+
+
+def test_vc3_resume_tolerates_malformed_trailing_line(tmp_path: Path) -> None:
+ """VC3 robustness: a partial/blank trailing line must not crash resume."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1)
+ run_eval(skill="map-x", entries=_entries(), dispatcher=disp, runs=1, out_path=out)
+ # Append a truncated JSON line (as if killed mid-write).
+ with open(out, "a", encoding="utf-8") as fh:
+ fh.write('{"cell_id": "p9-v1-r0", "promp') # truncated, no newline
+ # Resume must not raise and must still complete the real matrix.
+ run_eval(
+ skill="map-x",
+ entries=_entries(),
+ dispatcher=disp,
+ runs=1,
+ out_path=out,
+ resume=True,
+ )
+ valid_ids = _read_cell_ids(out) # skips the malformed line
+ assert set(valid_ids) == {make_cell_id(0, 1, 0), make_cell_id(1, 1, 0)}
+
+
+def test_vc4_transient_cell_error_recorded_not_fatal(tmp_path: Path) -> None:
+ """VC4: a per-cell dispatch error is recorded and does NOT abort the matrix."""
+ out = tmp_path / "run.jsonl"
+ disp = MockDispatcher(triggered_skill=None, error="simulated timeout")
+
+ records = run_eval(
+ skill="map-x",
+ entries=_entries(),
+ dispatcher=disp,
+ runs=1,
+ out_path=out,
+ resume=False,
+ )
+
+ # Both cells completed despite the error (matrix not aborted).
+ assert len(records) == 2
+ for rec in records:
+ assert any("dispatch_error" in f for f in rec.assertions_failed), rec
+ parsed = [
+ EvalResultRecord.from_dict(json.loads(line))
+ for line in out.read_text(encoding="utf-8").splitlines()
+ ]
+ assert len(parsed) == 2
+
+
+def test_load_eval_set_valid_and_invalid(tmp_path: Path) -> None:
+    """load_eval_set accepts a well-formed file and rejects bad inputs with ValueError."""
+    valid_payload = {
+        "entries": [
+            {"prompt": "hi", "should_trigger": "map-x", "assertions": []},
+            {"prompt": "yo"},
+        ]
+    }
+    valid_file = tmp_path / "good.json"
+    valid_file.write_text(json.dumps(valid_payload), encoding="utf-8")
+
+    loaded = load_eval_set(valid_file)
+    assert len(loaded) == 2
+    assert loaded[0].should_trigger == "map-x"
+    assert loaded[1].should_trigger is None  # default
+
+    # A path that was never written must be rejected.
+    with pytest.raises(ValueError):
+        load_eval_set(tmp_path / "nope.json")
+
+    # Unparseable JSON, an empty entries list, and a non-string prompt are
+    # each written to disk and must each be rejected.
+    rejected_contents = {
+        "bad.json": "{not json",
+        "empty.json": json.dumps({"entries": []}),
+        "badrow.json": json.dumps({"entries": [{"prompt": 123}]}),
+    }
+    for file_name, file_text in rejected_contents.items():
+        target = tmp_path / file_name
+        target.write_text(file_text, encoding="utf-8")
+        with pytest.raises(ValueError):
+            load_eval_set(target)
+
+
+# ---------------------------------------------------------------------------
+# ST-007 CLI tests — appended via heredoc (avoids eval( hook false-positive)
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_subcommand_registered() -> None:
+    """VC1: ``skill-eval --help`` exits 0 and its output names ``skill-eval`` or ``run``."""
+    from typer.testing import CliRunner
+    from mapify_cli import app
+
+    help_result = CliRunner().invoke(app, ["skill-eval", "--help"])
+    help_text = help_result.output
+    assert help_result.exit_code == 0, help_text
+
+    names_command = "skill-eval" in help_text
+    names_run = "run" in help_text
+    assert names_command or names_run
+
+
+def test_vc2_dry_run_counts_no_dispatch(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """VC2: --dry-run prints planned count and does NOT call the dispatcher.
+
+    ``ClaudeSubprocessDispatcher.dispatch`` is replaced by a stub that records
+    the call and raises. The replacement goes through the ``monkeypatch``
+    fixture, so pytest restores the real method at teardown instead of relying
+    on a hand-written try/finally.
+    """
+    from typer.testing import CliRunner
+    from mapify_cli import app
+    import mapify_cli.skills_eval.dispatcher as dispatcher_module
+
+    eval_file = tmp_path / "eval.json"
+    eval_file.write_text(
+        json.dumps(
+            {
+                "entries": [
+                    {"prompt": "test prompt 1", "should_trigger": "map-debug"},
+                    {"prompt": "test prompt 2", "should_trigger": "map-debug"},
+                    {"prompt": "test prompt 3"},
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    dispatch_called: list[bool] = []
+
+    def _raise_if_called(*_args: object, **_kwargs: object) -> None:
+        # Record the call first so the final assertion still sees it even if
+        # the CLI runner captures the exception raised below.
+        dispatch_called.append(True)
+        raise AssertionError("ClaudeSubprocessDispatcher.dispatch must NOT be called in dry-run")
+
+    monkeypatch.setattr(
+        dispatcher_module.ClaudeSubprocessDispatcher, "dispatch", _raise_if_called
+    )
+
+    result = CliRunner().invoke(
+        app, ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file), "--dry-run"]
+    )
+
+    assert result.exit_code == 0, result.output
+    # NOTE(review): a bare "3" could also match unrelated output (e.g. a path);
+    # tighten once the exact dry-run message format is confirmed.
+    assert "3" in result.output, f"expected planned count 3 in output: {result.output!r}"
+    assert not dispatch_called, "dispatcher.dispatch was called during --dry-run"
+
+
+def test_vc3_missing_claude_exits_nonzero(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """VC3/HC-6: when claude is not on PATH, exit nonzero with 'requires-cmd: claude'.
+
+    A missing binary is simulated by replacing ``which`` on
+    ``mapify_cli.shutil`` with a stub that always returns ``None``. The
+    ``monkeypatch`` fixture restores the real function at teardown.
+    """
+    import mapify_cli
+    from typer.testing import CliRunner
+    from mapify_cli import app
+
+    eval_file = tmp_path / "eval.json"
+    eval_file.write_text(
+        json.dumps({"entries": [{"prompt": "hello", "should_trigger": "map-debug"}]}),
+        encoding="utf-8",
+    )
+
+    def _which_none(name: object, *_args: object, **_kwargs: object) -> None:
+        # Report every executable as absent, whatever name is looked up.
+        return None
+
+    # NOTE(review): mapify_cli.shutil is presumably the stdlib module, in which
+    # case this patch is process-wide for the duration of the test.
+    monkeypatch.setattr(mapify_cli.shutil, "which", _which_none)
+
+    result = CliRunner().invoke(
+        app, ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file)]
+    )
+
+    assert result.exit_code != 0, f"expected nonzero exit, got 0; output: {result.output!r}"
+    assert "requires-cmd: claude" in result.output, (
+        f"expected 'requires-cmd: claude' in output: {result.output!r}"
+    )
+
+
+def test_dry_run_malformed_eval_set_exits_2(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """SC-2: malformed eval-set (empty entries) under --dry-run exits 2, no dispatch.
+
+    ``ClaudeSubprocessDispatcher.dispatch`` is replaced via the ``monkeypatch``
+    fixture with a stub that records the call and raises; pytest restores the
+    real method at teardown.
+    """
+    from typer.testing import CliRunner
+    from mapify_cli import app
+    import mapify_cli.skills_eval.dispatcher as dispatcher_module
+
+    eval_file = tmp_path / "empty_entries.json"
+    eval_file.write_text(json.dumps({"entries": []}), encoding="utf-8")
+
+    dispatch_called: list[bool] = []
+
+    def _raise_if_called(*_args: object, **_kwargs: object) -> None:
+        # Record the call first so the final assertion still sees it even if
+        # the CLI runner captures the exception raised below.
+        dispatch_called.append(True)
+        raise AssertionError("dispatch must NOT be called on malformed eval-set")
+
+    monkeypatch.setattr(
+        dispatcher_module.ClaudeSubprocessDispatcher, "dispatch", _raise_if_called
+    )
+
+    result = CliRunner().invoke(
+        app,
+        ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file), "--dry-run"],
+    )
+
+    assert result.exit_code == 2, f"expected exit 2, got {result.exit_code}; output: {result.output!r}"
+    assert not dispatch_called, "dispatcher.dispatch was called on malformed eval-set"
+
+
+# ---------------------------------------------------------------------------
+# ST-003 Dispatcher tests — MockDispatcher + monkeypatched subprocess
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_abc_returns_dispatchresult() -> None:
+    """VC1: MockDispatcher().dispatch() returns DispatchResult; VariantDispatcher is ABC.
+
+    The mock must echo back the ``triggered_skill`` and ``raw_output`` it was
+    constructed with, and the abstract base must refuse direct instantiation.
+    """
+    disp = MockDispatcher(triggered_skill="map-x", raw_output="hello")
+    result = disp.dispatch("any prompt")
+    assert isinstance(result, DispatchResult)
+    assert result.triggered_skill == "map-x"
+    assert result.raw_output == "hello"
+
+    # VariantDispatcher is abstract — instantiating raises TypeError.
+    # Uses the module-level pytest import, like the other tests in this file.
+    with pytest.raises(TypeError):
+        VariantDispatcher()  # type: ignore[abstract]
+
+
+def test_vc2_mock_dispatcher_sets_triggered_skill_no_subprocess() -> None:
+    """VC2 / INV-2: the mock echoes triggered_skill and its dispatch() source is subprocess-free."""
+    import ast as _ast
+    import inspect
+    import textwrap
+
+    outcome = MockDispatcher(triggered_skill="map-x").dispatch("test")
+    assert outcome.triggered_skill == "map-x"
+
+    # Parse the method's own source and inspect every node (INV-2): any
+    # ``.run`` attribute access or subprocess import is a violation.
+    method_source = textwrap.dedent(inspect.getsource(MockDispatcher.dispatch))
+    for node in _ast.walk(_ast.parse(method_source)):
+        if isinstance(node, _ast.Attribute) and node.attr == "run":
+            raise AssertionError(
+                "MockDispatcher.dispatch must not reference .run (INV-2 violation)"
+            )
+
+        # Collect the module names introduced by this node, if it is an import.
+        if isinstance(node, _ast.Import):
+            imported = [alias.name for alias in node.names]
+        elif isinstance(node, _ast.ImportFrom):
+            imported = [node.module] if node.module else []
+        else:
+            continue
+
+        for name in imported:
+            if name and "subprocess" in name:
+                raise AssertionError(
+                    f"MockDispatcher.dispatch must not import subprocess (INV-2): {name!r}"
+                )
+
+
+def test_vc4_backoff_bounded_on_transient_failure(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """VC4: a persistently failing subprocess is attempted max_retries+1 times, then reported."""
+    # The dispatcher is handed a source .claude directory containing skills/.
+    claude_root = tmp_path / ".claude"
+    (claude_root / "skills").mkdir(parents=True)
+
+    attempts: list[list[str]] = []
+
+    def _failing_run(
+        argv: list[str],
+        *args: object,
+        **kwargs: object,
+    ) -> object:
+        import subprocess as _sp
+
+        # One entry per invocation; every invocation reports exit code 1.
+        attempts.append(argv)
+        return _sp.CompletedProcess(args=argv, returncode=1, stdout="", stderr="err")
+
+    def _noop_sleep(seconds: object) -> None:
+        return None
+
+    # Replace the process launcher and the sleep call reached through the
+    # dispatcher module.
+    monkeypatch.setattr(_disp_mod.subprocess, "run", _failing_run)
+    monkeypatch.setattr(_disp_mod.time, "sleep", _noop_sleep)
+
+    dispatcher = ClaudeSubprocessDispatcher(
+        source_claude_dir=claude_root,
+        max_retries=2,
+        backoff_base=0.0,
+    )
+    outcome = dispatcher.dispatch("hello")
+
+    # Failure is reported through the result object, not by raising.
+    assert isinstance(outcome, DispatchResult)
+    assert outcome.error is not None
+
+    # One initial attempt plus two retries, and no more.
+    attempt_total = len(attempts)
+    assert attempt_total == 3, (
+        f"expected 3 subprocess calls (1 + max_retries=2), got {attempt_total}"
+    )
+
+
+def test_vc3_subprocess_cwd_is_temp_not_repo_map(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """VC3 / INV-5: the subprocess cwd holds .claude and .map and is not the repo .map."""
+    claude_root = tmp_path / ".claude"
+    (claude_root / "skills").mkdir(parents=True)
+
+    # A successful JSON envelope so dispatch() can parse the fake output.
+    success_envelope = (
+        '{"result": "ok", "session_id": "test-session",'
+        ' "usage": {"input_tokens": 1, "cache_read_input_tokens": 0,'
+        ' "cache_creation_input_tokens": 0}}'
+    )
+
+    # The cwd is inspected from inside the fake run() on purpose: dispatch()
+    # removes its temp dir in a finally block, so by the time dispatch()
+    # returns there is nothing left to look at.
+    snapshots: list[dict[str, object]] = []
+
+    def _capture_run(
+        argv: list[str],
+        *args: object,
+        **kwargs: object,
+    ) -> object:
+        import subprocess as _sp
+
+        requested_cwd = kwargs.get("cwd")
+        if requested_cwd is not None:
+            workdir = Path(str(requested_cwd))
+            snapshots.append(
+                {
+                    "cwd": workdir,
+                    "claude_exists": (workdir / ".claude").exists(),
+                    "map_exists": (workdir / ".map").exists(),
+                }
+            )
+        return _sp.CompletedProcess(
+            args=argv, returncode=0, stdout=success_envelope, stderr=""
+        )
+
+    def _noop_sleep(seconds: object) -> None:
+        return None
+
+    monkeypatch.setattr(_disp_mod.subprocess, "run", _capture_run)
+    monkeypatch.setattr(_disp_mod.time, "sleep", _noop_sleep)
+
+    dispatcher = ClaudeSubprocessDispatcher(
+        source_claude_dir=claude_root,
+        max_retries=0,
+        backoff_base=0.0,
+    )
+    dispatcher.dispatch("test prompt")
+
+    assert len(snapshots) == 1, (
+        f"expected exactly 1 subprocess call, got {len(snapshots)}"
+    )
+    snapshot = snapshots[0]
+    cwd = snapshot["cwd"]
+    assert isinstance(cwd, Path)
+
+    # The working directory must differ from the repository's own .map.
+    repo_map = Path(__file__).parent.parent / ".map"
+    assert cwd != repo_map, f"cwd must not be repo .map, got {cwd!r}"
+
+    # Both seeded directories were present when the subprocess ran (INV-5).
+    assert snapshot["claude_exists"], f".claude not found in temp cwd {cwd!r} at call time"
+    assert snapshot["map_exists"], f".map not found in temp cwd {cwd!r} at call time"
+
+
+# ---------------------------------------------------------------------------
+# ST-004 Assertion tests
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_contains_and_regex_match_and_nonmatch() -> None:
+    """VC1: contains / not_contains / regex verdicts, plus invalid regex failing without raising."""
+    sample = DispatchResult(
+        raw_output="Hello world",
+        triggered_skill=None,
+        token_usage=None,
+        duration_s=0.1,
+    )
+
+    # (assertion spec, expected verdict) against raw_output "Hello world".
+    expectations = [
+        ({"type": "contains", "value": "Hello"}, True),  # substring present
+        ({"type": "contains", "value": "missing"}, False),  # substring absent
+        ({"type": "not_contains", "value": "Hello"}, False),  # present -> FAIL
+        ({"type": "not_contains", "value": "absent"}, True),  # absent -> PASS
+        ({"type": "regex", "pattern": r"H\w+"}, True),  # pattern matches
+        ({"type": "regex", "pattern": r"xyz\d+"}, False),  # pattern does not match
+    ]
+    for spec, expected in expectations:
+        verdict = run_assertion(spec, sample)
+        assert verdict.passed is expected
+
+    # An uncompilable pattern yields a failed verdict whose detail mentions
+    # the problem; it must not raise.
+    broken = run_assertion({"type": "regex", "pattern": r"[invalid("}, sample)
+    assert broken.passed is False
+    detail_text = broken.detail.lower()
+    assert "invalid" in detail_text or "error" in detail_text
+
+
+def test_vc2_valid_json_pass_and_fail() -> None:
+    """VC2: valid_json passes on parseable output and fails on malformed output."""
+    json_check = {"type": "valid_json"}
+
+    well_formed = DispatchResult(
+        raw_output='{"key": "value"}',
+        triggered_skill=None,
+        token_usage=None,
+        duration_s=0.1,
+    )
+    assert run_assertion(json_check, well_formed).passed is True
+
+    malformed = DispatchResult(
+        raw_output="{not json}",
+        triggered_skill=None,
+        token_usage=None,
+        duration_s=0.1,
+    )
+    assert run_assertion(json_check, malformed).passed is False
+
+
+def test_vc3_trigger_and_not_trigger_including_none() -> None:
+    """VC3 / SC-3: trigger and not_trigger verdicts, including a result with no triggered skill."""
+    fired = DispatchResult(
+        raw_output="",
+        triggered_skill="map-debug",
+        token_usage=None,
+        duration_s=0.1,
+    )
+    silent = DispatchResult(
+        raw_output="",
+        triggered_skill=None,
+        token_usage=None,
+        duration_s=0.1,
+    )
+
+    # (assertion type, skill named in the assertion, result under test, expected verdict)
+    scenarios = [
+        ("trigger", "map-debug", fired, True),  # same skill -> PASS
+        ("trigger", "map-other", fired, False),  # other skill -> FAIL
+        ("not_trigger", "map-other", fired, True),  # other skill -> PASS
+        ("not_trigger", "map-debug", fired, False),  # same skill -> FAIL
+        ("not_trigger", "map-debug", silent, True),  # SC-3: None triggered -> PASS
+    ]
+    for kind, skill_name, outcome, expected in scenarios:
+        verdict = run_assertion({"type": kind, "skill": skill_name}, outcome)
+        assert verdict.passed is expected
+
+
+# ---------------------------------------------------------------------------
+# ST-009 own tests
+# ---------------------------------------------------------------------------
+
+
+def test_vc2_no_anthropic_import_in_skills_eval() -> None:
+    """VC2 / INV-3: skills_eval neither imports 'anthropic' nor reads ANTHROPIC_API_KEY."""
+    import ast as _ast
+
+    def _is_str_constant(candidate: object) -> bool:
+        # True only for a literal string node in the parsed tree.
+        return isinstance(candidate, _ast.Constant) and isinstance(candidate.value, str)
+
+    package_dir = Path(__file__).parent.parent / "src" / "mapify_cli" / "skills_eval"
+    py_files = list(package_dir.rglob("*.py"))
+    assert py_files, f"No .py files found under {package_dir}"
+
+    for py_file in py_files:
+        tree = _ast.parse(py_file.read_text(encoding="utf-8"), filename=str(py_file))
+
+        # Pass 1: neither `import ...` nor `from ... import` may name anthropic.
+        for node in _ast.walk(tree):
+            if isinstance(node, _ast.Import):
+                for alias in node.names:
+                    assert "anthropic" not in (alias.name or ""), (
+                        f"Found 'anthropic' import in {py_file}: {alias.name!r}"
+                    )
+            elif isinstance(node, _ast.ImportFrom):
+                module = node.module or ""
+                assert "anthropic" not in module, (
+                    f"Found 'anthropic' import in {py_file}: from {module!r}"
+                )
+
+        # Pass 2: no environment read of the API key. Only AST nodes are
+        # inspected, so mentions in comments or docstrings (INV-3
+        # documentation) are not flagged.
+        for node in _ast.walk(tree):
+            # <something>.environ["ANTHROPIC_API_KEY"]; the key is read
+            # straight from node.slice (Python 3.9+ AST shape).
+            if (
+                isinstance(node, _ast.Subscript)
+                and isinstance(node.value, _ast.Attribute)
+                and node.value.attr == "environ"
+                and _is_str_constant(node.slice)
+            ):
+                assert "ANTHROPIC_API_KEY" not in node.slice.value, (
+                    f"Found ANTHROPIC_API_KEY env read in {py_file}"
+                )
+
+            # <something>.getenv("ANTHROPIC_API_KEY") or <something>.get("ANTHROPIC_API_KEY")
+            if not isinstance(node, _ast.Call):
+                continue
+            func = node.func
+            calls_getter = isinstance(func, _ast.Attribute) and func.attr in ("getenv", "get")
+            if calls_getter and node.args and _is_str_constant(node.args[0]):
+                assert "ANTHROPIC_API_KEY" not in node.args[0].value, (
+                    f"Found ANTHROPIC_API_KEY env read in {py_file}"
+                )
+
+
+def test_vc1_end_to_end_run_via_mock_dispatcher(tmp_path: Path) -> None:
+    """VC1 / AC-9: fixture eval set -> run_eval with MockDispatcher -> aggregate summary."""
+    fixtures_dir = Path(__file__).parent / "skills_eval" / "fixtures"
+    fixture_path = fixtures_dir / "map_debug_eval_set.json"
+    assert fixture_path.exists(), f"Fixture not found: {fixture_path}"
+
+    entries = load_eval_set(fixture_path)
+    assert len(entries) >= 2
+
+    jsonl_path = tmp_path / "e2e_run.jsonl"
+    records = run_eval(
+        skill="map-debug",
+        entries=entries,
+        dispatcher=MockDispatcher(triggered_skill="map-debug", raw_output="debug info"),
+        runs=1,
+        out_path=jsonl_path,
+        resume=False,
+    )
+
+    # The run must leave one non-blank line on disk per record, and one
+    # record per entry.
+    assert jsonl_path.exists()
+    written_lines = []
+    for line in jsonl_path.read_text(encoding="utf-8").splitlines():
+        if line.strip():
+            written_lines.append(line)
+    assert len(written_lines) == len(records)
+    assert len(records) == len(entries)
+
+    # The aggregate counts every cell and reports a rate within [0, 1].
+    summary = aggregate(records)
+    assert summary.total_cells == len(entries)
+    assert 0.0 <= summary.pass_rate <= 1.0
+
+    summary_dict = summary.to_dict()
+    assert "pass_rate" in summary_dict
+    assert "total_cells" in summary_dict
+    # Serializing must not raise TypeError.
+    json.dumps(summary_dict)