diff --git a/.claude/hooks/safety-guardrails.py b/.claude/hooks/safety-guardrails.py
index 04fd888..48e671c 100755
--- a/.claude/hooks/safety-guardrails.py
+++ b/.claude/hooks/safety-guardrails.py
@@ -38,7 +38,12 @@
 
 # Dangerous bash command patterns
 _DEFAULT_DANGEROUS_COMMANDS = [
-    r"rm\s+-rf\s+/",  # rm -rf /
+    # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+    # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/<dir>,
+    # /private/tmp/<dir>, /var/folders/<dir>, /var/tmp/<dir>) — scratch cleanup.
+    # The lookahead demands a real child name after the root's slash and rejects
+    # `..`, so `rm -rf /tmp`, `rm -rf /tmp/` and `rm -rf /tmp/../etc` stay blocked.
+    r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/(?!\S*\.\.)[^\s/])",  # rm -rf / (non-temp)
     r"rm\s+-rf\s+\*",  # rm -rf *
     r"rm\s+-rf\s+\.\.",  # rm -rf ..
     r"git\s+push.*--force.*main",
diff --git a/.claude/rules/learned/architecture-patterns.md b/.claude/rules/learned/architecture-patterns.md
index d55654f..d1d8bc4 100644
--- a/.claude/rules/learned/architecture-patterns.md
+++ b/.claude/rules/learned/architecture-patterns.md
@@ -161,3 +161,19 @@
   # CORRECT: templates_src is fence-free; copier injects exactly once:
   wrapped = f"# map:start\n{rendered}\n# map:end\n" if fenced else rendered
   ```
+
+- **Spike-First Gating: High-Risk Binding Decisions Require a Docs-Only Artifact Before Implementation** (2026-06-04): When a subtask's answer would bind downstream implementation (which channel carries a value, which API call is idempotent, what schema a subprocess emits), run it FIRST as a docs-only spike that writes an artifact naming the empirical answer + the binding strategy, and commits ZERO production code. Downstream subtasks reference the artifact by name and consume it, not assumptions. A wrong assumption that is not spiked propagates into every component built on it and forces a rewrite cascade. In this workflow a research-agent wrongly claimed skill-activation wasn't recoverable from `claude -p`; the ST-001 spike empirically corrected it before any dispatcher code existed. The spike artifact MUST contain a named "binding strategy" section, not just findings (Monitor hard-stopped once for a missing strategy section). [workflow: map-efficient]
+
+- **Producer-Owns-Parse: The Component That Owns the Subprocess Owns All Derived Fields; Consumers Read the Typed Result** (2026-06-04): When component A launches a subprocess (or owns a raw source) and component B consumes the result, ALL parsing/derivation (transcript reads, field extraction, signal combination) lives in A; B reads only the typed result struct and never re-implements parsing. Two payoffs: (1) a single parse site that a Mock producer can supply directly, so consumer tests need no subprocess/transcript fixture; (2) when the raw output schema changes, only A changes. Putting any parse in B re-couples the modules through the raw format. Extends "Contract-First Inter-Component JSON Schemas": the contract is A's typed struct, and the parse-to-struct boundary is A's responsibility exclusively. [workflow: map-efficient]
+  ```python
+  # WRONG — runner re-parses a transcript it does not own (couples to raw format)
+  result = dispatcher.dispatch(cell)            # raw proc output
+  skill = extract_skill_from_transcript(read_jsonl(result.session_id))
+
+  # CORRECT — dispatcher parses once into a typed field; runner just reads it
+  @dataclass
+  class DispatchResult:
+      triggered_skill: str | None   # parsed by dispatcher, NOT by runner
+      token_usage: TokenUsage | None
+  # tests inject MockDispatcher(triggered_skill="map-plan") — no subprocess needed
+  ```
diff --git a/.claude/rules/learned/implementation-patterns.md b/.claude/rules/learned/implementation-patterns.md
index 657b145..626da22 100644
--- a/.claude/rules/learned/implementation-patterns.md
+++ b/.claude/rules/learned/implementation-patterns.md
@@ -128,3 +128,42 @@ paths:
       dest.chmod(dest.stat().st_mode | 0o755)
   # test guard: assert os.access(installed_hook, os.X_OK)
   ```
+
+- **`claude -p` Output Has Two Channels: Envelope for Tokens, Transcript JSONL for Skill Name** (2026-06-04): When shelling `claude -p --output-format json` as a subprocess, two distinct output channels carry different information — do not confuse them. The JSON result envelope (stdout) carries `.result` (response text), `.usage` (input/output/cache tokens), and `.session_id`. The name of the skill/slash-surface that actually fired is NOT in the envelope — it is only in Claude Code's native transcript JSONL (located by session_id) as a `tool_use` block with `name=="Skill"` and `input.skill`. Deriving this from the framework's own scratch/digest schema rather than the native transcript yields a wrong claim. Verify empirically by reading the real transcript after a spike call; never infer from internal schema files. [workflow: map-efficient]
+  ```python
+  env = json.loads(proc.stdout)        # .result, .usage, .session_id
+  tokens = env["usage"]                # CORRECT — tokens are in the envelope
+  # env.get("skill")  -> None          # WRONG — fired-skill is NOT in the envelope
+  for line in transcript_jsonl(env["session_id"]).read_text().splitlines():
+      m = json.loads(line)
+      if m.get("type") == "tool_use" and m.get("name") == "Skill":
+          triggered = m["input"]["skill"]; break
+  ```
+
+- **Scoped Config-Flag Mutation: Seed a Throwaway Temp Copy; Never Modify the Production Source of Truth** (2026-06-04): When a tool/test needs a shipped config flag to behave differently from its production default (e.g. stripping `disable-model-invocation: true` so an eval can auto-select skills), mutate the flag ONLY in a throwaway temp dir seeded with a copy of the production config, discarded after the subprocess exits. Never patch the source repo or `templates_src`. A blanket production flip is a footgun: it silently changes behavior for every other user of the flag and may be committed accidentally. Scope of mutation must match scope of need: one subprocess call → one throwaway dir, always cleaned up in `finally`. [workflow: map-efficient]
+  ```python
+  tmp = Path(tempfile.mkdtemp())
+  shutil.copytree(REPO / ".claude", tmp / ".claude")     # seed from production
+  strip_flag(tmp / ".claude" / "skills")                 # mutate throwaway ONLY
+  try:
+      subprocess.run(["claude", "-p", prompt, "--output-format", "json"], cwd=tmp)
+  finally:
+      shutil.rmtree(tmp)                                  # production never touched
+  ```
+
+- **Clock-Free Core with Caller-Supplied Path: Inject Timestamps at the CLI Boundary, Not Inside the Worker** (2026-06-04): When a worker writes durable output (a timestamped JSONL, a run artifact), do NOT call `datetime.now()` inside the worker. Have the CLI/outermost caller generate the timestamped path and pass it as an explicit `out_path: Path` the worker treats as opaque. Benefits: (1) tests pass `tmp_path / "results.jsonl"` with zero clock monkeypatching; (2) the worker is deterministic given the same inputs+path; (3) resume keys on the path the CLI owns. Refines "Long-Running Operations Need Durable State by Default" by fixing WHERE path/timestamp generation lives — at the boundary, not the core. [workflow: map-efficient]
+  ```python
+  # CORRECT: worker takes out_path; CLI owns the timestamp
+  def run_eval(*, entries, dispatcher, runs, out_path: Path, resume=False) -> list: ...
+  # CLI: out = default_run_path(root, skill, datetime.now(tz).strftime("%Y%m%dT%H%M%SZ"))
+  # Test: run_eval(..., out_path=tmp_path / "r.jsonl")   # no time mocking
+  ```
+
+- **Concurrent Durable Append: threading.Lock for Line Integrity + Stable cell_id Resume Key** (2026-06-04): When parallel workers append JSONL lines to a shared durable file, two invariants must BOTH hold: (1) no interleaved partial lines — guard each `f.write(line + "\n")` with a threading.Lock; (2) resume is idempotent regardless of write order — key on a stable id present in every record (cell_id), never on line number/position. Nondeterministic write order is fine as long as resume dedups by id. Each worker subprocess also runs in its own temp cwd so concurrent subprocesses never share a working dir. Complements "Long-Running Operations Need Durable State" (process-restart durability) with within-process concurrency safety. [workflow: map-efficient]
+  ```python
+  with self._lock:                       # atomic per-line append
+      with out_path.open("a", encoding="utf-8") as f:
+          f.write(json.dumps(record) + "\n")
+  done = {json.loads(l)["cell_id"] for l in out_path.read_text().splitlines() if l.strip()}
+  pending = [c for c in cells if make_cell_id(...) not in done]   # order-independent resume
+  ```
diff --git a/.claude/rules/learned/testing-strategies.md b/.claude/rules/learned/testing-strategies.md
index 3d327b4..83a91ba 100644
--- a/.claude/rules/learned/testing-strategies.md
+++ b/.claude/rules/learned/testing-strategies.md
@@ -139,3 +139,12 @@ paths:
   # 3. git restore <file>        -> confirm GREEN
   # 4. commit file + test together
   ```
+
+- **Blueprint-Named Test Functions Are a Monitor Contract: Author Them in the Same Subtask as the Code** (2026-06-04): When a subtask blueprint's `test_strategy` names specific pytest function names (e.g. `test_vc3_resume_skips_present_cell_ids`), Monitor treats those names as a HARD completeness contract: a subtask whose logic is correct but whose blueprint-named functions do not yet exist gets `valid=false` (hard stop). The completeness unit is code + named-test-functions-together, not code alone — the blueprint author chose the names to specify observable behavior, so an absent name means the behavior is unverified. Never stub a named test with `pass`/`# TODO` and call the subtask done; the stub satisfies the import but not the contract. In this workflow ST-005's runner code was correct but Monitor hard-stopped until the four named VC tests were authored with real assertions. [workflow: map-efficient]
+
+- **Final Verification Must Check Shipped Docs Against Actual Behavior, Then Grep for the Same Drift Class** (2026-06-04): After code+tests are green, a dedicated final-verification pass must validate that user-facing docs (SKILL.md, README, CLI `--help`) match actual behavior: default values, accepted schema formats, flag names, output field names. Prose drift is invisible to pytest/ruff/mypy. When the first drift instance is found, immediately grep the WHOLE doc for the same class of claim (every `--flag default`, every schema example, every accepted file-format mention) before moving on — drift clusters because the doc was written once from a design doc, not from running code. Here the final-verifier caught a `--max-concurrency` default of 4 (actual 1); grepping the same file then surfaced a fictional YAML eval-set schema block + `.yaml` examples that the JSON-only loader could never parse. [workflow: map-efficient]
+  ```bash
+  # one drift found -> grep the whole doc for the drift class before marking done
+  mapify skill-eval --help | grep -i max-concurrency        # actual default
+  grep -nE 'default|yaml|schema|--[a-z-]+' docs/SKILL.md     # reconcile every claim
+  ```
diff --git a/.claude/skills/map-efficient/SKILL.md b/.claude/skills/map-efficient/SKILL.md
index b986b52..2045905 100644
--- a/.claude/skills/map-efficient/SKILL.md
+++ b/.claude/skills/map-efficient/SKILL.md
@@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
 
 Snapshots pre-existing failures so later subtasks distinguish
 "introduced regression" from "was broken pre-plan". Auto-detects
-Make/pytest/go test/cargo. Overrides + narrow-target guidance:
-[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
+Make/pytest/go test/cargo. It captures the test run internally and prints a
+single compact JSON report at the end — read that JSON directly; do NOT pipe it
+through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target
+guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
 
 ### Wave Computation (after INIT_STATE) - REQUIRED
 
diff --git a/.claude/skills/map-efficient/efficient-reference.md b/.claude/skills/map-efficient/efficient-reference.md
index 802d11c..6734cfc 100644
--- a/.claude/skills/map-efficient/efficient-reference.md
+++ b/.claude/skills/map-efficient/efficient-reference.md
@@ -203,6 +203,11 @@ fix or defer.
 python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
 ```
 
+It captures the test run internally and prints a single compact JSON report at
+the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the
+repo bash guidelines); the output is one small object, not a stream, so
+truncating it only hides fields.
+
 Auto-detects from project markers:
 - `Makefile` with `test:` target → `make test`
 - `pyproject.toml` / `pytest.ini` → `pytest`
diff --git a/.claude/skills/map-skill-eval/SKILL.md b/.claude/skills/map-skill-eval/SKILL.md
new file mode 100644
index 0000000..567ac04
--- /dev/null
+++ b/.claude/skills/map-skill-eval/SKILL.md
@@ -0,0 +1,94 @@
+---
+name: map-skill-eval
+description: |
+  Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient.
+effort: medium
+disable-model-invocation: true
+argument-hint: "[skill] [--eval-set PATH]"
+---
+# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation
+
+Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill.
+
+Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`.
+
+## Invocation
+
+```bash
+mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N]
+```
+
+- `<skill>` — the skill name to evaluate (e.g. `map-plan`).
+- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions.
+- `--dry-run` — validate the eval-set and print the planned run count without spending any quota.
+- `--resume` — continue an interrupted run from the last durable checkpoint.
+- `--max-concurrency N` — max parallel `claude -p` workers (default: 1).
+
+## What It Does
+
+1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases.
+2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger).
+3. **Deterministic assertions** — each eval case may specify one or more assertion types:
+   - `contains` / `not_contains` — substring presence in the response.
+   - `regex` — pattern match against the response.
+   - `valid_json` — response parses as JSON.
+   - `trigger` / `not_trigger` — skill fired / did not fire.
+4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`.
+5. **Summary report** — after all cases complete, prints the pass-rate (passed/total cells), the mean and stddev of token usage and duration (when available), and the path to the run-log artifact.
+
+## Eval-Set Format
+
+A JSON object with an `entries` array. Each entry has a `prompt`, optional
+`should_trigger` / `should_not_trigger` skill names (the runner turns these into
+`trigger` / `not_trigger` assertions), and an optional `assertions` array.
+Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`,
+`not_trigger`.
+
+```json
+{
+  "entries": [
+    {
+      "prompt": "Decompose this feature into subtasks",
+      "should_trigger": "map-plan",
+      "assertions": [
+        { "type": "contains", "value": "subtask" }
+      ]
+    },
+    {
+      "prompt": "Run quality gates",
+      "should_not_trigger": "map-plan",
+      "assertions": []
+    }
+  ]
+}
+```
+
+## --dry-run
+
+`--dry-run` validates the eval-set and prints the planned invocation count. It spends no quota: no `claude -p` calls are made and no `.jsonl` is written.
+
+## Examples
+
+```bash
+# Validate eval-set without spending quota
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run
+
+# Run full eval with up to 8 parallel workers
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8
+
+# Resume an interrupted run
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume
+```
+
+## Troubleshooting
+
+- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill.
+- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with an `entries` array (see Eval-Set Format), that each entry has a `prompt`, and that every item in an entry's optional `assertions` array has a valid `type`.
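+- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl` under the current working directory, so run it from the same project root as the original run. If no prior run exists, `--resume` falls back to starting a fresh run log.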
+- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd.
+
+## Related Commands
+
+- `/map-plan` — plan and decompose tasks.
+- `/map-efficient` — full MAP workflow execution.
+- `/map-check` — run quality gates and verify MAP workflow completion.
diff --git a/.claude/skills/skill-rules.json b/.claude/skills/skill-rules.json
index bbe32ab..d5a9606 100644
--- a/.claude/skills/skill-rules.json
+++ b/.claude/skills/skill-rules.json
@@ -239,6 +239,18 @@
         ]
       }
     },
+    "map-skill-eval": {
+      "type": "manual",
+      "skillClass": "task",
+      "enforcement": "manual",
+      "priority": "medium",
+      "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).",
+      "requires-cmd": ["claude"],
+      "promptTriggers": {
+        "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"],
+        "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"]
+      }
+    },
     "map-task": {
       "type": "manual",
       "skillClass": "task",
diff --git a/CLAUDE.md b/CLAUDE.md
index 46dd045..7b5db00 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -65,6 +65,7 @@ Validation:
 - "Not in the CI gate" is NOT a valid reason to skip. The error is real if any tool reported it.
 - "Static-analysis noise" is NOT a category. Either the type system is correct and the code is wrong, or the annotation needs fixing — pick one and fix it.
 - Only legitimate skip: the user explicitly approves deferral in the current conversation. Document the deferral in writing.
+- **Any error encountered while operating the MAP Framework itself must be fixed immediately, in the same change.** This covers the framework's own runtime — a hook that crashes or false-positives, a `.map/scripts/` runner or gate that errors or mis-reports, a `mapify` CLI traceback, a render/validator/blueprint failure, a broken `Task`/agent dispatch. When you hit one mid-task: STOP, find the root cause, and fix it before continuing the original work. Do NOT work around it, do NOT defer it as "unrelated", do NOT note-and-move-on past a broken tool. If the fix is genuinely out of scope or risky, stop and ask the user — never silently continue past a malfunctioning framework component. (Errors raised by an external plugin/hook NOT shipped by this repo are out of scope here; say so and route them to the user.)
 
 ## Bash Command Guidelines
 
diff --git a/src/mapify_cli/__init__.py b/src/mapify_cli/__init__.py
index de885c2..cf89218 100644
--- a/src/mapify_cli/__init__.py
+++ b/src/mapify_cli/__init__.py
@@ -140,6 +140,12 @@ def create_ssl_context():
 
 app.add_typer(validate_app, name="validate")
 
+# `mapify skill-eval ...` command group; commands attach via @skill_eval_app.command.
+skill_eval_app = typer.Typer(
+    name="skill-eval", help="Evaluate a skill's trigger accuracy + cost"
+)
+app.add_typer(skill_eval_app, name="skill-eval")
+
 
 def version_callback(value: bool):
     """Callback to show version and exit."""
@@ -1361,6 +1367,127 @@ def upgrade():
     )
 
 
+# Skill-eval commands
+
+
+@skill_eval_app.command("run")
+def skill_eval_run(
+    skill: str = typer.Argument(..., help="Skill under test, e.g. map-debug"),
+    eval_set: Optional[Path] = typer.Option(
+        None, "--eval-set", help="Path to eval-set JSON"
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Validate eval-set + print planned count; spend nothing"
+    ),
+    resume: bool = typer.Option(
+        False, "--resume", help="Resume a partial run, skipping completed cells"
+    ),
+    max_concurrency: int = typer.Option(
+        1, "--max-concurrency", min=1, help="Bounded parallel dispatch (default 1)"
+    ),
+) -> None:
+    """Run a skill evaluation matrix and print an aggregate summary.
+
+    Validation and --dry-run never invoke `claude`; --resume reuses the latest run log.
+
+    Exit codes:
+      0 - Success (or dry-run completed)
+      1 - Runtime error (claude not found, or unexpected failure)
+      2 - Validation error (missing --eval-set or malformed eval-set file)
+    """
+    # Intent: lazy import to keep top-level import time low and avoid import cycles.
+    import json
+    import mapify_cli.skills_eval.runner as _runner
+    import mapify_cli.skills_eval.aggregator as _aggregator
+    from mapify_cli.skills_eval.dispatcher import ClaudeSubprocessDispatcher
+    from mapify_cli.skills_eval.eval_schema import EvalResultRecord
+    from datetime import timezone
+
+    # SC-2: --eval-set is required.
+    if eval_set is None:
+        console.print("[bold red]Error:[/bold red] provide --eval-set PATH")
+        raise typer.Exit(2)
+
+    # SC-2: load and validate the eval-set; malformed/empty → Exit(2), NO invocations.
+    try:
+        entries = _runner.load_eval_set(eval_set)
+    except ValueError as exc:
+        console.print(f"[bold red]Error:[/bold red] {exc}")
+        raise typer.Exit(2) from exc
+
+    # Dry-run path: zero quota, NO dispatcher construction, NO claude required.
+    if dry_run:
+        # D10: variant_id fixed = 1, runs = 1.
+        planned = len(entries) * 1 * 1
+        console.print(
+            f"[bold]Dry-run:[/bold] planned [cyan]{planned}[/cyan] invocation(s) "
+            f"for skill [bold]{skill}[/bold] — spends 0 quota"
+        )
+        raise typer.Exit(0)
+
+    # HC-6: require claude BEFORE any invocation.
+    if shutil.which("claude") is None:
+        console.print(
+            "[bold red]Error:[/bold red] requires-cmd: claude — "
+            "install the claude CLI and ensure it is on PATH"
+        )
+        raise typer.Exit(1)
+
+    # Resolve output path: reuse the latest run log on --resume, else start a new one.
+    root = Path.cwd()
+    out_path = _runner.latest_run_path(root, skill) if resume else None
+    if out_path is None:
+        stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+        out_path = _runner.default_run_path(root, skill, stamp)
+
+    # Run the evaluation matrix.
+    disp = ClaudeSubprocessDispatcher()
+    _aggregator.bounded_run(
+        skill=skill,
+        entries=entries,
+        dispatcher=disp,
+        runs=1,
+        out_path=out_path,
+        resume=resume,
+        max_concurrency=max_concurrency,
+    )
+
+    # Read all records from the output file, aggregate, and print summary.
+    records: List[EvalResultRecord] = []
+    skipped = 0
+    if out_path.exists():
+        for raw_line in out_path.read_text(encoding="utf-8").splitlines():
+            raw_line = raw_line.strip()
+            if not raw_line:
+                continue
+            try:
+                records.append(EvalResultRecord.from_dict(json.loads(raw_line)))
+            except (ValueError, KeyError):
+                # Best-effort: tolerate an unreadable line, but count it.
+                skipped += 1
+    if skipped:
+        console.print(f"[yellow]Warning:[/yellow] skipped {skipped} unreadable line(s)")
+
+    summary = _aggregator.aggregate(records)
+    console.print(
+        f"\n[bold]Eval complete:[/bold] skill=[bold]{skill}[/bold] "
+        f"pass_rate=[cyan]{summary.pass_rate:.1%}[/cyan] "
+        f"({summary.passed_cells}/{summary.total_cells} cells passed)"
+    )
+    if summary.tokens_mean is not None:
+        console.print(
+            f"  tokens mean={summary.tokens_mean:.1f} "
+            f"stddev={summary.tokens_stddev or 0.0:.1f} "
+            f"(n={summary.token_sample_size})"
+        )
+    if summary.duration_mean is not None:
+        console.print(
+            f"  duration mean={summary.duration_mean:.2f}s "
+            f"stddev={summary.duration_stddev or 0.0:.2f}s"
+        )
+    console.print(f"  artifact: [cyan]{out_path}[/cyan]")
+
+
 # Validate commands
 
 
diff --git a/src/mapify_cli/skills_eval/__init__.py b/src/mapify_cli/skills_eval/__init__.py
new file mode 100644
index 0000000..df7042f
--- /dev/null
+++ b/src/mapify_cli/skills_eval/__init__.py
@@ -0,0 +1,57 @@
+"""skills_eval — skill trigger evaluation data contracts and dispatchers.
+
+Exports the shared types used by every eval component (dispatcher, assertions,
+runner, aggregator) and the concrete dispatcher implementations.
+"""
+
+from __future__ import annotations
+
+from mapify_cli.skills_eval.assertions import (
+    AssertionResult,
+    run_assertion,
+    run_assertions,
+)
+from mapify_cli.skills_eval.dispatcher import (
+    ClaudeSubprocessDispatcher,
+    MockDispatcher,
+    VariantDispatcher,
+)
+from mapify_cli.skills_eval.eval_schema import (
+    DispatchResult,
+    EvalResultRecord,
+    EvalSetEntry,
+    make_cell_id,
+)
+from mapify_cli.skills_eval.runner import (
+    default_run_path,
+    evaluate_cell,
+    latest_run_path,
+    load_eval_set,
+    run_eval,
+)
+from mapify_cli.skills_eval.aggregator import (
+    AggregateSummary,
+    aggregate,
+    bounded_run,
+)
+
+__all__ = [  # public package API; kept ASCII-sorted (uppercase names first)
+    "AggregateSummary",
+    "AssertionResult",
+    "ClaudeSubprocessDispatcher",
+    "DispatchResult",
+    "EvalResultRecord",
+    "EvalSetEntry",
+    "MockDispatcher",
+    "VariantDispatcher",
+    "aggregate",
+    "bounded_run",
+    "default_run_path",
+    "evaluate_cell",
+    "latest_run_path",
+    "load_eval_set",
+    "make_cell_id",
+    "run_assertion",
+    "run_assertions",
+    "run_eval",
+]
diff --git a/src/mapify_cli/skills_eval/aggregator.py b/src/mapify_cli/skills_eval/aggregator.py
new file mode 100644
index 0000000..1e55e27
--- /dev/null
+++ b/src/mapify_cli/skills_eval/aggregator.py
@@ -0,0 +1,300 @@
+"""Aggregation and bounded-concurrency runner for skills_eval.
+
+Public API:
+- ``AggregateSummary``  -- frozen dataclass summarising a completed eval run.
+- ``aggregate(records)`` -- compute summary stats from a list of EvalResultRecord.
+- ``bounded_run(...)``  -- parallel cell dispatch with serialised durable writes.
+
+Design invariants respected:
+- INV-3: no ``import anthropic``, no ANTHROPIC_API_KEY access.
+- INV-5: ClaudeSubprocessDispatcher isolation is automatic (each dispatch creates
+         its own mkdtemp cwd); no extra isolation code is needed here.
+- VC1:   pass_rate = passed_cells / total_cells (0.0 when total==0, never divide-by-zero).
+- VC2:   token mean/stddev use statistics.mean/stdev; n<2 → stddev 0.0; n==0 → None.
+- VC3:   bounded_run serialises writes under a threading.Lock (no .jsonl corruption).
+- VC4:   aggregate never raises on empty list or all-null token_usage records.
+- SC-1:  max_concurrency controls ThreadPoolExecutor workers; default 1 (sequential).
+"""
+
+from __future__ import annotations
+
+import concurrent.futures
+import dataclasses
+import logging
+import statistics
+import threading
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, TypeAlias
+
+from mapify_cli.skills_eval.eval_schema import EvalResultRecord
+from mapify_cli.skills_eval.eval_schema import EvalSetEntry
+from mapify_cli.skills_eval.dispatcher import VariantDispatcher
+from mapify_cli.skills_eval.runner import (
+    _append_record,
+    _read_present_cell_ids,
+    evaluate_cell,
+    make_cell_id,
+)
+
+logger = logging.getLogger(__name__)
+
+# Intent: fixed variant_id per D10 -- matches the constant in runner.py.
+_VARIANT_ID: int = 1
+
+# Public API of this module; make_cell_id is imported for internal use and is not listed here.
+__all__ = ["AggregateSummary", "aggregate", "bounded_run"]
+
+# Intent: module-level TypeAlias so pyright can resolve it in function annotations.
+_WorkItem: TypeAlias = tuple[int, int, EvalSetEntry]
+
+
+# ---------------------------------------------------------------------------
+# AggregateSummary
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class AggregateSummary:
+    """Immutable roll-up of the statistics computed for one eval run.
+
+    ``to_dict()`` yields a JSON-serialisable mapping.  Statistics that may be
+    undefined (token stats, duration stats on an empty run) are ``float | None``.
+
+    Fields
+    ------
+    total_cells:
+        Number of ``EvalResultRecord`` objects that were aggregated.
+    passed_cells:
+        How many of those records have an EMPTY ``assertions_failed`` list.
+    pass_rate:
+        ``passed_cells / total_cells``, or 0.0 when ``total_cells == 0``.
+    token_sample_size:
+        Number of records carrying a non-None ``token_usage``.
+    tokens_mean:
+        Mean of ``token_usage.total`` across the token sample;
+        ``None`` when ``token_sample_size == 0``.
+    tokens_stddev:
+        Sample standard deviation of ``token_usage.total``; 0.0 when
+        ``token_sample_size < 2``; ``None`` when ``token_sample_size == 0``.
+    duration_mean:
+        Mean of ``record.duration_s`` across all records;
+        ``None`` when ``total_cells == 0``.
+    duration_stddev:
+        Sample standard deviation of ``duration_s``; 0.0 when
+        ``total_cells < 2``; ``None`` when ``total_cells == 0``.
+    """
+
+    total_cells: int
+    passed_cells: int
+    pass_rate: float
+    token_sample_size: int
+    tokens_mean: float | None
+    tokens_stddev: float | None
+    duration_mean: float | None
+    duration_stddev: float | None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the fields as a plain dict, in declaration order."""
+        return {f.name: getattr(self, f.name) for f in dataclasses.fields(self)}
+
+
+# ---------------------------------------------------------------------------
+# _safe_stddev  (n<2 guard, shared by token and duration paths)
+# ---------------------------------------------------------------------------
+
+
+def _safe_stddev(xs: list[float]) -> float:
+    """Sample standard deviation of *xs*, or 0.0 when fewer than two values exist.
+
+    ``statistics.stdev`` raises ``StatisticsError`` for n<2, so that case is
+    short-circuited to 0.0: zero or one sample carries no measurable spread.
+    Callers decide separately how to report an empty collection (``aggregate``
+    reports ``None`` there instead of calling this helper).
+    """
+    has_spread = len(xs) >= 2
+    return statistics.stdev(xs) if has_spread else 0.0
+
+
+# ---------------------------------------------------------------------------
+# aggregate
+# ---------------------------------------------------------------------------
+
+
+def aggregate(records: list[EvalResultRecord]) -> AggregateSummary:
+    """Summarise *records* into an ``AggregateSummary``.
+
+    Safe on an empty list and on records whose ``token_usage`` is all None.
+
+    Parameters
+    ----------
+    records:
+        ``EvalResultRecord`` objects from a finished or partial run;
+        may be empty.
+
+    Returns
+    -------
+    AggregateSummary
+        The computed summary.  For empty ``records`` it is:
+        ``total_cells=0, passed_cells=0, pass_rate=0.0,
+        token_sample_size=0, tokens_mean=None, tokens_stddev=None,
+        duration_mean=None, duration_stddev=None``.
+    """
+    n_cells = len(records)
+
+    # VC1: a cell counts as passed when its assertions_failed list is empty.
+    n_passed = len([rec for rec in records if len(rec.assertions_failed) == 0])
+    # Guard the division so an empty run yields 0.0 instead of raising.
+    rate = 0.0 if n_cells == 0 else n_passed / n_cells
+
+    # VC2/VC4: token statistics use only records that carry token_usage.
+    token_totals = [
+        float(rec.token_usage.total) for rec in records if rec.token_usage is not None
+    ]
+    n_tokens = len(token_totals)
+    # VC4: with no token data both stats stay None; the other stats stay valid.
+    tok_mean: float | None = None
+    tok_sd: float | None = None
+    if n_tokens > 0:
+        tok_mean = statistics.mean(token_totals)
+        tok_sd = _safe_stddev(token_totals)
+
+    # Duration statistics: duration_s is read from every record.
+    # An empty run has no durations, so both stats stay None.
+    dur_mean: float | None = None
+    dur_sd: float | None = None
+    if n_cells > 0:
+        durations = [rec.duration_s for rec in records]
+        dur_mean = statistics.mean(durations)
+        dur_sd = _safe_stddev(durations)
+
+    # Assemble the immutable summary.
+    return AggregateSummary(
+        total_cells=n_cells,
+        passed_cells=n_passed,
+        pass_rate=rate,
+        token_sample_size=n_tokens,
+        tokens_mean=tok_mean,
+        tokens_stddev=tok_sd,
+        duration_mean=dur_mean,
+        duration_stddev=dur_sd,
+    )
+
+
+# ---------------------------------------------------------------------------
+# bounded_run
+# ---------------------------------------------------------------------------
+
+
+def bounded_run(
+    *,
+    skill: str,
+    entries: list[EvalSetEntry],
+    dispatcher: VariantDispatcher,
+    runs: int,
+    out_path: Path,
+    resume: bool = False,
+    max_concurrency: int = 1,
+) -> list[EvalResultRecord]:
+    """Execute the prompts x runs matrix using a bounded thread pool.
+
+    Counterpart of ``run_eval`` that evaluates cells in a ``ThreadPoolExecutor``
+    with at most *max_concurrency* workers.  Every .jsonl append happens while
+    holding a ``threading.Lock`` so writes are never interleaved (VC3).
+
+    Parameters
+    ----------
+    skill:
+        Skill name; forwarded to ``evaluate_cell`` and included in log output.
+    entries:
+        Eval-set rows (``EvalSetEntry`` objects).
+    dispatcher:
+        Dispatcher passed to every ``evaluate_cell`` call.  The module notes
+        state that ``ClaudeSubprocessDispatcher`` gives each dispatch its own
+        ``mkdtemp`` cwd (INV-5), so no extra isolation is added here.
+        The same instance is shared by all worker threads.
+    runs:
+        Number of runs per prompt.
+    out_path:
+        Path of the ``.jsonl`` output file; its parent directory is created.
+    resume:
+        When True and *out_path* exists, cells whose cell_id is already
+        recorded there are skipped.
+    max_concurrency:
+        Upper bound on worker threads.  Values below 1 are treated as 1;
+        the default of 1 runs cells one at a time through the same code path.
+
+    Returns
+    -------
+    list[EvalResultRecord]
+        Records produced during THIS call, in the order they were written
+        (resumed/skipped cells are excluded).  With concurrency>1 that order,
+        and the order of lines in the .jsonl, may vary between runs.
+    """
+    # Resume mode: collect the cell ids that are already on disk.
+    already_done: set[str] = set()
+    if resume and out_path.exists():
+        already_done = _read_present_cell_ids(out_path)
+        logger.info(
+            "bounded_run: resume mode -- %d cells already present in %s",
+            len(already_done),
+            out_path,
+        )
+
+    # Create the output directory up front, before any worker appends.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Work list: one (prompt_index, run_number, entry) tuple per missing cell.
+    pending: list[_WorkItem] = []
+    for prompt_index, entry in enumerate(entries):
+        for run_number in range(runs):
+            cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
+            if cell_id in already_done:
+                logger.debug(
+                    "bounded_run: skipping cell %s (already present in %s)",
+                    cell_id,
+                    out_path,
+                )
+                continue
+            pending.append((prompt_index, run_number, entry))
+
+    # A single lock serialises every .jsonl append so concurrent workers can
+    # never interleave partial lines (VC3).
+    append_lock = threading.Lock()
+    written: list[EvalResultRecord] = []
+
+    def _dispatch_and_record(item: _WorkItem) -> EvalResultRecord:
+        """Worker body: evaluate a single cell, then append it under the lock."""
+        p_index, r_number, row = item
+        result = evaluate_cell(
+            skill=skill,
+            entry=row,
+            prompt_index=p_index,
+            run_number=r_number,
+            dispatcher=dispatcher,
+        )
+        with append_lock:
+            # INV-4: per-cell append via _append_record; the lock also guards the list.
+            _append_record(out_path, result)
+            written.append(result)
+        return result
+
+    pool_size = max(1, max_concurrency)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
+        submitted = [pool.submit(_dispatch_and_record, item) for item in pending]
+        # Drain futures as they finish.  result() re-raises anything a worker
+        # raised, so unexpected failures (for example in the append path)
+        # surface to the caller instead of being silently dropped.
+        for done in concurrent.futures.as_completed(submitted):
+            done.result()
+
+    logger.info(
+        "bounded_run: finished skill=%s entries=%d runs=%d cells_written=%d out=%s",
+        skill,
+        len(entries),
+        runs,
+        len(written),
+        out_path,
+    )
+
+    return written
diff --git a/src/mapify_cli/skills_eval/assertions.py b/src/mapify_cli/skills_eval/assertions.py
new file mode 100644
index 0000000..2f7141d
--- /dev/null
+++ b/src/mapify_cli/skills_eval/assertions.py
@@ -0,0 +1,284 @@
+"""Pure, deterministic assertion runner for skill eval cells.
+
+No LLM, no subprocess, no file I/O, no network.  Same (spec, result)
+always produces the same verdict (INV-3: no ``import anthropic``,
+no ANTHROPIC_API_KEY).
+
+Assertion types
+---------------
+- contains      – value in raw_output
+- not_contains  – value not in raw_output
+- regex         – re.search(pattern, raw_output) is not None
+- valid_json    – raw_output.strip() parses via json.loads
+- trigger       – triggered_skill == skill
+- not_trigger   – triggered_skill != skill  (None-safe: SC-3)
+
+Robustness
+----------
+- Unknown type  → FAIL, detail "unknown assertion type: <t>"
+- Missing key   → FAIL, clear detail, no KeyError
+- Invalid regex → FAIL, detail includes re.error message
+- run_assertion never raises
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+
+from mapify_cli.skills_eval.eval_schema import DispatchResult
+
+
+# ---------------------------------------------------------------------------
+# AssertionResult
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class AssertionResult:
+    """Frozen verdict for one assertion: pass/fail flag, assertion type, explanation."""
+
+    passed: bool
+    type: str
+    detail: str
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers — one per assertion type
+# ---------------------------------------------------------------------------
+
+
+def _assert_contains(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
+    """Check that spec["value"] occurs in result.raw_output (FAIL if it is not a str)."""
+    needle = spec.get("value")
+    # A missing key yields None here, which fails the isinstance check.
+    if isinstance(needle, str):
+        found = needle in result.raw_output
+        outcome = "PASS" if found else "FAIL"
+        where = "found in" if found else "not found in"
+        message = f"contains {needle!r} -> {outcome} ({where} raw_output)"
+    else:
+        found = False
+        message = (
+            "contains: missing or non-string 'value' key "
+            f"(got {type(needle).__name__!r})"
+        )
+    return AssertionResult(passed=found, type="contains", detail=message)
+
+
+def _assert_not_contains(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
+    """Check that spec["value"] does NOT occur in result.raw_output.
+
+    A missing or non-string ``value`` fails the assertion instead of raising.
+    """
+    needle = spec.get("value")
+    # spec.get returns None for a missing key; None is rejected here too.
+    if not isinstance(needle, str):
+        problem = (
+            "not_contains: missing or non-string 'value' key "
+            f"(got {type(needle).__name__!r})"
+        )
+        return AssertionResult(passed=False, type="not_contains", detail=problem)
+
+    # PASS means the substring is nowhere in the output.
+    absent = needle not in result.raw_output
+    if absent:
+        summary = f"not_contains {needle!r} -> PASS (absent from raw_output)"
+    else:
+        summary = f"not_contains {needle!r} -> FAIL (found in raw_output)"
+    return AssertionResult(passed=absent, type="not_contains", detail=summary)
+
+
+def _assert_regex(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
+    """Check that spec["pattern"] matches somewhere in result.raw_output.
+
+    Matching uses ``re.search``, so the pattern may match at any position.
+    A missing or non-string pattern yields FAIL.
+    A pattern that fails to compile yields FAIL with the ``re.error`` text
+    in the detail.
+    """
+    pattern = spec.get("pattern")
+
+    # All outcomes share type="regex"; only the flag and the text vary.
+    def _verdict(ok: bool, text: str) -> AssertionResult:
+        """Build the result for this assertion type."""
+        return AssertionResult(passed=ok, type="regex", detail=text)
+
+    # None (missing key) and non-string values are rejected up front.
+    if not isinstance(pattern, str):
+        return _verdict(
+            False,
+            "regex: missing or non-string 'pattern' key "
+            f"(got {type(pattern).__name__!r})",
+        )
+
+    # re.search compiles the pattern on the fly; compile errors surface here.
+    try:
+        hit = re.search(pattern, result.raw_output) is not None
+    except re.error as exc:
+        return _verdict(False, f"regex {pattern!r} -> FAIL (invalid pattern: {exc})")
+
+    status = "PASS" if hit else "FAIL"
+    note = "match found" if hit else "no match"
+    return _verdict(hit, f"regex {pattern!r} -> {status} ({note} in raw_output)")
+
+
+def _assert_valid_json(
+    _spec: dict[str, object], result: DispatchResult
+) -> AssertionResult:
+    """Check that result.raw_output, minus outer whitespace, parses as JSON.
+
+    Any ``ValueError`` from ``json.loads`` (``JSONDecodeError`` included) is a FAIL.
+    """
+    try:
+        json.loads(result.raw_output.strip())
+    except ValueError as exc:
+        passed = False
+        detail = f"valid_json -> FAIL (JSON parse error: {exc})"
+    else:
+        passed = True
+        detail = "valid_json -> PASS (raw_output is well-formed JSON)"
+
+    return AssertionResult(passed=passed, type="valid_json", detail=detail)
+
+
+def _assert_trigger(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
+    """Check that result.triggered_skill equals spec["skill"].
+
+    A missing or non-string ``skill`` fails the assertion instead of raising.
+    """
+    expected = spec.get("skill")
+    if isinstance(expected, str):
+        actual = result.triggered_skill
+        fired = actual == expected
+        label = "PASS" if fired else "FAIL"
+        text = f"trigger {expected!r} -> {label} (triggered_skill={actual!r})"
+    else:
+        # Covers both an absent key (None) and a value of the wrong type.
+        fired = False
+        text = (
+            "trigger: missing or non-string 'skill' key "
+            f"(got {type(expected).__name__!r})"
+        )
+
+    # Both branches funnel into a single result constructor.
+    return AssertionResult(passed=fired, type="trigger", detail=text)
+
+
+def _assert_not_trigger(
+    spec: dict[str, object], result: DispatchResult
+) -> AssertionResult:
+    """Check that result.triggered_skill differs from spec["skill"].
+
+    SC-3: a ``triggered_skill`` of None counts as "different", so
+    ``not_trigger {"skill": "map-x"}`` PASSES when triggered_skill is None.
+
+    A missing or non-string ``skill`` fails the assertion instead of raising.
+    """
+    unwanted = spec.get("skill")
+    # A missing key yields None, which is rejected like any other non-string.
+    if not isinstance(unwanted, str):
+        reason = (
+            "not_trigger: missing or non-string 'skill' key "
+            f"(got {type(unwanted).__name__!r})"
+        )
+        return AssertionResult(passed=False, type="not_trigger", detail=reason)
+
+    observed = result.triggered_skill
+    # ``None != "<name>"`` evaluates to True, which is exactly the SC-3 rule.
+    stayed_away = observed != unwanted
+    verdict = "PASS" if stayed_away else "FAIL"
+    return AssertionResult(
+        passed=stayed_away,
+        type="not_trigger",
+        detail=f"not_trigger {unwanted!r} -> {verdict} (triggered_skill={observed!r})",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Dispatcher table
+# ---------------------------------------------------------------------------
+
+# Intent: map each assertion type string to its handler function.
+# A dict lookup replaces a long if/elif chain; adding a type is a one-line entry.
+_ASSERTION_HANDLERS = {
+    "contains": _assert_contains,
+    "not_contains": _assert_not_contains,
+    "regex": _assert_regex,
+    "valid_json": _assert_valid_json,
+    "trigger": _assert_trigger,
+    "not_trigger": _assert_not_trigger,
+}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def run_assertion(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
+    """Evaluate one assertion spec against a DispatchResult.
+
+    Never raises on malformed input: a non-dict spec, a missing or non-string
+    ``"type"`` and an unknown type each yield a FAIL result whose ``detail``
+    names the problem.  Otherwise the verdict comes from the type's handler.
+
+    ``spec`` needs a ``"type"`` key plus whatever keys that type requires;
+    ``result`` is the DispatchResult being checked.
+    """
+    # Type hints are not enforced at runtime; reject a non-mapping spec explicitly.
+    if not isinstance(spec, dict):
+        return AssertionResult(
+            passed=False,
+            type="invalid",
+            detail=f"invalid assertion spec: expected dict, got {type(spec).__name__!r}",
+        )
+
+    assertion_type = spec.get("type")
+    if not isinstance(assertion_type, str):
+        return AssertionResult(
+            passed=False,
+            type=str(assertion_type),
+            detail=(
+                f"unknown assertion type: {assertion_type!r} "
+                f"(must be str, got {type(assertion_type).__name__!r})"
+            ),
+        )
+
+    handler = _ASSERTION_HANDLERS.get(assertion_type)
+    if handler is None:
+        return AssertionResult(
+            passed=False,
+            type=assertion_type,
+            detail=f"unknown assertion type: {assertion_type!r}",
+        )
+
+    return handler(spec, result)
+
+
+def run_assertions(
+    specs: list[dict[str, object]],
+    result: DispatchResult,
+) -> tuple[list[str], list[str]]:
+    """Evaluate every spec in *specs* against *result*, in order.
+
+    Returns
+    -------
+    tuple[list[str], list[str]]
+        ``(passed_details, failed_details)``: the ``detail`` strings of the
+        assertions that passed and of those that failed, each in spec order.
+        They are shaped for ``EvalResultRecord.assertions_passed`` and
+        ``EvalResultRecord.assertions_failed``.
+    """
+    # Evaluate each spec exactly once, then partition by verdict.
+    outcomes = [run_assertion(spec, result) for spec in specs]
+
+    passed_details = [
+        outcome.detail for outcome in outcomes if outcome.passed
+    ]
+    failed_details = [
+        outcome.detail for outcome in outcomes if not outcome.passed
+    ]
+
+    return passed_details, failed_details
diff --git a/src/mapify_cli/skills_eval/dispatcher.py b/src/mapify_cli/skills_eval/dispatcher.py
new file mode 100644
index 0000000..e87c406
--- /dev/null
+++ b/src/mapify_cli/skills_eval/dispatcher.py
@@ -0,0 +1,540 @@
+"""Variant dispatcher for the skills_eval package.
+
+Provides the ABC ``VariantDispatcher`` and two concrete implementations:
+- ``MockDispatcher``: zero-subprocess, caller-controlled output for CI tests (INV-2).
+- ``ClaudeSubprocessDispatcher``: real ``claude -p`` invocation in a seeded
+  throwaway temp cwd with the TEMP-FLIP applied.
+
+Hard constraints (INV-2, INV-3, INV-5)
+---------------------------------------
+- Uses only stdlib; no Anthropic SDK imports (INV-3).
+- Does not read cloud credentials from the environment (INV-3).
+- Production ``.claude/`` and ``.map/`` trees are NEVER modified (INV-5).
+  The TEMP-FLIP touches only the throwaway seeded copy.
+- ``MockDispatcher.dispatch`` NEVER calls subprocess (INV-2).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import random
+import shutil
+import subprocess
+import tempfile
+import time
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+from mapify_cli.skills_eval.eval_schema import DispatchResult
+from mapify_cli.token_budget import TokenUsage
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Abstract base
+# ---------------------------------------------------------------------------
+
+
+class VariantDispatcher(ABC):
+    """Abstract interface: turn a prompt string into a ``DispatchResult``."""
+
+    @abstractmethod
+    def dispatch(self, prompt: str) -> DispatchResult:
+        """Execute ``prompt`` and return the resulting ``DispatchResult``.
+
+        Contract for implementations: never raise.  Transient failures are
+        reported through ``DispatchResult.error`` instead.
+        """
+
+
+# ---------------------------------------------------------------------------
+# MockDispatcher — CI / unit-test use only (INV-2: zero subprocess)
+# ---------------------------------------------------------------------------
+
+
+class MockDispatcher(VariantDispatcher):
+    """Test double that returns canned values and never starts a subprocess.
+
+    The CI suite uses it in place of ``ClaudeSubprocessDispatcher`` so no real
+    ``claude`` process runs.  Construct it with the values ``dispatch()`` returns.
+    """
+
+    def __init__(
+        self,
+        *,
+        triggered_skill: str | None = None,
+        raw_output: str = "",
+        token_usage: TokenUsage | None = None,
+        duration_s: float = 0.0,
+        error: str | None = None,
+    ) -> None:
+        self._raw_output = raw_output
+        self._triggered_skill = triggered_skill
+        self._token_usage = token_usage
+        self._duration_s = duration_s
+        self._error = error
+
+    def dispatch(self, prompt: str) -> DispatchResult:
+        """Build a ``DispatchResult`` from the constructor-supplied values.
+
+        Only stored attributes are read: no subprocess, no file I/O (INV-2).
+        ``prompt`` is accepted for interface compatibility and then discarded.
+        """
+        del prompt  # unused on purpose: the canned result ignores the prompt
+        canned = {
+            "raw_output": self._raw_output,
+            "triggered_skill": self._triggered_skill,
+            "token_usage": self._token_usage,
+            "duration_s": self._duration_s,
+            "error": self._error,
+        }
+        return DispatchResult(**canned)
+
+
+# ---------------------------------------------------------------------------
+# Seeding helpers (ClaudeSubprocessDispatcher internals)
+# ---------------------------------------------------------------------------
+
+
+def _seed_temp_cwd(source_claude_dir: Path) -> Path:
+    """Create a throwaway temp directory seeded with a copy of ``.claude/``.
+
+    Steps:
+    1. ``tempfile.mkdtemp()`` creates a fresh directory.
+    2. ``source_claude_dir`` is copied to ``<tmp>/.claude`` (an empty
+       ``.claude/`` is created, with a warning, when the source is missing).
+    3. An empty ``<tmp>/.map`` is created so no production state is visible.
+    4. TEMP-FLIP rewrites ``disable-model-invocation: true`` to ``false`` in
+       the seeded SKILL.md files.
+
+    Returns the temp dir ``Path``; the caller removes it with
+    ``shutil.rmtree(tmp, ignore_errors=True)``.  If seeding itself fails, the
+    partial directory is removed here and the exception is re-raised.
+    """
+    tmp = Path(tempfile.mkdtemp(prefix="mapeval-"))
+    try:
+        seeded_claude = tmp / ".claude"
+        if source_claude_dir.is_dir():
+            shutil.copytree(source_claude_dir, seeded_claude)
+        else:
+            seeded_claude.mkdir(parents=True)
+            logger.warning(
+                "seed_temp_cwd: source_claude_dir %s does not exist — seeding empty .claude/",
+                source_claude_dir,
+            )
+
+        # Empty .map/ keeps the eval from reading production workflow state.
+        (tmp / ".map").mkdir(parents=True)
+        # TEMP-FLIP: make skills model-selectable for the eval (spike VC3).
+        _apply_temp_flip(seeded_claude)
+    except BaseException:
+        # The caller never receives the path on failure, so clean up here.
+        shutil.rmtree(tmp, ignore_errors=True)
+        raise
+    return tmp
+
+
+def _apply_temp_flip(seeded_claude_dir: Path) -> None:
+    """Flip ``disable-model-invocation: true`` to ``false`` in seeded SKILL.md files.
+
+    Only files matching ``skills/*/SKILL.md`` under *seeded_claude_dir* are
+    visited.  Read and write failures are logged as warnings and skipped.
+    Intent: let the eval model select any skill; meant for the throwaway copy.
+    """
+    for skill_md in list(seeded_claude_dir.glob("skills/*/SKILL.md")):
+        try:
+            before = skill_md.read_text(encoding="utf-8")
+        except OSError as exc:
+            logger.warning("temp_flip: could not read %s: %s", skill_md, exc)
+            continue
+
+        after = _flip_disable_invocation_line(before)
+        if after == before:
+            continue
+        try:
+            skill_md.write_text(after, encoding="utf-8")
+        except OSError as exc:
+            logger.warning("temp_flip: could not write %s: %s", skill_md, exc)
+
+
+def _flip_disable_invocation_line(content: str) -> str:
+    """Rewrite every ``disable-model-invocation: true`` line to ``false``.
+
+    A line qualifies when it holds only that key, a colon, whitespace and the
+    value ``true``; spacing around the colon is free and is kept as written.
+    Returns *content* unchanged when no line qualifies.
+    """
+    out: list[str] = []
+    for line in content.splitlines(keepends=True):
+        key, colon, value = line.partition(":")
+        is_flag = key.strip() == "disable-model-invocation" and value[:1].isspace()
+        if colon and is_flag and value.strip() == "true":
+            out.append(key + colon + value.replace("true", "false", 1))
+        else:
+            out.append(line)
+    return "".join(out)
+
+
+# ---------------------------------------------------------------------------
+# Transcript helpers
+# ---------------------------------------------------------------------------
+
+
+def _derive_triggered_skill(session_id: str, cwd: Path) -> str | None:
+    """Return the first skill fired in the session's JSONL transcript, if any.
+
+    Lookup (spike VC3 binding contract), delegated to ``_locate_transcript``:
+    1. Glob ``~/.claude/projects/*/<session_id>.jsonl``.
+    2. Otherwise try the directory named by a slug built from *cwd*.
+    3. No transcript, or an empty *session_id*, yields ``None`` (no crash).
+
+    Detection, delegated to ``_parse_transcript_for_skill``: the first
+    ``message.content[*]`` block with ``type=="tool_use"`` and
+    ``name=="Skill"`` supplies ``input.skill``.  Blocks with any other
+    ``name`` are ignored.
+    """
+    # An empty session id cannot identify a transcript, so skip the lookup.
+    transcript = _locate_transcript(session_id, cwd) if session_id else None
+    if transcript is not None and transcript.exists():
+        return _parse_transcript_for_skill(transcript)
+
+    # Log only when a lookup was attempted and failed.
+    if session_id:
+        logger.debug(
+            "transcript not found for session_id=%s cwd=%s", session_id, cwd
+        )
+    return None
+
+
+def _locate_transcript(session_id: str, cwd: Path) -> Path | None:
+    """Find the session's JSONL transcript under ``~/.claude/projects``, else ``None``."""
+    projects_root = Path.home() / ".claude" / "projects"
+    filename = f"{session_id}.jsonl"
+
+    # Primary lookup: glob by session id, independent of directory-name encoding.
+    if session_id:
+        by_glob = list(projects_root.glob(f"*/{filename}"))
+        if by_glob:
+            return by_glob[0]
+
+    # Fallback: derive the directory name from cwd (``/`` and ``.`` become ``-``).
+    slug = str(cwd).translate(str.maketrans("/.", "--"))
+    by_slug = projects_root / slug / filename
+    if by_slug.exists():
+        return by_slug
+    return None
+
+
+def _parse_transcript_for_skill(path: Path) -> str | None:
+    """Return the first ``Skill`` tool_use ``input.skill`` in *path*, or ``None``.
+
+    Blank and non-JSON lines are skipped; an unreadable file logs a warning.
+    """
+    try:
+        # errors="replace": stray non-UTF-8 bytes must not raise UnicodeDecodeError.
+        with path.open(encoding="utf-8", errors="replace") as fh:
+            for raw_line in fh:
+                try:
+                    entry = json.loads(raw_line)
+                except json.JSONDecodeError:
+                    continue
+                skill = _extract_skill_from_entry(entry)
+                if skill is not None:
+                    return skill
+    except OSError as exc:
+        logger.warning("parse_transcript: could not read %s: %s", path, exc)
+
+    return None
+
+
+def _extract_skill_from_entry(entry: Any) -> str | None:
+    """Return the skill named by the first ``Skill`` tool_use block in *entry*.
+
+    *entry* is one decoded transcript line.  Its ``message.content`` list is
+    scanned for a block with ``type=="tool_use"`` and ``name=="Skill"`` whose
+    ``input.skill`` is a non-empty string.  Anything that does not have that
+    shape is ignored, and ``None`` is returned when nothing qualifies.
+    """
+    # Walk down entry -> message -> content, bailing out on any wrong type.
+    message = entry.get("message") if isinstance(entry, dict) else None
+    content = message.get("content") if isinstance(message, dict) else None
+    if not isinstance(content, list):
+        return None
+
+    skill_calls = (
+        block
+        for block in content
+        if isinstance(block, dict)
+        and block.get("type") == "tool_use"
+        and block.get("name") == "Skill"
+    )
+    for call in skill_calls:
+        arguments = call.get("input")
+        if not isinstance(arguments, dict):
+            continue
+        candidate = arguments.get("skill")
+        # Empty strings and non-strings are skipped; later blocks may qualify.
+        if isinstance(candidate, str) and candidate:
+            return candidate
+
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Envelope parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_envelope(stdout: str) -> tuple[str, TokenUsage | None, str]:
+    """Parse the ``claude -p --output-format json`` result envelope defensively.
+
+    Returns ``(raw_output, token_usage, session_id)``.
+    When *stdout* is not valid JSON, or decodes to anything other than a JSON
+    object, returns ``(stdout, None, "")``.
+
+    A missing or ``null`` ``result`` / ``session_id`` becomes ``""``.
+    ``token_usage`` is ``None`` unless ``usage`` is a JSON object; individual
+    token counts that are missing, ``null`` or not convertible to ``int`` are
+    reported as ``0`` rather than aborting the parse.
+
+    Mirrors ``_parse_claude_output`` / ``_append_cost_log`` from
+    ``memory/finalize.py:232-281``.
+    """
+
+    def _count(value: Any) -> int:
+        # A malformed count must not discard the rest of the envelope.
+        try:
+            return int(value or 0)
+        except (TypeError, ValueError, OverflowError):
+            return 0
+
+    try:
+        parsed = json.loads(stdout)
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        return stdout, None, ""
+
+    if not isinstance(parsed, dict):
+        return stdout, None, ""
+
+    # str(None) would yield the literal text "None"; map null to "" instead.
+    result = parsed.get("result")
+    raw_output = "" if result is None else str(result)
+    session_id = str(parsed.get("session_id") or "")
+
+    usage_raw = parsed.get("usage")
+    token_usage: TokenUsage | None = None
+    if isinstance(usage_raw, dict):
+        token_usage = TokenUsage(
+            input_tokens=_count(usage_raw.get("input_tokens")),
+            cache_read_input_tokens=_count(
+                usage_raw.get("cache_read_input_tokens")
+            ),
+            cache_creation_input_tokens=_count(
+                usage_raw.get("cache_creation_input_tokens")
+            ),
+        )
+
+    return raw_output, token_usage, session_id
+
+
+# ---------------------------------------------------------------------------
+# ClaudeSubprocessDispatcher
+# ---------------------------------------------------------------------------
+
+# Upper bound, in seconds, of the random jitter added to each retry's backoff
+# sleep; the jitter is drawn with ``random.uniform(0, _JITTER_MAX)``.
+_JITTER_MAX: float = 2.0
+
+
+class ClaudeSubprocessDispatcher(VariantDispatcher):
+    """Real ``claude -p`` dispatcher for production/manual eval runs.
+
+    Seeding and cleanup
+    -------------------
+    Each ``dispatch()`` call:
+    1. Creates a fresh temp cwd seeded with a copy of ``source_claude_dir``
+       and an empty ``.map/``.
+    2. Applies TEMP-FLIP so all skills are model-selectable.
+    3. Runs ``claude -p <prompt> --output-format json`` in that temp cwd.
+    4. Removes the temp dir in a ``try/finally`` block.
+
+    Retry policy (VC4)
+    ------------------
+    ``subprocess.TimeoutExpired``, non-zero ``returncode``, and ``OSError``
+    are treated as transient.  Up to ``max_retries`` additional attempts are
+    made with jittered exponential backoff.  After exhaustion the error
+    is recorded in ``DispatchResult.error``; no exception escapes ``dispatch()``.
+
+    INV-3 compliance
+    ----------------
+    No Anthropic SDK import.  No cloud credential environment reads.
+
+    INV-5 compliance
+    ----------------
+    ``cwd`` of the subprocess is always the throwaway temp dir.  Production
+    ``.map/`` is never referenced.
+    """
+
+    def __init__(
+        self,
+        *,
+        source_claude_dir: Path | None = None,
+        timeout: float = 120.0,
+        max_retries: int = 2,
+        backoff_base: float = 2.0,
+    ) -> None:
+        """Initialise the dispatcher.
+
+        Parameters
+        ----------
+        source_claude_dir:
+            Path to the ``.claude/`` directory to seed from.  Defaults to
+            ``Path.cwd() / ".claude"`` at construction time.
+        timeout:
+            Per-attempt timeout in seconds passed to ``subprocess.run``.
+        max_retries:
+            Number of *additional* retry attempts after the first failure.
+            Total attempts = 1 + max_retries.  Negative values are treated
+            as 0 so that one attempt is always made.
+        backoff_base:
+            Base delay in seconds for exponential backoff.  The first attempt
+            runs immediately; retry ``k`` (1-based) is preceded by a sleep of
+            ``backoff_base * 2**(k - 1)`` plus a random jitter in
+            ``[0, _JITTER_MAX]`` seconds.
+        """
+        self._source_claude_dir: Path = (
+            source_claude_dir if source_claude_dir is not None else Path.cwd() / ".claude"
+        )
+        self._timeout = timeout
+        # Clamp: range(max_retries + 1) must yield at least one attempt.
+        self._max_retries = max(0, max_retries)
+        self._backoff_base = backoff_base
+        # Holds the error message from the latest _run_once call. Instance-scoped
+        # (not class-level) so the safe-sequential-only assumption is explicit.
+        self._last_error: str = ""
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def dispatch(self, prompt: str) -> DispatchResult:
+        """Dispatch ``prompt`` via ``claude -p``, with backoff retry on failure.
+
+        Always returns a ``DispatchResult``; any ``Exception`` raised while
+        seeding or dispatching is converted into ``DispatchResult.error``.
+        The temp cwd, once created, is removed before returning.
+        """
+        t_total_start = time.monotonic()
+        tmp: Path | None = None
+
+        try:
+            tmp = _seed_temp_cwd(self._source_claude_dir)
+            return self._dispatch_with_retry(prompt, tmp, t_total_start)
+        except Exception as exc:  # noqa: BLE001
+            # ``tmp`` is still None only when seeding itself raised; anything
+            # after that (retry loop, result building) is a dispatch failure
+            # and must not be reported as a seeding problem.
+            stage = "seeding" if tmp is None else "dispatch"
+            duration_s = time.monotonic() - t_total_start
+            logger.warning("dispatch: unexpected error during %s: %s", stage, exc)
+            return DispatchResult(
+                raw_output="",
+                triggered_skill=None,
+                token_usage=None,
+                duration_s=duration_s,
+                error=f"{stage} error: {exc}",
+            )
+        finally:
+            if tmp is not None:
+                shutil.rmtree(tmp, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _dispatch_with_retry(
+        self,
+        prompt: str,
+        tmp: Path,
+        t_total_start: float,
+    ) -> DispatchResult:
+        """Run the subprocess, retrying with jittered exponential backoff.
+
+        ``max_retries=2`` means up to 3 total attempts (attempt 0, 1, 2).
+        The first successful attempt is turned into a result immediately.
+        After all attempts are exhausted, returns an error ``DispatchResult``
+        carrying the message of the last failed attempt.
+        """
+        argv = ["claude", "-p", prompt, "--output-format", "json"]
+        last_error: str = ""
+
+        for attempt in range(self._max_retries + 1):
+            if attempt > 0:
+                # Delay doubles with every retry; jitter de-synchronises runs.
+                sleep_s = self._backoff_base * (2 ** (attempt - 1)) + random.uniform(
+                    0, _JITTER_MAX
+                )
+                logger.debug(
+                    "dispatch: retry attempt %d/%d — sleeping %.2fs",
+                    attempt,
+                    self._max_retries,
+                    sleep_s,
+                )
+                time.sleep(sleep_s)
+
+            result = self._run_once(argv, tmp)
+            if result is not None:
+                # Successful subprocess run — parse and return.
+                return self._build_result(result, tmp, t_total_start)
+
+            # _run_once returned None => transient failure; last_error was set.
+            last_error = self._last_error
+
+        duration_s = time.monotonic() - t_total_start
+        return DispatchResult(
+            raw_output="",
+            triggered_skill=None,
+            token_usage=None,
+            duration_s=duration_s,
+            error=last_error or "dispatch failed after retries",
+        )
+
+    def _run_once(
+        self,
+        argv: list[str],
+        cwd: Path,
+    ) -> subprocess.CompletedProcess[str] | None:
+        """Run ``argv`` once; return ``CompletedProcess`` on success, ``None`` on failure.
+
+        Failure means a timeout, any exception from ``subprocess.run``, or a
+        non-zero return code.  The child inherits the current environment plus
+        ``MAP_INVOKED_BY=skills-eval``.
+
+        Side-effect: sets ``self._last_error`` on failure.
+        """
+        try:
+            proc = subprocess.run(
+                argv,
+                capture_output=True,
+                text=True,
+                timeout=self._timeout,
+                cwd=cwd,
+                env={**os.environ, "MAP_INVOKED_BY": "skills-eval"},
+            )
+        except subprocess.TimeoutExpired as exc:
+            self._last_error = f"timeout after {self._timeout}s: {exc}"
+            logger.warning("dispatch: subprocess timed out: %s", exc)
+            return None
+        except OSError as exc:
+            self._last_error = f"OSError: {exc}"
+            logger.warning("dispatch: OSError running claude: %s", exc)
+            return None
+        except Exception as exc:  # noqa: BLE001
+            self._last_error = f"unexpected subprocess error: {exc}"
+            logger.warning("dispatch: unexpected subprocess error: %s", exc)
+            return None
+
+        if proc.returncode != 0:
+            # Only the first 200 characters of stderr are kept in messages.
+            stderr_head = (proc.stderr or "")[:200].strip()
+            self._last_error = (
+                f"non-zero returncode {proc.returncode}: {stderr_head}"
+            )
+            logger.warning(
+                "dispatch: claude returned returncode=%d stderr=%s",
+                proc.returncode,
+                stderr_head,
+            )
+            return None
+
+        return proc
+
+    def _build_result(
+        self,
+        proc: subprocess.CompletedProcess[str],
+        tmp: Path,
+        t_start: float,
+    ) -> DispatchResult:
+        """Build the success ``DispatchResult`` from a completed subprocess.
+
+        Parses the stdout envelope, measures the elapsed time since *t_start*
+        and derives the triggered skill from the envelope's session id and
+        the temp cwd.
+        """
+        stdout = proc.stdout or ""
+        raw_output, token_usage, session_id = _parse_envelope(stdout)
+        duration_s = time.monotonic() - t_start
+        triggered_skill = _derive_triggered_skill(session_id, tmp)
+
+        return DispatchResult(
+            raw_output=raw_output,
+            triggered_skill=triggered_skill,
+            token_usage=token_usage,
+            duration_s=duration_s,
+            error=None,
+        )
diff --git a/src/mapify_cli/skills_eval/eval_schema.py b/src/mapify_cli/skills_eval/eval_schema.py
new file mode 100644
index 0000000..a50766e
--- /dev/null
+++ b/src/mapify_cli/skills_eval/eval_schema.py
@@ -0,0 +1,180 @@
+"""Shared data contracts for the skills_eval package.
+
+All structures are defined EXACTLY ONCE here and imported by every eval
+component (dispatcher, assertions, runner, aggregator).  This module is a
+pure data layer — no dispatch logic, transcript parsing, assertion execution,
+or I/O of any kind.
+
+INV-3: No ``import anthropic`` and no ANTHROPIC_API_KEY access anywhere.
+INV-6: Contract-first — producer and consumer both import from this module.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+from dataclasses import dataclass, field
+from typing import Any
+
+from mapify_cli.token_budget import TokenUsage
+
+
+# ---------------------------------------------------------------------------
+# EvalSetEntry
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class EvalSetEntry:
+    """A single eval-set row as read from a JSON eval-set file.
+
+    Instances are built from external JSON, where type hints enforce nothing,
+    so ``__post_init__`` checks the top-level type of every field and raises
+    ``ValueError`` on the first mismatch.
+    """
+
+    prompt: str
+    should_trigger: str | None
+    should_not_trigger: str | None
+    assertions: list[dict]  # type: ignore[type-arg]
+
+    def __post_init__(self) -> None:
+        """Validate field types in declaration order."""
+        if not isinstance(self.prompt, str):
+            raise ValueError(
+                f"EvalSetEntry.prompt must be str, got {type(self.prompt).__name__!r}"
+            )
+
+        # Both trigger expectations share one rule: a string or None.
+        for name in ("should_trigger", "should_not_trigger"):
+            value = getattr(self, name)
+            if not (value is None or isinstance(value, str)):
+                raise ValueError(
+                    f"EvalSetEntry.{name} must be str or None, "
+                    f"got {type(value).__name__!r}"
+                )
+
+        if not isinstance(self.assertions, list):
+            raise ValueError(
+                "EvalSetEntry.assertions must be list, "
+                f"got {type(self.assertions).__name__!r}"
+            )
+
+
+# ---------------------------------------------------------------------------
+# DispatchResult
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DispatchResult:
+    """Result returned by the skill dispatcher for a single prompt.
+
+    ``error`` is ``None`` on success; the dispatcher sets it to a message when
+    the dispatch fails, in which case ``token_usage`` may be ``None``.
+    ``TokenUsage`` is imported from ``mapify_cli.token_budget``; it is NOT
+    redefined here (INV-6).
+    """
+
+    # Output text produced for the prompt.
+    raw_output: str
+    # Name of the skill the dispatcher detected as triggered, or None.
+    triggered_skill: str | None
+    # Token accounting for the dispatch, or None when unavailable.
+    token_usage: TokenUsage | None
+    # Elapsed time of the dispatch, in seconds.
+    duration_s: float
+    # Failure description; None means the dispatch succeeded.
+    error: str | None = None
+
+
+# ---------------------------------------------------------------------------
+# EvalResultRecord  (append-only .jsonl row)
+# ---------------------------------------------------------------------------
+
+# Sentinel default for ``d.get("token_usage", ...)`` in ``from_dict``: marks
+# "key absent" as opposed to "key present but None".  Both cases currently
+# produce ``token_usage=None``.
+_MISSING: object = object()
+
+
+@dataclass
+class EvalResultRecord:
+    """One completed eval result, serialisable to/from a JSON object.
+
+    Used for the append-only ``.jsonl`` result file written by the runner
+    (ST-005).  ``to_dict`` / ``from_dict`` provide a stable round-trip.
+    ``TokenUsage`` is a flat 3-int frozen dataclass; it is serialised as a
+    nested dict (via ``dataclasses.asdict``) and reconstructed in
+    ``from_dict``.
+    """
+
+    cell_id: str
+    prompt: str
+    triggered_skill: str | None
+    token_usage: TokenUsage | None
+    duration_s: float
+    assertions_passed: list[str] = field(default_factory=list)
+    assertions_failed: list[str] = field(default_factory=list)
+    raw_output: str = ""
+
+    # ------------------------------------------------------------------
+    # Serialisation helpers
+    # ------------------------------------------------------------------
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a JSON-serialisable dict for this record.
+
+        ``token_usage`` is either a nested dict (3 keys) or ``None``.  The
+        assertion lists are copied so the result does not alias the record.
+        """
+        return {
+            "cell_id": self.cell_id,
+            "prompt": self.prompt,
+            "triggered_skill": self.triggered_skill,
+            "token_usage": (
+                dataclasses.asdict(self.token_usage)
+                if self.token_usage is not None
+                else None
+            ),
+            "duration_s": self.duration_s,
+            "assertions_passed": list(self.assertions_passed),
+            "assertions_failed": list(self.assertions_failed),
+            "raw_output": self.raw_output,
+        }
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> "EvalResultRecord":
+        """Reconstruct an ``EvalResultRecord`` from a plain dict (JSON parse).
+
+        ``cell_id``, ``prompt`` and ``duration_s`` are required; a missing one
+        raises ``KeyError``.  Everything else is optional (backward
+        compatibility with older .jsonl rows), and an explicit ``null`` is
+        treated exactly like a missing key: ``token_usage`` becomes ``None``,
+        individual token counts become ``0``, the assertion lists become
+        ``[]`` and ``raw_output`` becomes ``""``.
+        """
+        raw_tu = d.get("token_usage", _MISSING)
+        token_usage: TokenUsage | None
+        if raw_tu is _MISSING or raw_tu is None:
+            token_usage = None
+        else:
+            # ``or 0``: int(None) would raise TypeError on a null count.
+            token_usage = TokenUsage(
+                input_tokens=int(raw_tu.get("input_tokens") or 0),
+                cache_read_input_tokens=int(
+                    raw_tu.get("cache_read_input_tokens") or 0
+                ),
+                cache_creation_input_tokens=int(
+                    raw_tu.get("cache_creation_input_tokens") or 0
+                ),
+            )
+        return cls(
+            cell_id=d["cell_id"],
+            prompt=d["prompt"],
+            triggered_skill=d.get("triggered_skill"),
+            token_usage=token_usage,
+            duration_s=float(d["duration_s"]),
+            # ``or []`` / ``or ""``: list(None) raises, and a None raw_output
+            # would violate the field's ``str`` type.
+            assertions_passed=list(d.get("assertions_passed") or []),
+            assertions_failed=list(d.get("assertions_failed") or []),
+            raw_output=d.get("raw_output") or "",
+        )
+
+
+# ---------------------------------------------------------------------------
+# make_cell_id
+# ---------------------------------------------------------------------------
+
+
+def make_cell_id(prompt_index: int, variant_id: int, run_number: int) -> str:
+    """Build the identifier of one evaluation cell.
+
+    The identifier depends only on the three arguments, never on randomness
+    or the clock, so ``--resume`` can recognise cells written by an earlier
+    run.
+
+    Example: ``make_cell_id(0, 1, 2)`` → ``"p0-v1-r2"``
+    """
+    parts = (f"p{prompt_index}", f"v{variant_id}", f"r{run_number}")
+    return "-".join(parts)
diff --git a/src/mapify_cli/skills_eval/runner.py b/src/mapify_cli/skills_eval/runner.py
new file mode 100644
index 0000000..610d5d6
--- /dev/null
+++ b/src/mapify_cli/skills_eval/runner.py
@@ -0,0 +1,425 @@
+"""Matrix runner for skill eval: prompts x runs -> durable resumable .jsonl.
+
+Public API (plain functions; no Typer -- CLI wiring is ST-007):
+- ``load_eval_set(path)``                      -- parse a JSON eval-set file.
+- ``run_eval(...)``                             -- execute the p x r matrix, append results.
+- ``default_run_path(root, skill, timestamp)`` -- canonical .jsonl path helper.
+- ``latest_run_path(root, skill)``              -- find most-recent .jsonl for --resume.
+
+Design invariants respected:
+- INV-3: no ``import anthropic``, no ANTHROPIC_API_KEY access.
+- INV-7: ``triggered_skill`` is consumed from ``DispatchResult.triggered_skill``
+         (the dispatcher is the SINGLE source of trigger detection).  The runner
+         does NOT parse transcripts.
+- D10:   variant_id is always 1 (no variants loop).
+- INV-4: each cell is flushed to disk immediately (durable per-cell append).
+- VC3:   resume reads existing cell_ids, skips already-written cells, appends only
+         missing ones to the SAME file.
+- VC4:   a per-cell dispatch error is recorded (not raised); matrix continues.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from mapify_cli.skills_eval.assertions import run_assertions
+from mapify_cli.skills_eval.dispatcher import VariantDispatcher
+from mapify_cli.skills_eval.eval_schema import (
+    DispatchResult,
+    EvalResultRecord,
+    EvalSetEntry,
+    make_cell_id,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Intent: fixed variant_id per D10 -- never enter a variants loop.  Every
+# ``make_cell_id`` call in this module passes this value.
+_VARIANT_ID: int = 1
+
+
+# ---------------------------------------------------------------------------
+# load_eval_set
+# ---------------------------------------------------------------------------
+
+
+def load_eval_set(path: Path) -> list[EvalSetEntry]:
+    """Load an eval-set JSON file into a list of ``EvalSetEntry`` rows.
+
+    Expected JSON shape::
+
+        {
+            "entries": [
+                {
+                    "prompt": "<str>",
+                    "should_trigger": "<str or null>",
+                    "should_not_trigger": "<str or null>",
+                    "assertions": [ {"type": "...", ...}, ... ]
+                },
+                ...
+            ]
+        }
+
+    ``should_trigger`` and ``should_not_trigger`` default to ``None`` and
+    ``assertions`` defaults to ``[]`` when a row omits them.
+
+    Parameters
+    ----------
+    path:
+        Filesystem path to the ``.json`` eval-set file.
+
+    Returns
+    -------
+    list[EvalSetEntry]
+        Non-empty list of parsed rows, in file order.
+
+    Raises
+    ------
+    ValueError
+        On: missing or unreadable file, file not valid JSON, top level not a
+        JSON object, missing / non-array / empty "entries", a row that is not
+        an object or lacks "prompt", non-array "assertions", or any row that
+        fails ``EvalSetEntry.__post_init__`` validation.
+    """
+    if not path.exists():
+        raise ValueError(f"eval-set file not found: {path}")
+
+    # Reading and decoding fail with disjoint exception types, so one ``try``
+    # can map each to its own message.
+    try:
+        data: Any = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        raise ValueError(f"could not read eval-set file {path}: {exc}") from exc
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"eval-set file is not valid JSON ({path}): {exc}") from exc
+
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"eval-set file must be a JSON object (got {type(data).__name__!r}): {path}"
+        )
+
+    rows: Any = data.get("entries")
+    if rows is None:
+        raise ValueError(f'eval-set file missing required "entries" key: {path}')
+    if not isinstance(rows, list):
+        raise ValueError(
+            f'"entries" must be a JSON array (got {type(rows).__name__!r}): {path}'
+        )
+    if not rows:
+        raise ValueError(f'"entries" list must not be empty: {path}')
+
+    parsed: list[EvalSetEntry] = []
+    for row_index, row in enumerate(rows):
+        if not isinstance(row, dict):
+            raise ValueError(
+                f"entries[{row_index}] must be a JSON object "
+                f"(got {type(row).__name__!r}): {path}"
+            )
+
+        prompt: Any = row.get("prompt")
+        if prompt is None:
+            raise ValueError(
+                f'entries[{row_index}] missing required "prompt" key: {path}'
+            )
+
+        assertions: Any = row.get("assertions", [])
+        if not isinstance(assertions, list):
+            raise ValueError(
+                f"entries[{row_index}].assertions must be a JSON array "
+                f"(got {type(assertions).__name__!r}): {path}"
+            )
+
+        try:
+            parsed.append(
+                EvalSetEntry(
+                    prompt=prompt,
+                    should_trigger=row.get("should_trigger"),
+                    should_not_trigger=row.get("should_not_trigger"),
+                    assertions=assertions,
+                )
+            )
+        except ValueError as exc:
+            raise ValueError(
+                f"entries[{row_index}] failed validation: {exc}"
+            ) from exc
+
+    return parsed
+
+
+# ---------------------------------------------------------------------------
+# _read_present_cell_ids  (resume helper)
+# ---------------------------------------------------------------------------
+
+
+def _read_present_cell_ids(out_path: Path) -> set[str]:
+    """Collect every ``cell_id`` already recorded in *out_path*.
+
+    The file is read as JSON Lines.  Blank lines, lines that fail to decode
+    as JSON (e.g. a last line cut off mid-write), rows that are not objects
+    and rows without a non-empty string ``cell_id`` are ignored, so resume
+    never crashes on a damaged file.  If the file cannot be read, a warning
+    is logged and whatever was collected so far is returned.
+    """
+    seen: set[str] = set()
+    try:
+        with open(out_path, encoding="utf-8") as fh:
+            for text in map(str.strip, fh):
+                if not text:
+                    continue
+                try:
+                    decoded: Any = json.loads(text)
+                except json.JSONDecodeError:
+                    logger.debug(
+                        "_read_present_cell_ids: skipping malformed line in %s", out_path
+                    )
+                    continue
+                cell_id = decoded.get("cell_id") if isinstance(decoded, dict) else None
+                if isinstance(cell_id, str) and cell_id:
+                    seen.add(cell_id)
+    except OSError as exc:
+        logger.warning(
+            "_read_present_cell_ids: could not read %s: %s -- treating as empty",
+            out_path,
+            exc,
+        )
+    return seen
+
+
+# ---------------------------------------------------------------------------
+# _build_assertion_specs  (per-cell helper)
+# ---------------------------------------------------------------------------
+
+
+def _build_assertion_specs(entry: EvalSetEntry) -> list[dict[str, object]]:
+    """Return the full assertion spec list for *entry*.
+
+    The list is a fresh copy of ``entry.assertions`` followed by a
+    ``{"type": "trigger", ...}`` spec when ``should_trigger`` is set and a
+    ``{"type": "not_trigger", ...}`` spec when ``should_not_trigger`` is set,
+    in that order.  It is what gets passed to ``run_assertions``.
+    """
+    expectations = (
+        ("trigger", entry.should_trigger),
+        ("not_trigger", entry.should_not_trigger),
+    )
+    trigger_specs: list[dict[str, object]] = [
+        {"type": kind, "skill": expected}
+        for kind, expected in expectations
+        if expected is not None
+    ]
+    return [*entry.assertions, *trigger_specs]
+
+
+# ---------------------------------------------------------------------------
+# evaluate_cell / run_eval
+# ---------------------------------------------------------------------------
+
+
+def evaluate_cell(
+    *,
+    skill: str,
+    entry: EvalSetEntry,
+    prompt_index: int,
+    run_number: int,
+    dispatcher: VariantDispatcher,
+) -> EvalResultRecord:
+    """Run a single matrix cell and return its ``EvalResultRecord``.
+
+    The prompt of *entry* is dispatched once.  On success the entry's
+    assertion specs are evaluated against the dispatch result; on a dispatch
+    error no assertions run and the record carries a single failed item of
+    the form ``"dispatch_error: <message>"``.
+
+    Nothing is written to disk here -- persisting the record (INV-4) is the
+    caller's job.  Shared by ``run_eval`` (sequential) and ``bounded_run``
+    (concurrent) so dispatch+assertion logic is defined exactly once (DRY).
+    *skill* is used for logging only.
+
+    Design invariants
+    -----------------
+    - D10: variant_id is always ``_VARIANT_ID`` (1).
+    - INV-7: ``triggered_skill`` is read from ``DispatchResult.triggered_skill``
+             only -- the runner never parses transcripts.
+    - VC4: per-cell ``DispatchResult.error`` is recorded (not raised); callers
+           decide whether to abort or continue.
+    """
+    cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
+
+    # The VariantDispatcher contract says dispatch() does not raise.
+    outcome: DispatchResult = dispatcher.dispatch(entry.prompt)
+    specs = _build_assertion_specs(entry)
+
+    passed: list[str]
+    failed: list[str]
+    if outcome.error is None:
+        passed, failed = run_assertions(specs, outcome)
+    else:
+        # VC4: surface the error as a synthetic failed assertion and carry on.
+        logger.warning(
+            "evaluate_cell: cell %s dispatch error (skill=%s run=%d): %s",
+            cell_id,
+            skill,
+            run_number,
+            outcome.error,
+        )
+        passed, failed = [], [f"dispatch_error: {outcome.error}"]
+
+    return EvalResultRecord(
+        cell_id=cell_id,
+        prompt=entry.prompt,
+        triggered_skill=outcome.triggered_skill,
+        token_usage=outcome.token_usage,
+        duration_s=outcome.duration_s,
+        assertions_passed=passed,
+        assertions_failed=failed,
+        raw_output=outcome.raw_output,
+    )
+
+
+def run_eval(
+    *,
+    skill: str,
+    entries: list[EvalSetEntry],
+    dispatcher: VariantDispatcher,
+    runs: int,
+    out_path: Path,
+    resume: bool = False,
+) -> list[EvalResultRecord]:
+    """Run every (prompt, run) cell and append each result to *out_path*.
+
+    Cells are visited prompt by prompt, and for each prompt run by run.
+
+    Parameters
+    ----------
+    skill:
+        Name of the skill under evaluation (used for logging only).
+    entries:
+        Eval-set rows from ``load_eval_set``.
+    dispatcher:
+        ``VariantDispatcher`` instance (``MockDispatcher`` in tests,
+        ``ClaudeSubprocessDispatcher`` in production).
+    runs:
+        Number of runs per prompt (``range(runs)``).
+    out_path:
+        Path to the ``.jsonl`` output file.  Its parent directories are
+        created if absent, and records are always written in append mode.
+    resume:
+        If True and *out_path* exists, cells whose ``cell_id`` is already in
+        the file are skipped and only the missing ones are appended to the
+        SAME file.  If False (default), nothing is skipped and an existing
+        file is NOT truncated -- passing a fresh path is the caller's
+        responsibility.
+
+    Returns
+    -------
+    list[EvalResultRecord]
+        Records written *during this call* (skipped/resumed cells are not
+        included -- callers that need the full result set should read out_path).
+
+    Design invariants
+    -----------------
+    - D10: variant_id is always ``_VARIANT_ID`` (1) -- NO variants loop.
+    - INV-7: ``triggered_skill`` is read from ``DispatchResult.triggered_skill``
+             only -- the runner never parses transcripts.
+    - INV-4: each record is flushed to *out_path* immediately after building.
+    - VC4: per-cell ``DispatchResult.error`` is recorded; matrix is never aborted.
+    """
+    # Cells that an earlier run already persisted (empty unless resuming).
+    done: set[str] = set()
+    if resume and out_path.exists():
+        done = _read_present_cell_ids(out_path)
+        logger.info(
+            "run_eval: resume mode -- %d cells already present in %s",
+            len(done),
+            out_path,
+        )
+
+    # The first append needs the directory to be there.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    new_records: list[EvalResultRecord] = []
+
+    # Matrix p x r: prompts outside, runs inside; the variant is fixed (D10).
+    for prompt_index, entry in enumerate(entries):
+        for run_number in range(runs):
+            cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
+
+            if cell_id not in done:
+                record = evaluate_cell(
+                    skill=skill,
+                    entry=entry,
+                    prompt_index=prompt_index,
+                    run_number=run_number,
+                    dispatcher=dispatcher,
+                )
+                # INV-4: persist this cell before moving on to the next one.
+                _append_record(out_path, record)
+                new_records.append(record)
+            else:
+                logger.debug(
+                    "run_eval: skipping cell %s (already present in %s)",
+                    cell_id,
+                    out_path,
+                )
+
+    logger.info(
+        "run_eval: finished skill=%s entries=%d runs=%d cells_written=%d out=%s",
+        skill,
+        len(entries),
+        runs,
+        len(new_records),
+        out_path,
+    )
+
+    return new_records
+
+
+# ---------------------------------------------------------------------------
+# _append_record  (durable per-cell write)
+# ---------------------------------------------------------------------------
+
+
+def _append_record(out_path: Path, record: EvalResultRecord) -> None:
+    """Append *record* to *out_path* as one JSON line.
+
+    The record is serialised (``record.to_dict()`` + ``json.dumps``) before
+    the file is opened, then written in append mode, following the
+    ``open(path, "a", ...)`` precedent from ``memory/capture.py:446``.  The
+    stream is flushed right after the write, which hands the data to the
+    operating system; ``os.fsync`` is intentionally not called so the matrix
+    does not block on every cell.
+    """
+    serialised = json.dumps(record.to_dict())
+    with open(out_path, "a", encoding="utf-8") as fh:
+        # print() appends the "\n" terminator and flushes in one call.
+        print(serialised, file=fh, flush=True)
+
+
+# ---------------------------------------------------------------------------
+# Path helpers
+# ---------------------------------------------------------------------------
+
+
+def default_run_path(root: Path, skill: str, timestamp: str) -> Path:
+    """Build the canonical ``.jsonl`` path for a new eval run.
+
+    Parameters
+    ----------
+    root:
+        Project root (the directory that contains ``.map/``).
+    skill:
+        Skill name (used as a subdirectory component).
+    timestamp:
+        Caller-supplied timestamp string, e.g.
+        ``datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")``.
+        It is passed in rather than read from the clock here, which keeps
+        the runner clock-free (testable).
+
+    Returns
+    -------
+    Path
+        ``<root>/.map/eval-runs/<skill>/<timestamp>.jsonl``
+    """
+    return root.joinpath(".map", "eval-runs", skill, f"{timestamp}.jsonl")
+
+
+def latest_run_path(root: Path, skill: str) -> Path | None:
+    """Find the newest ``.jsonl`` run file for *skill*, or ``None``.
+
+    Looks at the ``*.jsonl`` files directly inside
+    ``<root>/.map/eval-runs/<skill>/`` and picks the one that sorts last;
+    with ISO-timestamp filenames that is the most recent run.  Returns
+    ``None`` if the directory does not exist or holds no ``.jsonl`` file.
+    """
+    run_dir = root.joinpath(".map", "eval-runs", skill)
+    if not run_dir.is_dir():
+        return None
+    # The greatest path is the last element of the sorted listing.
+    return max(run_dir.glob("*.jsonl"), default=None)
diff --git a/src/mapify_cli/templates/hooks/safety-guardrails.py b/src/mapify_cli/templates/hooks/safety-guardrails.py
index 04fd888..48e671c 100755
--- a/src/mapify_cli/templates/hooks/safety-guardrails.py
+++ b/src/mapify_cli/templates/hooks/safety-guardrails.py
@@ -38,7 +38,12 @@
 
 # Dangerous bash command patterns
 _DEFAULT_DANGEROUS_COMMANDS = [
-    r"rm\s+-rf\s+/",  # rm -rf /
+    # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+    # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/<dir>,
+    # /private/tmp/<dir>, /var/folders/<dir>, /var/tmp/<dir>) — legitimate
+    # scratch cleanup. The negative lookahead requires a trailing slash, so the
+    # temp root itself (`rm -rf /tmp`) stays blocked; only children are allowed.
+    r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)",  # rm -rf / (non-temp)
     r"rm\s+-rf\s+\*",  # rm -rf *
     r"rm\s+-rf\s+\.\.",  # rm -rf ..
     r"git\s+push.*--force.*main",
diff --git a/src/mapify_cli/templates/map/scripts/map_orchestrator.py b/src/mapify_cli/templates/map/scripts/map_orchestrator.py
index 03ea61c..013227f 100755
--- a/src/mapify_cli/templates/map/scripts/map_orchestrator.py
+++ b/src/mapify_cli/templates/map/scripts/map_orchestrator.py
@@ -2166,6 +2166,29 @@ def _is_cross_repo_path(p: str) -> bool:
             diff_paths = set()
         if diff_paths:
             files_not_in_diff = [p for p in declared if p not in diff_paths]
+        # Gitignored deliverables (e.g. .map/ workflow artifacts like spike
+        # docs or eval-run .jsonl) never appear in git diff/status by design —
+        # that is NOT Actor truncation. Drop any declared path that
+        # `git check-ignore` reports as ignored so it does not raise a false
+        # "Possible Actor truncation" warning. A gitignored file that is also
+        # missing from disk is still flagged separately via missing_files.
+        if files_not_in_diff:
+            try:
+                igproc = _sp.run(
+                    ["git", "check-ignore", "--", *files_not_in_diff],
+                    cwd=project_dir, capture_output=True, text=True, timeout=5,
+                )
+                ignored = {
+                    line.strip()
+                    for line in igproc.stdout.splitlines()
+                    if line.strip()
+                }
+                if ignored:
+                    files_not_in_diff = [
+                        p for p in files_not_in_diff if p not in ignored
+                    ]
+            except (OSError, _sp.TimeoutExpired):
+                pass
 
     state.record_subtask_result(
         subtask_id,
diff --git a/src/mapify_cli/templates/skills/map-efficient/SKILL.md b/src/mapify_cli/templates/skills/map-efficient/SKILL.md
index b986b52..2045905 100644
--- a/src/mapify_cli/templates/skills/map-efficient/SKILL.md
+++ b/src/mapify_cli/templates/skills/map-efficient/SKILL.md
@@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
 
 Snapshots pre-existing failures so later subtasks distinguish
 "introduced regression" from "was broken pre-plan". Auto-detects
-Make/pytest/go test/cargo. Overrides + narrow-target guidance:
-[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
+Make/pytest/go test/cargo. It captures the test run internally and prints a
+single compact JSON report at the end — read that JSON directly; do NOT pipe it
+through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target
+guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
 
 ### Wave Computation (after INIT_STATE) - REQUIRED
 
diff --git a/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md b/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md
index 802d11c..6734cfc 100644
--- a/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md
+++ b/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md
@@ -203,6 +203,11 @@ fix or defer.
 python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
 ```
 
+It captures the test run internally and prints a single compact JSON report at
+the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the
+repo bash guidelines); the output is one small object, not a stream, so
+truncating it only hides fields.
+
 Auto-detects from project markers:
 - `Makefile` with `test:` target → `make test`
 - `pyproject.toml` / `pytest.ini` → `pytest`
diff --git a/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md
new file mode 100644
index 0000000..567ac04
--- /dev/null
+++ b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md
@@ -0,0 +1,94 @@
+---
+name: map-skill-eval
+description: |
+  Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient.
+effort: medium
+disable-model-invocation: true
+argument-hint: "[skill] [--eval-set PATH]"
+---
+# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation
+
+Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill.
+
+Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`.
+
+## Invocation
+
+```bash
+mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N]
+```
+
+- `<skill>` — the skill name to evaluate (e.g. `map-plan`).
+- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions.
+- `--dry-run` — validate the eval-set and print the planned run count without spending any quota.
+- `--resume` — continue an interrupted run from the last durable checkpoint.
+- `--max-concurrency N` — max parallel `claude -p` workers (default: 1).
+
+## What It Does
+
+1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases.
+2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger).
+3. **Deterministic assertions** — each eval case may specify one or more assertion types:
+   - `contains` / `not_contains` — substring presence in the response.
+   - `regex` — pattern match against the response.
+   - `valid_json` — response parses as JSON.
+   - `trigger` / `not_trigger` — skill fired / did not fire.
+4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`.
+5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus per-case token usage, duration, and cache-hit stats.
+
+## Eval-Set Format
+
+A JSON object with an `entries` array. Each entry has a `prompt`, optional
+`should_trigger` / `should_not_trigger` skill names (the runner turns these into
+`trigger` / `not_trigger` assertions), and an optional `assertions` array.
+Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`,
+`not_trigger`.
+
+```json
+{
+  "entries": [
+    {
+      "prompt": "Decompose this feature into subtasks",
+      "should_trigger": "map-plan",
+      "assertions": [
+        { "type": "contains", "value": "subtask" }
+      ]
+    },
+    {
+      "prompt": "Run quality gates",
+      "should_not_trigger": "map-plan",
+      "assertions": []
+    }
+  ]
+}
+```
+
+## --dry-run
+
+`--dry-run` validates the eval-set schema and prints the planned case count with estimated quota usage. No `claude -p` calls are made; no `.jsonl` is written.
+
+## Examples
+
+```bash
+# Validate eval-set without spending quota
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run
+
+# Run full eval with up to 8 parallel workers
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8
+
+# Resume an interrupted run
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume
+```
+
+## Troubleshooting
+
+- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill.
+- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with an `entries` array, that each entry has a `prompt`, and that every item in its optional `assertions` array uses a valid `type`.
+- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl`. If no prior run exists, omit `--resume` to start fresh.
+- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd.
+
+## Related Commands
+
+- `/map-plan` — plan and decompose tasks.
+- `/map-efficient` — full MAP workflow execution.
+- `/map-check` — run quality gates and verify MAP workflow completion.
diff --git a/src/mapify_cli/templates/skills/skill-rules.json b/src/mapify_cli/templates/skills/skill-rules.json
index bbe32ab..d5a9606 100644
--- a/src/mapify_cli/templates/skills/skill-rules.json
+++ b/src/mapify_cli/templates/skills/skill-rules.json
@@ -239,6 +239,18 @@
         ]
       }
     },
+    "map-skill-eval": {
+      "type": "manual",
+      "skillClass": "task",
+      "enforcement": "manual",
+      "priority": "medium",
+      "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).",
+      "requires-cmd": ["claude"],
+      "promptTriggers": {
+        "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"],
+        "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"]
+      }
+    },
     "map-task": {
       "type": "manual",
       "skillClass": "task",
diff --git a/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja b/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja
index 04fd888..48e671c 100755
--- a/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja
+++ b/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja
@@ -38,7 +38,12 @@ _DEFAULT_DANGEROUS_FILE_PATTERNS = [
 
 # Dangerous bash command patterns
 _DEFAULT_DANGEROUS_COMMANDS = [
-    r"rm\s+-rf\s+/",  # rm -rf /
+    # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+    # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/<dir>,
+    # /private/tmp/<dir>, /var/folders/<dir>, /var/tmp/<dir>) — legitimate
+    # scratch cleanup. The negative lookahead requires a trailing slash, so the
+    # temp root itself (`rm -rf /tmp`) stays blocked; only children are allowed.
+    r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)",  # rm -rf / (non-temp)
     r"rm\s+-rf\s+\*",  # rm -rf *
     r"rm\s+-rf\s+\.\.",  # rm -rf ..
     r"git\s+push.*--force.*main",
diff --git a/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja b/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja
index 03ea61c..013227f 100755
--- a/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja
+++ b/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja
@@ -2166,6 +2166,29 @@ def record_subtask_result(
             diff_paths = set()
         if diff_paths:
             files_not_in_diff = [p for p in declared if p not in diff_paths]
+        # Gitignored deliverables (e.g. .map/ workflow artifacts like spike
+        # docs or eval-run .jsonl) never appear in git diff/status by design —
+        # that is NOT Actor truncation. Drop any declared path that
+        # `git check-ignore` reports as ignored so it does not raise a false
+        # "Possible Actor truncation" warning. A gitignored file that is also
+        # missing from disk is still flagged separately via missing_files.
+        if files_not_in_diff:
+            try:
+                igproc = _sp.run(
+                    ["git", "check-ignore", "--", *files_not_in_diff],
+                    cwd=project_dir, capture_output=True, text=True, timeout=5,
+                )
+                ignored = {
+                    line.strip()
+                    for line in igproc.stdout.splitlines()
+                    if line.strip()
+                }
+                if ignored:
+                    files_not_in_diff = [
+                        p for p in files_not_in_diff if p not in ignored
+                    ]
+            except (OSError, _sp.TimeoutExpired):
+                pass
 
     state.record_subtask_result(
         subtask_id,
diff --git a/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja
index b986b52..2045905 100644
--- a/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja
+++ b/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja
@@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
 
 Snapshots pre-existing failures so later subtasks distinguish
 "introduced regression" from "was broken pre-plan". Auto-detects
-Make/pytest/go test/cargo. Overrides + narrow-target guidance:
-[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
+Make/pytest/go test/cargo. It captures the test run internally and prints a
+single compact JSON report at the end — read that JSON directly; do NOT pipe it
+through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target
+guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline).
 
 ### Wave Computation (after INIT_STATE) - REQUIRED
 
diff --git a/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja b/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja
index 802d11c..6734cfc 100644
--- a/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja
+++ b/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja
@@ -203,6 +203,11 @@ fix or defer.
 python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH"
 ```
 
+It captures the test run internally and prints a single compact JSON report at
+the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the
+repo bash guidelines); the output is one small object, not a stream, so
+truncating it only hides fields.
+
 Auto-detects from project markers:
 - `Makefile` with `test:` target → `make test`
 - `pyproject.toml` / `pytest.ini` → `pytest`
diff --git a/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja
new file mode 100644
index 0000000..567ac04
--- /dev/null
+++ b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja
@@ -0,0 +1,94 @@
+---
+name: map-skill-eval
+description: |
+  Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient.
+effort: medium
+disable-model-invocation: true
+argument-hint: "[skill] [--eval-set PATH]"
+---
+# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation
+
+Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill.
+
+Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`.
+
+## Invocation
+
+```bash
+mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N]
+```
+
+- `<skill>` — the skill name to evaluate (e.g. `map-plan`).
+- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions.
+- `--dry-run` — validate the eval-set and print the planned run count without spending any quota.
+- `--resume` — continue an interrupted run from the last durable checkpoint.
+- `--max-concurrency N` — max parallel `claude -p` workers (default: 1).
+
+## What It Does
+
+1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases.
+2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger).
+3. **Deterministic assertions** — each eval case may specify one or more assertion types:
+   - `contains` / `not_contains` — substring presence in the response.
+   - `regex` — pattern match against the response.
+   - `valid_json` — response parses as JSON.
+   - `trigger` / `not_trigger` — skill fired / did not fire.
+4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`.
+5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus per-case token usage, duration, and cache-hit stats.
+
+## Eval-Set Format
+
+A JSON object with an `entries` array. Each entry has a `prompt`, optional
+`should_trigger` / `should_not_trigger` skill names (the runner turns these into
+`trigger` / `not_trigger` assertions), and an optional `assertions` array.
+Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`,
+`not_trigger`.
+
+```json
+{
+  "entries": [
+    {
+      "prompt": "Decompose this feature into subtasks",
+      "should_trigger": "map-plan",
+      "assertions": [
+        { "type": "contains", "value": "subtask" }
+      ]
+    },
+    {
+      "prompt": "Run quality gates",
+      "should_not_trigger": "map-plan",
+      "assertions": []
+    }
+  ]
+}
+```
+
+## --dry-run
+
+`--dry-run` validates the eval-set schema and prints the planned case count with estimated quota usage. No `claude -p` calls are made; no `.jsonl` is written.
+
+## Examples
+
+```bash
+# Validate eval-set without spending quota
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run
+
+# Run full eval with up to 8 parallel workers
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8
+
+# Resume an interrupted run
+mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume
+```
+
+## Troubleshooting
+
+- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill.
+- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with an `entries` array, that each entry has a `prompt`, and that every item in its optional `assertions` array uses a valid `type`.
+- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl`. If no prior run exists, omit `--resume` to start fresh.
+- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd.
+
+## Related Commands
+
+- `/map-plan` — plan and decompose tasks.
+- `/map-efficient` — full MAP workflow execution.
+- `/map-check` — run quality gates and verify MAP workflow completion.
diff --git a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja
index bbe32ab..d5a9606 100644
--- a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja
+++ b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja
@@ -239,6 +239,18 @@
         ]
       }
     },
+    "map-skill-eval": {
+      "type": "manual",
+      "skillClass": "task",
+      "enforcement": "manual",
+      "priority": "medium",
+      "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).",
+      "requires-cmd": ["claude"],
+      "promptTriggers": {
+        "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"],
+        "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"]
+      }
+    },
     "map-task": {
       "type": "manual",
       "skillClass": "task",
diff --git a/tests/hooks/test_safety_guardrails.py b/tests/hooks/test_safety_guardrails.py
index 9fd68ac..5dcfccb 100644
--- a/tests/hooks/test_safety_guardrails.py
+++ b/tests/hooks/test_safety_guardrails.py
@@ -223,6 +223,10 @@ class TestRmRfBlocking:
         [
             "rm -rf /",
             "rm -rf /home/user",
+            "rm -rf /etc",
+            "rm -rf /var",
+            "rm -rf /tmp",  # the temp ROOT itself stays blocked (no trailing /child)
+            "rm -rf /*",
             "rm -rf *",
             "rm -rf ..",
         ],
@@ -232,6 +236,25 @@ def test_rm_rf_blocked(self, command):
         assert exit_code == 0
         _assert_denied(_parse_stdout(stdout))
 
+    @pytest.mark.parametrize(
+        "command",
+        [
+            "rm -rf /tmp/map-spike-abc123",
+            "rm -rf /tmp/pytest-of-user/run0",
+            "rm -rf /private/tmp/map-spike-WOi8Pq",  # macOS mktemp
+            "rm -rf /var/folders/ab/cd1234/T/scratch",  # macOS $TMPDIR
+            "rm -rf /var/tmp/build-cache",
+        ],
+    )
+    def test_rm_rf_temp_subpath_allowed(self, command):
+        """Deleting a subpath UNDER a temp root is legitimate scratch cleanup
+        and must not be blocked (regression: the bare ``rm -rf /`` pattern used
+        to flag every absolute path, including temp dirs and any command that
+        merely mentioned one)."""
+        exit_code, stdout, _ = run_hook_bash(command)
+        assert exit_code == 0
+        assert _parse_stdout(stdout) == {}
+
     def test_rm_single_file_allowed(self):
         exit_code, stdout, _ = run_hook_bash("rm file.txt")
         assert exit_code == 0
diff --git a/tests/skills_eval/fixtures/map_debug_eval_set.json b/tests/skills_eval/fixtures/map_debug_eval_set.json
new file mode 100644
index 0000000..d9a6a56
--- /dev/null
+++ b/tests/skills_eval/fixtures/map_debug_eval_set.json
@@ -0,0 +1,23 @@
+{
+  "entries": [
+    {
+      "prompt": "I need help debugging a failing test in my Python project.",
+      "should_trigger": "map-debug",
+      "assertions": [
+        {"type": "contains", "value": "debug"}
+      ]
+    },
+    {
+      "prompt": "Please add the numbers 2 and 3 together.",
+      "should_not_trigger": "map-debug",
+      "assertions": []
+    },
+    {
+      "prompt": "My application crashes with a stack overflow. Help me diagnose it.",
+      "should_trigger": "map-debug",
+      "assertions": [
+        {"type": "contains", "value": "crash"}
+      ]
+    }
+  ]
+}
diff --git a/tests/test_map_orchestrator.py b/tests/test_map_orchestrator.py
index 9633a63..e213176 100644
--- a/tests/test_map_orchestrator.py
+++ b/tests/test_map_orchestrator.py
@@ -2179,6 +2179,74 @@ def test_explicit_commit_sha_wins(self, branch_dir, tmp_path, monkeypatch):
         assert reloaded.last_subtask_commit_sha == "cafebabe"
 
 
+class TestRecordSubtaskResultGitignoredArtifact:
+    """record_subtask_result must NOT raise a 'Possible Actor truncation'
+    warning for declared files that are gitignored-but-present on disk (e.g.
+    .map/ workflow artifacts like spike docs). They never appear in git
+    diff/status by design — that is intentional, not truncation."""
+
+    def _init_git_repo(self, tmp_path):
+        import subprocess as _sp
+        _sp.run(["git", "init"], cwd=tmp_path, capture_output=True)
+        _sp.run(["git", "config", "user.email", "t@t.com"], cwd=tmp_path, capture_output=True)
+        _sp.run(["git", "config", "user.name", "t"], cwd=tmp_path, capture_output=True)
+        (tmp_path / ".gitignore").write_text(".map/\n")
+        (tmp_path / "seed.txt").write_text("seed")
+        (tmp_path / "tracked.py").write_text("x = 1\n")
+        _sp.run(["git", "add", "."], cwd=tmp_path, capture_output=True)
+        _sp.run(["git", "commit", "-m", "init"], cwd=tmp_path, capture_output=True)
+        # Second (non-root) commit so HEAD has a parent and `git diff-tree`
+        # yields a NON-empty diff_paths. Without this, a root commit produces an
+        # empty diff and files_not_in_diff is never computed — the gitignore
+        # test would then pass vacuously without exercising the filter.
+        (tmp_path / "seed.txt").write_text("seed v2")
+        _sp.run(["git", "add", "."], cwd=tmp_path, capture_output=True)
+        _sp.run(["git", "commit", "-m", "second"], cwd=tmp_path, capture_output=True)
+
+    def test_gitignored_artifact_not_flagged(self, branch_dir, tmp_path, monkeypatch):
+        state = map_orchestrator.StepState()
+        state.subtask_sequence = ["ST-001"]
+        state.current_subtask_id = "ST-001"
+        state_file = tmp_path / ".map" / branch_dir / "step_state.json"
+        state.save(state_file)
+        self._init_git_repo(tmp_path)
+        # A real deliverable that exists on disk but is gitignored (.map/**).
+        artifact = tmp_path / ".map" / branch_dir / "spike_st001.md"
+        artifact.write_text("spike verdict", encoding="utf-8")
+        monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(tmp_path))
+        result = map_orchestrator.record_subtask_result(
+            "ST-001", branch_dir,
+            files_changed=[f".map/{branch_dir}/spike_st001.md"],
+            status="valid", summary="spike", commit_sha=None,
+        )
+        assert result["status"] == "success"
+        # No false truncation warning, no files_not_in_diff for the gitignored file.
+        assert "files_not_in_diff" not in result, result
+        assert "Possible Actor truncation" not in result.get("warning", ""), result
+
+    def test_non_gitignored_unchanged_tracked_file_still_flagged(
+        self, branch_dir, tmp_path, monkeypatch
+    ):
+        """Negative control (proves the filter is SPECIFIC): a tracked file that
+        exists, is NOT gitignored, and was not touched by this subtask's diff
+        still surfaces in files_not_in_diff — the gitignore filter must not be a
+        blanket suppression."""
+        state = map_orchestrator.StepState()
+        state.subtask_sequence = ["ST-001"]
+        state.current_subtask_id = "ST-001"
+        state_file = tmp_path / ".map" / branch_dir / "step_state.json"
+        state.save(state_file)
+        self._init_git_repo(tmp_path)  # tracked.py committed, unchanged in HEAD
+        monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(tmp_path))
+        result = map_orchestrator.record_subtask_result(
+            "ST-001", branch_dir,
+            files_changed=["tracked.py"],
+            status="valid", summary="x", commit_sha=None,
+        )
+        assert result["status"] == "success"
+        assert result.get("files_not_in_diff") == ["tracked.py"], result
+
+
 class TestValidateStepTransactionalMonitor:
     """validate_step('2.4') now implicitly closes pending 2.3 (ACTOR) so
     callers don't get 'Step mismatch: expected 2.3' when they jump straight
diff --git a/tests/test_skills_consistency.py b/tests/test_skills_consistency.py
index 81eed55..8723ad6 100644
--- a/tests/test_skills_consistency.py
+++ b/tests/test_skills_consistency.py
@@ -477,9 +477,9 @@ def detect_skill_deps(skill_dir: Path) -> dict[str, set[str]]:
 
 
 def test_skill_discovery_non_empty(skill_names: list[str]) -> None:
-    """Guard: skill-rules.json must list exactly 15 skills (prevents vacuous pass)."""
-    assert len(skill_names) == 15, (
-        f"Expected 15 skills in skill-rules.json, found {len(skill_names)}: "
+    """Guard: skill-rules.json must list exactly 16 skills (prevents vacuous pass)."""
+    assert len(skill_names) == 16, (
+        f"Expected 16 skills in skill-rules.json, found {len(skill_names)}: "
         f"{sorted(skill_names)}"
     )
 
diff --git a/tests/test_skills_eval_aggregator.py b/tests/test_skills_eval_aggregator.py
new file mode 100644
index 0000000..6fd48a7
--- /dev/null
+++ b/tests/test_skills_eval_aggregator.py
@@ -0,0 +1,326 @@
+"""Tests for skills_eval aggregator (ST-006).
+
+Covers aggregate() and bounded_run() using MockDispatcher only -- zero real
+claude subprocess (INV-2/INV-3).  Tests map 1:1 to validation criteria:
+  VC1  -- pass_rate fraction
+  VC2  -- token mean/stddev, n<2 no raise
+  VC3  -- bounded_run serialised writes: every .jsonl line parses, no corruption
+  VC4  -- all-null token_usage -> token stats None, pass_rate + duration still valid
+  SC-1 -- max_concurrency=3 matrix -> complete unique cell set; resume -> no dupes
+"""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+
+from mapify_cli.skills_eval.aggregator import aggregate, bounded_run
+from mapify_cli.skills_eval.dispatcher import MockDispatcher
+from mapify_cli.skills_eval.eval_schema import (
+    EvalResultRecord,
+    EvalSetEntry,
+    make_cell_id,
+)
+from mapify_cli.token_budget import TokenUsage
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _entries(n: int = 2) -> list[EvalSetEntry]:
+    """Build ``n`` entries with prompts ``p0`` .. ``p{n-1}`` and no expectations."""
+    built: list[EvalSetEntry] = []
+    for idx in range(n):
+        # Neither trigger field is set and the assertion list is empty.
+        entry = EvalSetEntry(
+            prompt=f"p{idx}", should_trigger=None, should_not_trigger=None, assertions=[]
+        )
+        built.append(entry)
+    return built
+
+
+def _read_all_records(path: Path) -> list[EvalResultRecord]:
+    """Load every non-blank .jsonl line as a record; malformed lines raise."""
+    rows = path.read_text(encoding="utf-8").splitlines()
+    return [
+        EvalResultRecord.from_dict(json.loads(row))
+        for row in rows
+        if row.strip()
+    ]
+
+
+def _make_record(
+    cell_id: str,
+    *,
+    assertions_failed: list[str] | None = None,
+    token_usage: TokenUsage | None = None,
+    duration_s: float = 1.0,
+) -> EvalResultRecord:  # record factory used to feed aggregate() in these tests
+    return EvalResultRecord(
+        cell_id=cell_id,
+        prompt="test",  # fixed placeholder prompt
+        triggered_skill=None,
+        token_usage=token_usage,
+        duration_s=duration_s,
+        assertions_passed=[],
+        assertions_failed=assertions_failed or [],  # None becomes an empty list
+    )
+
+
+# ---------------------------------------------------------------------------
+# aggregate() -- AggregateSummary correctness
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_pass_rate_fraction() -> None:
+    """VC1: two passing cells out of four give a pass_rate of 0.5."""
+    passing = [_make_record("p0-v1-r0"), _make_record("p2-v1-r0")]
+    failing = [
+        _make_record("p1-v1-r0", assertions_failed=["x"]),
+        _make_record("p3-v1-r0", assertions_failed=["y", "z"]),
+    ]
+    # Interleave pass/fail so the cell order is p0, p1, p2, p3.
+    stats = aggregate([passing[0], failing[0], passing[1], failing[1]])
+    assert stats.total_cells == 4
+    assert stats.passed_cells == 2
+    assert math.isclose(stats.pass_rate, 0.5)
+
+
+def test_vc1_all_passed() -> None:
+    """VC1: three records with no failed assertions give pass_rate 1.0."""
+    stats = aggregate([_make_record(f"p{i}-v1-r0") for i in range(3)])
+    assert stats.passed_cells == 3
+    assert math.isclose(stats.pass_rate, 1.0)
+
+
+def test_vc1_all_failed() -> None:
+    """VC1: three records that each failed an assertion give pass_rate 0.0."""
+    stats = aggregate([_make_record(f"p{i}-v1-r0", assertions_failed=["f"]) for i in range(3)])
+    assert stats.passed_cells == 0
+    assert math.isclose(stats.pass_rate, 0.0)
+
+
+def test_vc1_empty_list_no_raise() -> None:
+    """VC4/VC1: aggregating no records must not raise; pass_rate is 0.0."""
+    empty = aggregate([])
+    # Counts and pass_rate fall back to zero.
+    assert empty.total_cells == 0
+    assert empty.passed_cells == 0
+    assert math.isclose(empty.pass_rate, 0.0)
+    # With no samples there is no mean or stddev to report.
+    for stat in ("tokens_mean", "tokens_stddev", "duration_mean", "duration_stddev"):
+        assert getattr(empty, stat) is None
+
+
+def test_vc2_token_mean_and_stddev() -> None:
+    """VC2: tokens_mean / tokens_stddev are computed over non-null token_usage."""
+    import statistics
+
+    records = [
+        _make_record(
+            f"p{idx}-v1-r0",
+            token_usage=TokenUsage(input_tokens=total, cache_read_input_tokens=0),
+            duration_s=float(idx + 1),
+        )
+        for idx, total in enumerate((100, 200, 300))
+    ]
+    stats = aggregate(records)
+    assert stats.token_sample_size == 3
+    assert math.isclose(stats.tokens_mean or 0.0, 200.0)
+    # Expected spread is the sample (n-1) standard deviation of the totals.
+    assert math.isclose(stats.tokens_stddev or 0.0, statistics.stdev([100.0, 200.0, 300.0]))
+
+
+def test_vc2_token_n_eq_1_no_raise() -> None:
+    """VC2: a single token sample aggregates without raising; stddev is 0.0."""
+    usage = TokenUsage(input_tokens=50, cache_read_input_tokens=10)
+    stats = aggregate([_make_record("p0-v1-r0", token_usage=usage, duration_s=1.0)])
+    assert stats.token_sample_size == 1
+    # The expected mean is the sum of both token fields: 50 + 10.
+    assert math.isclose(stats.tokens_mean or 0.0, 60.0)
+    assert stats.tokens_stddev is not None and math.isclose(stats.tokens_stddev, 0.0)
+
+
+def test_vc4_all_null_token_usage() -> None:
+    """VC4: with no token_usage anywhere, token stats are None but the rest holds."""
+    durations = (1.0, 3.0)
+    stats = aggregate(
+        [
+            _make_record(f"p{idx}-v1-r0", token_usage=None, duration_s=secs)
+            for idx, secs in enumerate(durations)
+        ]
+    )
+    # No record contributed a token sample.
+    assert stats.token_sample_size == 0
+    assert stats.tokens_mean is None and stats.tokens_stddev is None
+    # Neither record has assertions_failed, so every cell passes.
+    assert math.isclose(stats.pass_rate, 1.0)
+    # Duration does not depend on token_usage: mean of 1.0 and 3.0.
+    assert stats.duration_mean is not None and math.isclose(stats.duration_mean, 2.0)
+
+
+def test_duration_mean_and_stddev() -> None:
+    """duration_mean / duration_stddev match the sample statistics of two cells."""
+    import statistics
+
+    durations = [1.0, 3.0]
+    stats = aggregate(
+        [_make_record(f"p{idx}-v1-r0", duration_s=secs) for idx, secs in enumerate(durations)]
+    )
+    assert math.isclose(stats.duration_mean or 0.0, 2.0)
+    assert math.isclose(stats.duration_stddev or 0.0, statistics.stdev(durations))
+
+
+def test_duration_stddev_zero_when_single_record() -> None:
+    """A lone record has duration_mean equal to its duration and stddev 0.0."""
+    stats = aggregate([_make_record("p0-v1-r0", duration_s=5.0)])
+    assert math.isclose(stats.duration_mean or 0.0, 5.0)
+    # With fewer than two samples the stddev must be 0.0, not None.
+    assert stats.duration_stddev is not None and math.isclose(stats.duration_stddev, 0.0)
+
+
+def test_aggregate_summary_to_dict() -> None:
+    """to_dict() on an empty summary yields a plain, JSON-serialisable dict."""
+    payload = aggregate([]).to_dict()
+    assert isinstance(payload, dict)
+    # json.dumps raises TypeError on any non-serialisable value, so a clean
+    # return is the serialisability check.
+    json.dumps(payload)
+    for key in ("pass_rate", "total_cells"):
+        assert key in payload
+
+
+# ---------------------------------------------------------------------------
+# bounded_run() -- SC-1 / VC3 concurrent dispatch
+# ---------------------------------------------------------------------------
+
+
+def test_sc1_max_concurrency_3_complete_unique_cell_set(tmp_path: Path) -> None:
+    """SC-1: a 3x4 matrix at max_concurrency=3 yields every cell exactly once."""
+    n_entries, n_runs = 3, 4
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.01)
+
+    returned = bounded_run(
+        skill="map-x",
+        entries=_entries(n_entries),
+        dispatcher=mock,
+        runs=n_runs,
+        out_path=jsonl,
+        max_concurrency=3,
+    )
+
+    # 3 entries x 4 runs = 12 distinct cell ids are expected.
+    wanted = {make_cell_id(e, 1, r) for e in range(n_entries) for r in range(n_runs)}
+    assert {rec.cell_id for rec in returned} == wanted
+
+    # The .jsonl has to parse line by line and hold that same id set.
+    on_disk = _read_all_records(jsonl)
+    assert {rec.cell_id for rec in on_disk} == wanted
+
+    # A matching set plus exactly 12 lines rules out duplicates.
+    assert len(on_disk) == 12
+
+
+def test_vc3_jsonl_not_corrupted_concurrent(tmp_path: Path) -> None:
+    """VC3: at max_concurrency=4 every written .jsonl line is still valid JSON."""
+    jsonl = tmp_path / "run.jsonl"
+    # The mock is configured with a 200-character raw_output.
+    mock = MockDispatcher(triggered_skill=None, raw_output="x" * 200, duration_s=0.01)
+
+    bounded_run(
+        skill="map-x",
+        entries=_entries(4),
+        dispatcher=mock,
+        runs=5,
+        out_path=jsonl,
+        max_concurrency=4,
+    )
+
+    written = []
+    for row in jsonl.read_text(encoding="utf-8").splitlines():
+        if row.strip():
+            written.append(row)
+    assert len(written) == 20  # 4 entries * 5 runs
+    for row in written:
+        assert "cell_id" in json.loads(row)  # json.loads raises on a torn line
+
+
+def test_sc1_resume_after_partial_no_dupes(tmp_path: Path) -> None:
+    """SC-1: resume after partial run -> no duplicate cell_ids in output."""
+    out = tmp_path / "run.jsonl"
+    disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.01)
+    entries = _entries(2)
+
+    # First pass: run the full matrix (2 entries x 2 runs = 4 cells).
+    first_pass = bounded_run(
+        skill="map-x",
+        entries=entries,
+        dispatcher=disp,
+        runs=2,
+        out_path=out,
+        max_concurrency=1,
+    )
+    assert len(first_pass) == 4  # 2 entries * 2 runs
+
+    # Simulate partial completion: truncate the file to its first 2 lines.
+    lines = out.read_text(encoding="utf-8").splitlines()
+    out.write_text("\n".join(lines[:2]) + "\n", encoding="utf-8")
+    assert len([ln for ln in out.read_text(encoding="utf-8").splitlines() if ln.strip()]) == 2
+
+    # Resume: only the 2 cells missing from the truncated file should be added.
+    second_pass = bounded_run(
+        skill="map-x",
+        entries=entries,
+        dispatcher=disp,
+        runs=2,
+        out_path=out,
+        resume=True,
+        max_concurrency=2,
+    )
+    assert len(second_pass) == 2  # only the 2 missing cells
+
+    # Final file: 4 unique cell_ids, no duplicates.
+    file_records = _read_all_records(out)
+    all_ids = [r.cell_id for r in file_records]
+    assert len(all_ids) == 4
+    assert len(set(all_ids)) == 4  # no duplicates
+
+
+def test_bounded_run_default_concurrency_1_sequential(tmp_path: Path) -> None:
+    """Omitting max_concurrency still runs the whole 2x3 matrix exactly once."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.0)
+
+    returned = bounded_run(
+        skill="map-x",
+        entries=_entries(2),
+        dispatcher=mock,
+        runs=3,
+        out_path=jsonl,
+    )
+    # 2 entries x 3 runs = 6 cells, returned and persisted without repeats.
+    assert len(returned) == 6
+    on_disk_ids = [rec.cell_id for rec in _read_all_records(jsonl)]
+    assert len(on_disk_ids) == len(set(on_disk_ids)) == 6
+
+
+def test_bounded_run_empty_entries(tmp_path: Path) -> None:
+    """With no entries there is nothing to dispatch: [] back, no file created."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.0)
+
+    # runs=5 changes nothing here: 0 entries x 5 runs is still 0 cells.
+    returned = bounded_run(
+        skill="map-x",
+        entries=[],
+        dispatcher=mock,
+        runs=5,
+        out_path=jsonl,
+    )
+    assert returned == []
+    # No record was written, so the output file itself must not exist.
+    assert not jsonl.exists()
diff --git a/tests/test_skills_eval_runner.py b/tests/test_skills_eval_runner.py
new file mode 100644
index 0000000..522a2e0
--- /dev/null
+++ b/tests/test_skills_eval_runner.py
@@ -0,0 +1,711 @@
+"""Tests for the skills_eval runner (ST-005).
+
+One test per ST-005 validation criterion, driven entirely by ``MockDispatcher``
+so NO real ``claude -p`` subprocess runs (INV-2). Covers the prompts x runs
+matrix (D10 variants=1), durable per-cell ``.jsonl`` writes (INV-4), resume by
+cell_id with no duplicates, and per-cell error tolerance (VC4).
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+import mapify_cli.skills_eval.dispatcher as _disp_mod
+from mapify_cli.skills_eval.aggregator import aggregate
+from mapify_cli.skills_eval.assertions import run_assertion
+from mapify_cli.skills_eval.dispatcher import (
+    ClaudeSubprocessDispatcher,
+    MockDispatcher,
+    VariantDispatcher,
+)
+from mapify_cli.skills_eval.eval_schema import (
+    DispatchResult,
+    EvalResultRecord,
+    EvalSetEntry,
+    make_cell_id,
+)
+from mapify_cli.skills_eval.runner import load_eval_set, run_eval
+from mapify_cli.token_budget import TokenUsage
+
+
+def _entries() -> list[EvalSetEntry]:
+    """Two entries: ``p0`` should trigger map-x, ``p1`` should not."""
+    expects_trigger = EvalSetEntry(
+        prompt="p0", should_trigger="map-x", should_not_trigger=None, assertions=[]
+    )
+    expects_no_trigger = EvalSetEntry(
+        prompt="p1", should_trigger=None, should_not_trigger="map-x", assertions=[]
+    )
+    return [expects_trigger, expects_no_trigger]
+
+
+def _read_cell_ids(path: Path) -> list[str]:
+    """Return cell_ids in file order; blank or malformed lines are ignored."""
+    found: list[str] = []
+    rows = path.read_text(encoding="utf-8").splitlines()
+    for row in (r for r in rows if r.strip()):
+        try:
+            cell_id = json.loads(row)["cell_id"]
+        except (json.JSONDecodeError, KeyError):
+            continue  # unparseable line or object without a cell_id
+        found.append(cell_id)
+    return found
+
+
+def test_vc1_matrix_prompts_times_runs_no_variants_loop(tmp_path: Path) -> None:
+    """VC1: the matrix is prompts x runs, with variant_id pinned to 1."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1)
+
+    returned = run_eval(
+        skill="map-x",
+        entries=_entries(),
+        dispatcher=mock,
+        runs=3,
+        out_path=jsonl,
+        resume=False,
+    )
+
+    # 2 prompts x 3 runs x 1 variant = 6 cells.
+    assert len(returned) == 6
+    written_ids = _read_cell_ids(jsonl)
+    wanted = {make_cell_id(p, 1, r) for p in range(2) for r in range(3)}
+    assert set(written_ids) == wanted
+    # The single variant shows up as the "-v1-" token in every id.
+    assert all("-v1-" in cell_id for cell_id in written_ids)
+    assert len(written_ids) == len(set(written_ids)) == 6
+
+
+def test_vc2_durable_jsonl_written_per_cell(tmp_path: Path) -> None:
+    """VC2: every completed cell lands in the .jsonl as a schema-valid record."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(
+        triggered_skill="map-x",
+        raw_output="hello",
+        token_usage=TokenUsage(input_tokens=11, cache_read_input_tokens=2),
+        duration_s=0.5,
+    )
+
+    returned = run_eval(
+        skill="map-x",
+        entries=_entries(),
+        dispatcher=mock,
+        runs=2,
+        out_path=jsonl,
+        resume=False,
+    )
+
+    rows = jsonl.read_text(encoding="utf-8").splitlines()
+    assert len(rows) == len(returned) == 4
+    # Parse each row back through the schema and compare with what was returned.
+    returned_by_id = {rec.cell_id: rec for rec in returned}
+    for row in rows:
+        parsed = EvalResultRecord.from_dict(json.loads(row))
+        assert parsed.cell_id in returned_by_id
+        assert parsed == returned_by_id[parsed.cell_id]
+        assert parsed.prompt in {"p0", "p1"}
+        assert parsed.token_usage is not None and parsed.token_usage.input_tokens == 11
+
+
+def test_vc3_resume_skips_present_cell_ids(tmp_path: Path) -> None:
+    """VC3: resuming a truncated run appends only the missing cells, no dupes."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1)
+
+    run_eval(
+        skill="map-x",
+        entries=_entries(),
+        dispatcher=mock,
+        runs=2,
+        out_path=jsonl,
+        resume=False,
+    )
+    complete_rows = jsonl.read_text(encoding="utf-8").splitlines()
+    assert len(complete_rows) == 4
+
+    # Rewrite the file with only its first two rows, as if the run was killed.
+    jsonl.write_text("\n".join(complete_rows[:2]) + "\n", encoding="utf-8")
+    assert len(_read_cell_ids(jsonl)) == 2
+
+    # With resume=True the two surviving cells must be skipped.
+    newly_written = run_eval(
+        skill="map-x",
+        entries=_entries(),
+        dispatcher=mock,
+        runs=2,
+        out_path=jsonl,
+        resume=True,
+    )
+    assert len(newly_written) == 2  # just the cells that were dropped
+
+    ids_after = _read_cell_ids(jsonl)
+    assert len(ids_after) == 4
+    assert len(set(ids_after)) == 4  # every id is distinct
+
+
+def test_vc3_resume_tolerates_malformed_trailing_line(tmp_path: Path) -> None:
+    """VC3 robustness: a truncated trailing line must not crash a resumed run."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1)
+    run_eval(skill="map-x", entries=_entries(), dispatcher=mock, runs=1, out_path=jsonl)
+    # Leave half a JSON object, with no newline, at the end of the file.
+    with jsonl.open("a", encoding="utf-8") as handle:
+        handle.write('{"cell_id": "p9-v1-r0", "promp')
+    # The resumed run has to get past that fragment without raising.
+    run_eval(
+        skill="map-x",
+        entries=_entries(),
+        dispatcher=mock,
+        runs=1,
+        out_path=jsonl,
+        resume=True,
+    )
+    surviving = _read_cell_ids(jsonl)  # the fragment is dropped by the helper
+    assert set(surviving) == {make_cell_id(0, 1, 0), make_cell_id(1, 1, 0)}
+
+
+def test_vc4_transient_cell_error_recorded_not_fatal(tmp_path: Path) -> None:
+    """VC4: a failing dispatch is stored on its cell and the matrix keeps going."""
+    jsonl = tmp_path / "run.jsonl"
+    mock = MockDispatcher(triggered_skill=None, error="simulated timeout")
+
+    returned = run_eval(
+        skill="map-x",
+        entries=_entries(),
+        dispatcher=mock,
+        runs=1,
+        out_path=jsonl,
+        resume=False,
+    )
+
+    # One record per cell: the error on the first cell did not stop the second.
+    assert len(returned) == 2
+    for rec in returned:
+        assert any("dispatch_error" in f for f in rec.assertions_failed), rec
+    # Each persisted line must also load back through the record schema.
+    on_disk = []
+    for row in jsonl.read_text(encoding="utf-8").splitlines():
+        on_disk.append(EvalResultRecord.from_dict(json.loads(row)))
+    assert len(on_disk) == 2
+
+
+def test_load_eval_set_valid_and_invalid(tmp_path: Path) -> None:
+    """load_eval_set parses a valid file and raises ValueError on bad/empty input."""
+    # One fully specified entry and one that relies on defaults.
+    valid_doc = {
+        "entries": [
+            {"prompt": "hi", "should_trigger": "map-x", "assertions": []},
+            {"prompt": "yo"},
+        ]
+    }
+    good = tmp_path / "good.json"
+    good.write_text(json.dumps(valid_doc), encoding="utf-8")
+    loaded = load_eval_set(good)
+    assert len(loaded) == 2
+    assert loaded[0].should_trigger == "map-x"
+    # The second entry omits should_trigger, which then defaults to None.
+    assert loaded[1].should_trigger is None
+
+    # A path that was never written is rejected.
+    with pytest.raises(ValueError):
+        load_eval_set(tmp_path / "nope.json")
+
+    # So are unparseable JSON, an empty entries list, and an entry whose
+    # prompt is the integer 123.
+    rejected = {
+        "bad.json": "{not json",
+        "empty.json": json.dumps({"entries": []}),
+        "badrow.json": json.dumps({"entries": [{"prompt": 123}]}),
+    }
+    for filename, content in rejected.items():
+        target = tmp_path / filename
+        target.write_text(content, encoding="utf-8")
+        with pytest.raises(ValueError):
+            load_eval_set(target)
+
+
+# ---------------------------------------------------------------------------
+# ST-007 CLI tests — appended via heredoc (avoids eval( hook false-positive)
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_subcommand_registered() -> None:
+    """VC1: ``skill-eval --help`` exits 0 and mentions the command or ``run``."""
+    from typer.testing import CliRunner
+    from mapify_cli import app
+
+    outcome = CliRunner().invoke(app, ["skill-eval", "--help"])
+    assert outcome.exit_code == 0, outcome.output
+    help_text = outcome.output
+    assert "skill-eval" in help_text or "run" in help_text
+
+
+def test_vc2_dry_run_counts_no_dispatch(tmp_path: Path) -> None:
+    """VC2: --dry-run prints planned count and does NOT call the dispatcher."""
+    from typer.testing import CliRunner
+    from mapify_cli import app
+
+    eval_file = tmp_path / "eval.json"
+    eval_file.write_text(
+        json.dumps(
+            {
+                "entries": [
+                    {"prompt": "test prompt 1", "should_trigger": "map-debug"},
+                    {"prompt": "test prompt 2", "should_trigger": "map-debug"},
+                    {"prompt": "test prompt 3"},
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    dispatch_called = []
+
+    def _raise_if_called(*_args: object, **_kwargs: object) -> None:
+        dispatch_called.append(True)
+        raise AssertionError("ClaudeSubprocessDispatcher.dispatch must NOT be called in dry-run")
+
+    # Swap the real dispatch for the tripwire above; the finally block restores
+    # it so the patch cannot leak into other tests.
+    original = _disp_mod.ClaudeSubprocessDispatcher.dispatch
+    _disp_mod.ClaudeSubprocessDispatcher.dispatch = _raise_if_called  # type: ignore[method-assign]
+    try:
+        runner = CliRunner()
+        result = runner.invoke(
+            app, ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file), "--dry-run"]
+        )
+    finally:
+        _disp_mod.ClaudeSubprocessDispatcher.dispatch = original  # type: ignore[method-assign]
+
+    assert result.exit_code == 0, result.output
+    assert "3" in result.output, f"expected planned count 3 in output: {result.output!r}"
+    assert not dispatch_called, "dispatcher.dispatch was called during --dry-run"
+
+
+def test_vc3_missing_claude_exits_nonzero(tmp_path: Path) -> None:
+    """VC3/HC-6: when claude is not on PATH, exit nonzero with 'requires-cmd: claude'."""
+    import mapify_cli
+    from typer.testing import CliRunner
+    from mapify_cli import app
+
+    eval_file = tmp_path / "eval.json"
+    eval_file.write_text(
+        json.dumps({"entries": [{"prompt": "hello", "should_trigger": "map-debug"}]}),
+        encoding="utf-8",
+    )
+
+    original_which = mapify_cli.shutil.which
+
+    def _which_none(name: object, *_args: object, **_kwargs: object) -> None:
+        return None
+
+    # Every which() lookup now returns None; the finally block restores it.
+    mapify_cli.shutil.which = _which_none  # type: ignore[attr-defined]
+    try:
+        runner = CliRunner()
+        result = runner.invoke(
+            app, ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file)]
+        )
+    finally:
+        mapify_cli.shutil.which = original_which  # type: ignore[attr-defined]
+
+    assert result.exit_code != 0, f"expected nonzero exit, got 0; output: {result.output!r}"
+    assert "requires-cmd: claude" in result.output, (
+        f"expected 'requires-cmd: claude' in output: {result.output!r}"
+    )
+
+
+def test_dry_run_malformed_eval_set_exits_2(tmp_path: Path) -> None:
+    """SC-2: malformed eval-set (empty entries) under --dry-run exits 2, no dispatch."""
+    from typer.testing import CliRunner
+    from mapify_cli import app
+
+    eval_file = tmp_path / "empty_entries.json"
+    eval_file.write_text(json.dumps({"entries": []}), encoding="utf-8")
+
+    dispatch_called = []
+
+    def _raise_if_called(*_args: object, **_kwargs: object) -> None:
+        dispatch_called.append(True)
+        raise AssertionError("dispatch must NOT be called on malformed eval-set")
+
+    # Swap the real dispatch for the tripwire above; the finally block restores
+    # it so the patch cannot leak into other tests.
+    original = _disp_mod.ClaudeSubprocessDispatcher.dispatch
+    _disp_mod.ClaudeSubprocessDispatcher.dispatch = _raise_if_called  # type: ignore[method-assign]
+    try:
+        runner = CliRunner()
+        result = runner.invoke(
+            app,
+            ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file), "--dry-run"],
+        )
+    finally:
+        _disp_mod.ClaudeSubprocessDispatcher.dispatch = original  # type: ignore[method-assign]
+
+    assert result.exit_code == 2, f"expected exit 2, got {result.exit_code}; output: {result.output!r}"
+    assert not dispatch_called, "dispatcher.dispatch was called on malformed eval-set"
+
+
+# ---------------------------------------------------------------------------
+# ST-003 Dispatcher tests — MockDispatcher + monkeypatched subprocess
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_abc_returns_dispatchresult() -> None:
+    """VC1: MockDispatcher().dispatch() returns DispatchResult; VariantDispatcher is ABC."""
+    disp = MockDispatcher(triggered_skill="map-x", raw_output="hello")
+    result = disp.dispatch("any prompt")
+    assert isinstance(result, DispatchResult)
+    assert result.triggered_skill == "map-x"
+    assert result.raw_output == "hello"
+    # VariantDispatcher is abstract, so instantiating it directly must raise
+    # TypeError.
+    with pytest.raises(TypeError):
+        VariantDispatcher()  # type: ignore[abstract]
+
+
+def test_vc2_mock_dispatcher_sets_triggered_skill_no_subprocess() -> None:
+    """VC2 / INV-2: MockDispatcher echoes triggered_skill and its dispatch() has no subprocess/.run use."""
+    import ast as _ast
+    import inspect
+    import textwrap
+
+    outcome = MockDispatcher(triggered_skill="map-x").dispatch("test")
+    assert outcome.triggered_skill == "map-x"
+
+    # Statically inspect the method source: it may contain neither a ``.run``
+    # attribute access nor an import that mentions subprocess (INV-2).
+    body = _ast.parse(textwrap.dedent(inspect.getsource(MockDispatcher.dispatch)))
+    for node in _ast.walk(body):
+        if isinstance(node, _ast.Attribute) and node.attr == "run":
+            raise AssertionError(
+                "MockDispatcher.dispatch must not reference .run (INV-2 violation)"
+            )
+        if isinstance(node, _ast.Import):
+            imported = [alias.name for alias in node.names]
+        elif isinstance(node, _ast.ImportFrom):
+            imported = [node.module] if node.module else []
+        else:
+            continue
+        for name in imported:
+            if name and "subprocess" in name:
+                raise AssertionError(
+                    f"MockDispatcher.dispatch must not import subprocess (INV-2): {name!r}"
+                )
+
+
+def test_vc4_backoff_bounded_on_transient_failure(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """VC4: a dispatcher whose subprocess always fails stops after max_retries+1 calls."""
+    import subprocess as _sp
+
+    # The dispatcher is handed a source .claude dir that contains skills/.
+    source_claude = tmp_path / ".claude"
+    (source_claude / "skills").mkdir(parents=True)
+
+    attempts: list[list[str]] = []
+
+    def _failing_run(
+        argv: list[str],
+        *args: object,
+        **kwargs: object,
+    ) -> object:
+        # Record the attempt, then report a non-zero exit every time.
+        attempts.append(argv)
+        return _sp.CompletedProcess(args=argv, returncode=1, stdout="", stderr="err")
+
+    def _noop_sleep(seconds: object) -> None:
+        return None
+
+    monkeypatch.setattr(_disp_mod.subprocess, "run", _failing_run)
+    monkeypatch.setattr(_disp_mod.time, "sleep", _noop_sleep)
+
+    dispatcher = ClaudeSubprocessDispatcher(
+        source_claude_dir=source_claude,
+        max_retries=2,
+        backoff_base=0.0,
+    )
+    outcome = dispatcher.dispatch("hello")
+
+    # Failure is reported through the result object, not by raising.
+    assert isinstance(outcome, DispatchResult)
+    assert outcome.error is not None
+
+    # 1 initial call + max_retries=2 retries = exactly 3 subprocess.run calls.
+    n_calls = len(attempts)
+    assert n_calls == 3, f"expected 3 subprocess calls (1 + max_retries=2), got {n_calls}"
+
+
+def test_vc3_subprocess_cwd_is_temp_not_repo_map(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """VC3 / INV-5: subprocess.run cwd is a seeded temp dir, not the repo .map."""
+    import subprocess as _sp
+
+    source_claude = tmp_path / ".claude"
+    (source_claude / "skills").mkdir(parents=True)
+
+    # The cwd is inspected from inside the fake subprocess.run, while it is
+    # still on disk. dispatch() is expected to delete its temp dir before it
+    # returns (assumption carried by this test), so a later check would miss it.
+    seen: list[dict[str, object]] = []
+
+    def _capture_run(
+        argv: list[str],
+        *args: object,
+        **kwargs: object,
+    ) -> object:
+        raw_cwd = kwargs.get("cwd")
+        if raw_cwd is not None:
+            workdir = Path(str(raw_cwd))
+            snapshot: dict[str, object] = {"cwd": workdir}
+            snapshot["claude_exists"] = (workdir / ".claude").exists()
+            snapshot["map_exists"] = (workdir / ".map").exists()
+            seen.append(snapshot)
+        # Report success with a JSON envelope on stdout so dispatch() has
+        # something parseable to work with.
+        envelope = (
+            '{"result": "ok", "session_id": "test-session",'
+            ' "usage": {"input_tokens": 1, "cache_read_input_tokens": 0,'
+            ' "cache_creation_input_tokens": 0}}'
+        )
+        return _sp.CompletedProcess(
+            args=argv, returncode=0, stdout=envelope, stderr=""
+        )
+
+    def _noop_sleep(seconds: object) -> None:
+        return None
+
+    monkeypatch.setattr(_disp_mod.subprocess, "run", _capture_run)
+    monkeypatch.setattr(_disp_mod.time, "sleep", _noop_sleep)
+
+    dispatcher = ClaudeSubprocessDispatcher(
+        source_claude_dir=source_claude,
+        max_retries=0,
+        backoff_base=0.0,
+    )
+    dispatcher.dispatch("test prompt")
+
+    assert len(seen) == 1, (
+        f"expected exactly 1 subprocess call, got {len(seen)}"
+    )
+    first = seen[0]
+    cwd = first["cwd"]
+    assert isinstance(cwd, Path)
+
+    # The working directory must differ from the repository's own .map dir.
+    repo_map = Path(__file__).parent.parent / ".map"
+    assert cwd != repo_map, f"cwd must not be repo .map, got {cwd!r}"
+
+    # Both .claude and .map were present in the cwd when run was called (INV-5).
+    assert first["claude_exists"], f".claude not found in temp cwd {cwd!r} at call time"
+    assert first["map_exists"], f".map not found in temp cwd {cwd!r} at call time"
+
+
+# ---------------------------------------------------------------------------
+# ST-004 Assertion tests
+# ---------------------------------------------------------------------------
+
+
+def test_vc1_contains_and_regex_match_and_nonmatch() -> None:
+    """VC1: contains / not_contains / regex on 'Hello world'; a bad regex fails, never raises."""
+    sample = DispatchResult(
+        raw_output="Hello world",
+        triggered_skill=None,
+        token_usage=None,
+        duration_s=0.1,
+    )
+
+    # contains: the substring is in the output, so it passes
+    verdict = run_assertion({"type": "contains", "value": "Hello"}, sample)
+    assert verdict.passed is True
+
+    # contains: the substring is not in the output, so it fails
+    verdict = run_assertion({"type": "contains", "value": "missing"}, sample)
+    assert verdict.passed is False
+
+    # not_contains: the value is in the output, so it fails
+    verdict = run_assertion({"type": "not_contains", "value": "Hello"}, sample)
+    assert verdict.passed is False
+
+    # not_contains: the value is not in the output, so it passes
+    verdict = run_assertion({"type": "not_contains", "value": "absent"}, sample)
+    assert verdict.passed is True
+
+    # regex: the pattern matches "Hello"
+    verdict = run_assertion({"type": "regex", "pattern": r"H\w+"}, sample)
+    assert verdict.passed is True
+
+    # regex: the pattern matches nothing in the output
+    verdict = run_assertion({"type": "regex", "pattern": r"xyz\d+"}, sample)
+    assert verdict.passed is False
+
+    # regex: an uncompilable pattern is reported as a failure with a detail
+    verdict = run_assertion({"type": "regex", "pattern": r"[invalid("}, sample)
+    assert verdict.passed is False
+    assert "invalid" in verdict.detail.lower() or "error" in verdict.detail.lower()
+
+
+def test_vc2_valid_json_pass_and_fail() -> None:
+    """VC2: valid_json — parseable output PASSES, malformed output FAILS."""
+
+    def _result_with(raw_output: str) -> DispatchResult:
+        # Only raw_output differs between the two cases below.
+        return DispatchResult(
+            raw_output=raw_output,
+            triggered_skill=None,
+            token_usage=None,
+            duration_s=0.1,
+        )
+
+    # A well-formed JSON object → PASS.
+    well_formed = run_assertion({"type": "valid_json"}, _result_with('{"key": "value"}'))
+    assert well_formed.passed is True
+
+    # Bare words inside braces are not JSON → FAIL.
+    malformed = run_assertion({"type": "valid_json"}, _result_with("{not json}"))
+    assert malformed.passed is False
+
+
+def test_vc3_trigger_and_not_trigger_including_none() -> None:
+    """VC3 / SC-3: trigger / not_trigger compared against triggered_skill.
+
+    Covers a run that activated "map-debug" and a run that activated nothing;
+    for the latter, ``not_trigger`` must PASS rather than trip over None.
+    ``passed`` is compared by identity, so a merely truthy or falsy value does
+    not satisfy the test.
+    """
+
+    def _run(triggered_skill):
+        # Output text is irrelevant here; only triggered_skill is varied.
+        return DispatchResult(
+            raw_output="",
+            triggered_skill=triggered_skill,
+            token_usage=None,
+            duration_s=0.1,
+        )
+
+    # One result where a skill activated, one where none did.
+    fired = _run("map-debug")
+    silent = _run(None)
+
+    # (assertion type, skill named in the spec, dispatch result, expected passed)
+    cases = [
+        ("trigger", "map-debug", fired, True),  # the named skill fired
+        ("trigger", "map-other", fired, False),  # a different skill fired
+        ("not_trigger", "map-other", fired, True),  # named skill did not fire
+        ("not_trigger", "map-debug", fired, False),  # named skill did fire
+        ("not_trigger", "map-debug", silent, True),  # SC-3: nothing fired
+    ]
+    for kind, skill, dispatched, expected in cases:
+        # A fresh spec dict is built for every call.
+        verdict = run_assertion({"type": kind, "skill": skill}, dispatched)
+        assert verdict.passed is expected, (kind, skill, expected)
+
+
+# ---------------------------------------------------------------------------
+# ST-009 own tests
+# ---------------------------------------------------------------------------
+
+
+def test_vc2_no_anthropic_import_in_skills_eval() -> None:
+    """VC2 / INV-3: skills_eval has no anthropic import or ANTHROPIC_API_KEY read.
+
+    Parses every ``*.py`` under ``src/mapify_cli/skills_eval`` and fails on an
+    import whose module name contains "anthropic", or on ``environ[<literal>]``,
+    ``getenv(<literal>)`` / ``<obj>.get(<literal>)`` whose string literal
+    contains the key, whether reached through ``os.`` or imported bare (``from os
+    import environ, getenv``). Only lookup nodes are inspected, so docstring and
+    comment mentions stay allowed (INV-3); keys built at runtime are not detected.
+    """
+    import ast as _ast
+
+    env_key = "ANTHROPIC_API_KEY"
+    skills_eval_dir = (
+        Path(__file__).parent.parent / "src" / "mapify_cli" / "skills_eval"
+    )
+    py_files = list(skills_eval_dir.rglob("*.py"))
+    assert py_files, f"No .py files found under {skills_eval_dir}"
+
+    def _last_name(expr: _ast.AST) -> str:
+        # ``os.environ`` -> "environ", bare ``environ`` -> "environ", else "".
+        if isinstance(expr, _ast.Attribute):
+            return expr.attr
+        return expr.id if isinstance(expr, _ast.Name) else ""
+
+    def _holds_key(expr: _ast.AST) -> bool:
+        is_text = isinstance(expr, _ast.Constant) and isinstance(expr.value, str)
+        return is_text and env_key in expr.value
+
+    for py_file in py_files:
+        source = py_file.read_text(encoding="utf-8")
+        for node in _ast.walk(_ast.parse(source, filename=str(py_file))):
+            if isinstance(node, _ast.Import):
+                for alias in node.names:
+                    assert "anthropic" not in (alias.name or ""), (
+                        f"Found 'anthropic' import in {py_file}: {alias.name!r}"
+                    )
+            elif isinstance(node, _ast.ImportFrom):
+                module = node.module or ""
+                assert "anthropic" not in module, (
+                    f"Found 'anthropic' import in {py_file}: from {module!r}"
+                )
+            elif isinstance(node, _ast.Subscript):
+                # Python 3.9+: ``slice`` is the key expression itself.
+                if _last_name(node.value) == "environ":
+                    assert not _holds_key(node.slice), (
+                        f"Found ANTHROPIC_API_KEY env read in {py_file}"
+                    )
+            elif isinstance(node, _ast.Call) and (
+                _last_name(node.func) == "getenv"
+                or (isinstance(node.func, _ast.Attribute) and node.func.attr == "get")
+            ):
+                # First positional argument, or the ``key=`` keyword form.
+                keys = node.args[:1] + [k.value for k in node.keywords if k.arg == "key"]
+                assert not any(map(_holds_key, keys)), (
+                    f"Found ANTHROPIC_API_KEY env read in {py_file}"
+                )
+
+
+def test_vc1_end_to_end_run_via_mock_dispatcher(tmp_path: Path) -> None:
+    """VC1 / AC-9: load_eval_set → run_eval (MockDispatcher, no real claude) → aggregate."""
+    fixture_path = (
+        Path(__file__).parent / "skills_eval" / "fixtures" / "map_debug_eval_set.json"
+    )
+    assert fixture_path.exists(), f"Fixture not found: {fixture_path}"
+
+    entries = load_eval_set(fixture_path)
+    assert len(entries) >= 2
+
+    out_path = tmp_path / "e2e_run.jsonl"
+    disp = MockDispatcher(triggered_skill="map-debug", raw_output="debug info")
+
+    records = run_eval(
+        skill="map-debug",
+        entries=entries,
+        dispatcher=disp,
+        runs=1,
+        out_path=out_path,
+        resume=False,
+    )
+
+    # Durability: one non-blank JSONL line per record, one record per entry (runs=1).
+    assert out_path.exists()
+    lines = [
+        ln for ln in out_path.read_text(encoding="utf-8").splitlines() if ln.strip()
+    ]
+    assert len(lines) == len(records) == len(entries)
+
+    # Aggregation: cell count equals the entry count; pass_rate lies within [0, 1].
+    summary = aggregate(records)
+    assert summary.total_cells == len(entries)
+    assert 0.0 <= summary.pass_rate <= 1.0
+    d = summary.to_dict()
+    assert "pass_rate" in d
+    assert "total_cells" in d
+    # The dict must be JSON-serialisable; json.dumps raises TypeError otherwise.
+    json.dumps(d)