diff --git a/.claude/hooks/safety-guardrails.py b/.claude/hooks/safety-guardrails.py
index 04fd888..48e671c 100755
--- a/.claude/hooks/safety-guardrails.py
+++ b/.claude/hooks/safety-guardrails.py
@@ -38,7 +38,12 @@
 
 # Dangerous bash command patterns
 _DEFAULT_DANGEROUS_COMMANDS = [
-    r"rm\s+-rf\s+/",  # rm -rf /
+    # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc.,
+    # but ALLOW deleting a child UNDER a temp root (/tmp/X, /private/tmp/X,
+    # /var/folders/X, /var/tmp/X) — legitimate scratch cleanup. A real child name
+    # must follow the root's slash and no `..` segment may appear, so `/tmp`,
+    # `/tmp/` and `/tmp/../etc` stay blocked. Only the first path is inspected.
+    r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/(?!(?:\S*/)?\.\.(?:[/\s]|$))[^\s/])",  # rm -rf / (non-temp)
     r"rm\s+-rf\s+\*",  # rm -rf *
     r"rm\s+-rf\s+\.\.",  # rm -rf ..
     r"git\s+push.*--force.*main",
diff --git a/.claude/rules/learned/architecture-patterns.md b/.claude/rules/learned/architecture-patterns.md
index d55654f..d1d8bc4 100644
--- a/.claude/rules/learned/architecture-patterns.md
+++ b/.claude/rules/learned/architecture-patterns.md
@@ -161,3 +161,19 @@
   # CORRECT: templates_src is fence-free; copier injects exactly once:
   wrapped = f"# map:start\n{rendered}\n# map:end\n" if fenced else rendered
   ```
+
+- **Spike-First Gating: High-Risk Binding Decisions Require a Docs-Only Artifact Before Implementation** (2026-06-04): When a subtask's answer would bind downstream implementation (which channel carries a value, which API call is idempotent, what schema a subprocess emits), run it FIRST as a docs-only spike that writes an artifact naming the empirical answer + the binding strategy, and commits ZERO production code. Downstream subtasks reference the artifact by name and consume it, not assumptions. A wrong assumption that is not spiked propagates into every component built on it and forces a rewrite cascade. 
In this workflow a research-agent wrongly claimed skill-activation wasn't recoverable from `claude -p`; the ST-001 spike empirically corrected it before any dispatcher code existed. The spike artifact MUST contain a named "binding strategy" section, not just findings (Monitor hard-stopped once for a missing strategy section). [workflow: map-efficient] + +- **Producer-Owns-Parse: The Component That Owns the Subprocess Owns All Derived Fields; Consumers Read the Typed Result** (2026-06-04): When component A launches a subprocess (or owns a raw source) and component B consumes the result, ALL parsing/derivation (transcript reads, field extraction, signal combination) lives in A; B reads only the typed result struct and never re-implements parsing. Two payoffs: (1) a single parse site that a Mock producer can supply directly, so consumer tests need no subprocess/transcript fixture; (2) when the raw output schema changes, only A changes. Putting any parse in B re-couples the modules through the raw format. Extends "Contract-First Inter-Component JSON Schemas": the contract is A's typed struct, and the parse-to-struct boundary is A's responsibility exclusively. 
[workflow: map-efficient] + ```python + # WRONG — runner re-parses a transcript it does not own (couples to raw format) + result = dispatcher.dispatch(cell) # raw proc output + skill = extract_skill_from_transcript(read_jsonl(result.session_id)) + + # CORRECT — dispatcher parses once into a typed field; runner just reads it + @dataclass + class DispatchResult: + triggered_skill: str | None # parsed by dispatcher, NOT by runner + token_usage: TokenUsage | None + # tests inject MockDispatcher(triggered_skill="map-plan") — no subprocess needed + ``` diff --git a/.claude/rules/learned/implementation-patterns.md b/.claude/rules/learned/implementation-patterns.md index 657b145..626da22 100644 --- a/.claude/rules/learned/implementation-patterns.md +++ b/.claude/rules/learned/implementation-patterns.md @@ -128,3 +128,42 @@ paths: dest.chmod(dest.stat().st_mode | 0o755) # test guard: assert os.access(installed_hook, os.X_OK) ``` + +- **`claude -p` Output Has Two Channels: Envelope for Tokens, Transcript JSONL for Skill Name** (2026-06-04): When shelling `claude -p --output-format json` as a subprocess, two distinct output channels carry different information — do not confuse them. The JSON result envelope (stdout) carries `.result` (response text), `.usage` (input/output/cache tokens), and `.session_id`. The name of the skill/slash-surface that actually fired is NOT in the envelope — it is only in Claude Code's native transcript JSONL (located by session_id) as a `tool_use` block with `name=="Skill"` and `input.skill`. Deriving this from the framework's own scratch/digest schema rather than the native transcript yields a wrong claim. Verify empirically by reading the real transcript after a spike call; never infer from internal schema files. 
[workflow: map-efficient] + ```python + env = json.loads(proc.stdout) # .result, .usage, .session_id + tokens = env["usage"] # CORRECT — tokens are in the envelope + # env.get("skill") -> None # WRONG — fired-skill is NOT in the envelope + for line in transcript_jsonl(env["session_id"]).read_text().splitlines(): + m = json.loads(line) + if m.get("type") == "tool_use" and m.get("name") == "Skill": + triggered = m["input"]["skill"]; break + ``` + +- **Scoped Config-Flag Mutation: Seed a Throwaway Temp Copy; Never Modify the Production Source of Truth** (2026-06-04): When a tool/test needs a shipped config flag to behave differently from its production default (e.g. stripping `disable-model-invocation: true` so an eval can auto-select skills), mutate the flag ONLY in a throwaway temp dir seeded with a copy of the production config, discarded after the subprocess exits. Never patch the source repo or `templates_src`. A blanket production flip is a footgun: it silently changes behavior for every other user of the flag and may be committed accidentally. Scope of mutation must match scope of need: one subprocess call → one throwaway dir, always cleaned up in `finally`. [workflow: map-efficient] + ```python + tmp = Path(tempfile.mkdtemp()) + shutil.copytree(REPO / ".claude", tmp / ".claude") # seed from production + strip_flag(tmp / ".claude" / "skills") # mutate throwaway ONLY + try: + subprocess.run(["claude", "-p", prompt, "--output-format", "json"], cwd=tmp) + finally: + shutil.rmtree(tmp) # production never touched + ``` + +- **Clock-Free Core with Caller-Supplied Path: Inject Timestamps at the CLI Boundary, Not Inside the Worker** (2026-06-04): When a worker writes durable output (a timestamped JSONL, a run artifact), do NOT call `datetime.now()` inside the worker. Have the CLI/outermost caller generate the timestamped path and pass it as an explicit `out_path: Path` the worker treats as opaque. 
Benefits: (1) tests pass `tmp_path / "results.jsonl"` with zero clock monkeypatching; (2) the worker is deterministic given the same inputs+path; (3) resume keys on the path the CLI owns. Refines "Long-Running Operations Need Durable State by Default" by fixing WHERE path/timestamp generation lives — at the boundary, not the core. [workflow: map-efficient] + ```python + # CORRECT: worker takes out_path; CLI owns the timestamp + def run_eval(*, entries, dispatcher, runs, out_path: Path, resume=False) -> list: ... + # CLI: out = default_run_path(root, skill, datetime.now(tz).strftime("%Y%m%dT%H%M%SZ")) + # Test: run_eval(..., out_path=tmp_path / "r.jsonl") # no time mocking + ``` + +- **Concurrent Durable Append: threading.Lock for Line Integrity + Stable cell_id Resume Key** (2026-06-04): When parallel workers append JSONL lines to a shared durable file, two invariants must BOTH hold: (1) no interleaved partial lines — guard each `f.write(line + "\n")` with a threading.Lock; (2) resume is idempotent regardless of write order — key on a stable id present in every record (cell_id), never on line number/position. Nondeterministic write order is fine as long as resume dedups by id. Each worker subprocess also runs in its own temp cwd so concurrent subprocesses never share a working dir. Complements "Long-Running Operations Need Durable State" (process-restart durability) with within-process concurrency safety. [workflow: map-efficient] + ```python + with self._lock: # atomic per-line append + with out_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(record) + "\n") + done = {json.loads(l)["cell_id"] for l in out_path.read_text().splitlines() if l.strip()} + pending = [c for c in cells if make_cell_id(...) 
not in done] # order-independent resume + ``` diff --git a/.claude/rules/learned/testing-strategies.md b/.claude/rules/learned/testing-strategies.md index 3d327b4..83a91ba 100644 --- a/.claude/rules/learned/testing-strategies.md +++ b/.claude/rules/learned/testing-strategies.md @@ -139,3 +139,12 @@ paths: # 3. git restore -> confirm GREEN # 4. commit file + test together ``` + +- **Blueprint-Named Test Functions Are a Monitor Contract: Author Them in the Same Subtask as the Code** (2026-06-04): When a subtask blueprint's `test_strategy` names specific pytest function names (e.g. `test_vc3_resume_skips_present_cell_ids`), Monitor treats those names as a HARD completeness contract: a subtask whose logic is correct but whose blueprint-named functions do not yet exist gets `valid=false` (hard stop). The completeness unit is code + named-test-functions-together, not code alone — the blueprint author chose the names to specify observable behavior, so an absent name means the behavior is unverified. Never stub a named test with `pass`/`# TODO` and call the subtask done; the stub satisfies the import but not the contract. In this workflow ST-005's runner code was correct but Monitor hard-stopped until the four named VC tests were authored with real assertions. [workflow: map-efficient] + +- **Final Verification Must Check Shipped Docs Against Actual Behavior, Then Grep for the Same Drift Class** (2026-06-04): After code+tests are green, a dedicated final-verification pass must validate that user-facing docs (SKILL.md, README, CLI `--help`) match actual behavior: default values, accepted schema formats, flag names, output field names. Prose drift is invisible to pytest/ruff/mypy. When the first drift instance is found, immediately grep the WHOLE doc for the same class of claim (every `--flag default`, every schema example, every accepted file-format mention) before moving on — drift clusters because the doc was written once from a design doc, not from running code. 
Here the final-verifier caught a `--max-concurrency` default of 4 (actual 1); grepping the same file then surfaced a fictional YAML eval-set schema block + `.yaml` examples that the JSON-only loader could never parse. [workflow: map-efficient] + ```bash + # one drift found -> grep the whole doc for the drift class before marking done + mapify skill-eval --help | grep -i max-concurrency # actual default + grep -nE 'default|yaml|schema|--[a-z-]+' docs/SKILL.md # reconcile every claim + ``` diff --git a/.claude/skills/map-efficient/SKILL.md b/.claude/skills/map-efficient/SKILL.md index b986b52..2045905 100644 --- a/.claude/skills/map-efficient/SKILL.md +++ b/.claude/skills/map-efficient/SKILL.md @@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH" Snapshots pre-existing failures so later subtasks distinguish "introduced regression" from "was broken pre-plan". Auto-detects -Make/pytest/go test/cargo. Overrides + narrow-target guidance: -[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline). +Make/pytest/go test/cargo. It captures the test run internally and prints a +single compact JSON report at the end — read that JSON directly; do NOT pipe it +through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target +guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline). ### Wave Computation (after INIT_STATE) - REQUIRED diff --git a/.claude/skills/map-efficient/efficient-reference.md b/.claude/skills/map-efficient/efficient-reference.md index 802d11c..6734cfc 100644 --- a/.claude/skills/map-efficient/efficient-reference.md +++ b/.claude/skills/map-efficient/efficient-reference.md @@ -203,6 +203,11 @@ fix or defer. python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH" ``` +It captures the test run internally and prints a single compact JSON report at +the end — read that JSON directly. 
Do NOT pipe it through `head`/`tail` (per the +repo bash guidelines); the output is one small object, not a stream, so +truncating it only hides fields. + Auto-detects from project markers: - `Makefile` with `test:` target → `make test` - `pyproject.toml` / `pytest.ini` → `pytest` diff --git a/.claude/skills/map-skill-eval/SKILL.md b/.claude/skills/map-skill-eval/SKILL.md new file mode 100644 index 0000000..567ac04 --- /dev/null +++ b/.claude/skills/map-skill-eval/SKILL.md @@ -0,0 +1,94 @@ +--- +name: map-skill-eval +description: | + Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient. +effort: medium +disable-model-invocation: true +argument-hint: "[skill] [--eval-set PATH]" +--- +# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation + +Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill. + +Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`. + +## Invocation + +```bash +mapify skill-eval run SKILL --eval-set PATH [--dry-run] [--resume] [--max-concurrency N] +``` + +- `SKILL` — the skill name to evaluate (e.g. `map-plan`). +- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions. +- `--dry-run` — validate the eval-set and print the planned run count without spending any quota. +- `--resume` — continue an interrupted run from the last durable checkpoint. +- `--max-concurrency N` — max parallel `claude -p` workers (default: 1). + +## What It Does + +1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases. +2. 
**Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger). +3. **Deterministic assertions** — each eval case may specify one or more assertion types: + - `contains` / `not_contains` — substring presence in the response. + - `regex` — pattern match against the response. + - `valid_json` — response parses as JSON. + - `trigger` / `not_trigger` — skill fired / did not fire. +4. **Durable resumable run log** — results are appended to `.map/eval-runs/SKILL/TIMESTAMP.jsonl` as each case completes, so a partial run is recoverable via `--resume`. +5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus the mean and standard deviation of token usage and duration across cells, and the path of the run log. + +## Eval-Set Format + +A JSON object with an `entries` array. Each entry has a `prompt`, optional +`should_trigger` / `should_not_trigger` skill names (the runner turns these into +`trigger` / `not_trigger` assertions), and an optional `assertions` array. +Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`, +`not_trigger`. + +```json +{ + "entries": [ + { + "prompt": "Decompose this feature into subtasks", + "should_trigger": "map-plan", + "assertions": [ + { "type": "contains", "value": "subtask" } + ] + }, + { + "prompt": "Run quality gates", + "should_not_trigger": "map-plan", + "assertions": [] + } + ] +} +``` + +## --dry-run + +`--dry-run` validates the eval-set schema and prints the planned invocation count; a dry run spends no quota. No `claude -p` calls are made; no `.jsonl` is written. 
+ +## Examples + +```bash +# Validate eval-set without spending quota +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run + +# Run full eval with up to 8 parallel workers +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8 + +# Resume an interrupted run +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume +``` + +## Troubleshooting + +- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill. +- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with a non-empty `entries` array, that each entry has a `prompt`, and that every item in an entry's optional `assertions` array has a valid `type`. +- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/SKILL/TIMESTAMP.jsonl` under the current working directory. If no prior run exists, a fresh run log is started instead. +- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd. + +## Related Commands + +- `/map-plan` — plan and decompose tasks. +- `/map-efficient` — full MAP workflow execution. +- `/map-check` — run quality gates and verify MAP workflow completion. 
diff --git a/.claude/skills/skill-rules.json b/.claude/skills/skill-rules.json index bbe32ab..d5a9606 100644 --- a/.claude/skills/skill-rules.json +++ b/.claude/skills/skill-rules.json @@ -239,6 +239,18 @@ ] } }, + "map-skill-eval": { + "type": "manual", + "skillClass": "task", + "enforcement": "manual", + "priority": "medium", + "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", + "requires-cmd": ["claude"], + "promptTriggers": { + "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"], + "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"] + } + }, "map-task": { "type": "manual", "skillClass": "task", diff --git a/CLAUDE.md b/CLAUDE.md index 46dd045..7b5db00 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -65,6 +65,7 @@ Validation: - "Not in the CI gate" is NOT a valid reason to skip. The error is real if any tool reported it. - "Static-analysis noise" is NOT a category. Either the type system is correct and the code is wrong, or the annotation needs fixing — pick one and fix it. - Only legitimate skip: the user explicitly approves deferral in the current conversation. Document the deferral in writing. +- **Any error encountered while operating the MAP Framework itself must be fixed immediately, in the same change.** This covers the framework's own runtime — a hook that crashes or false-positives, a `.map/scripts/` runner or gate that errors or mis-reports, a `mapify` CLI traceback, a render/validator/blueprint failure, a broken `Task`/agent dispatch. When you hit one mid-task: STOP, find the root cause, and fix it before continuing the original work. Do NOT work around it, do NOT defer it as "unrelated", do NOT note-and-move-on past a broken tool. 
If the fix is genuinely out of scope or risky, stop and ask the user — never silently continue past a malfunctioning framework component. (Errors raised by an external plugin/hook NOT shipped by this repo are out of scope here; say so and route them to the user.)
 
 ## Bash Command Guidelines
 
diff --git a/src/mapify_cli/__init__.py b/src/mapify_cli/__init__.py
index de885c2..cf89218 100644
--- a/src/mapify_cli/__init__.py
+++ b/src/mapify_cli/__init__.py
@@ -140,6 +140,12 @@ def create_ssl_context():
 
 app.add_typer(validate_app, name="validate")
 
+skill_eval_app = typer.Typer(
+    name="skill-eval", help="Evaluate a skill's trigger accuracy + cost"
+)
+
+app.add_typer(skill_eval_app, name="skill-eval")
+
 
 def version_callback(value: bool):
     """Callback to show version and exit."""
@@ -1361,6 +1367,131 @@ def upgrade():
     )
 
 
+# Skill-eval commands
+
+
+@skill_eval_app.command("run")
+def skill_eval_run(
+    skill: str = typer.Argument(..., help="Skill under test, e.g. map-debug"),
+    eval_set: Optional[Path] = typer.Option(
+        None, "--eval-set", help="Path to eval-set JSON"
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Validate eval-set + print planned count; spend nothing"
+    ),
+    resume: bool = typer.Option(
+        False, "--resume", help="Resume a partial run, skipping completed cells"
+    ),
+    max_concurrency: int = typer.Option(
+        1, "--max-concurrency", min=1, help="Bounded parallel dispatch (default 1)"
+    ),
+) -> None:
+    """Run a skill evaluation matrix.
+
+    Exit codes:
+        0 - Success (or dry-run completed)
+        1 - Runtime error (claude not found, or unexpected failure)
+        2 - Validation error (missing --eval-set, or unreadable/malformed eval-set file)
+    """
+    # Intent: lazy import to keep top-level import time low and avoid import cycles.
+    import json
+    import mapify_cli.skills_eval.runner as _runner
+    import mapify_cli.skills_eval.aggregator as _aggregator
+    from mapify_cli.skills_eval.dispatcher import ClaudeSubprocessDispatcher
+    from mapify_cli.skills_eval.eval_schema import EvalResultRecord
+    from datetime import timezone
+
+    # SC-2: --eval-set is required.
+    if eval_set is None:
+        console.print(
+            "[bold red]Error:[/bold red] provide --eval-set PATH"
+        )
+        raise typer.Exit(2)
+
+    # SC-2: load and validate the eval-set; unreadable/malformed/empty → Exit(2), NO invocations.
+    try:
+        entries = _runner.load_eval_set(eval_set)
+    except (ValueError, OSError) as exc:
+        console.print(f"[bold red]Error:[/bold red] {exc}")
+        raise typer.Exit(2) from exc
+
+    # Dry-run path: zero quota, NO dispatcher construction, NO claude required.
+    if dry_run:
+        # D10: variant_id fixed = 1, runs = 1.
+        planned = len(entries) * 1 * 1
+        console.print(
+            f"[bold]Dry-run:[/bold] planned [cyan]{planned}[/cyan] invocation(s) "
+            f"for skill [bold]{skill}[/bold] — spends 0 quota"
+        )
+        raise typer.Exit(0)
+
+    # HC-6: require claude BEFORE any invocation.
+    if shutil.which("claude") is None:
+        console.print(
+            "[bold red]Error:[/bold red] requires-cmd: claude — "
+            "install the claude CLI and ensure it is on PATH"
+        )
+        raise typer.Exit(1)
+
+    # Resolve output path.
+    root = Path.cwd()
+    # --resume reuses the latest run log when one exists; otherwise start a fresh one.
+    out_path = _runner.latest_run_path(root, skill) if resume else None
+    if out_path is None:
+        stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+        out_path = _runner.default_run_path(root, skill, stamp)
+
+    # Run the evaluation matrix.
+    disp = ClaudeSubprocessDispatcher()
+    _aggregator.bounded_run(
+        skill=skill,
+        entries=entries,
+        dispatcher=disp,
+        runs=1,
+        out_path=out_path,
+        resume=resume,
+        max_concurrency=max_concurrency,
+    )
+
+    # Read all records from the output file, aggregate, and print summary.
+    records: List[EvalResultRecord] = []
+    skipped = 0
+    if out_path.exists():
+        for raw_line in out_path.read_text(encoding="utf-8").splitlines():
+            raw_line = raw_line.strip()
+            if not raw_line:
+                continue
+            try:
+                records.append(EvalResultRecord.from_dict(json.loads(raw_line)))
+            except (ValueError, KeyError, TypeError):
+                # Best-effort read: a torn or foreign line must not abort the summary,
+                # but it is counted so the reported totals are not silently short.
+                skipped += 1
+    if skipped:
+        console.print(
+            f"[yellow]Warning:[/yellow] skipped {skipped} unparseable line(s) in {out_path}"
+        )
+
+    summary = _aggregator.aggregate(records)
+    console.print(
+        f"\n[bold]Eval complete:[/bold] skill=[bold]{skill}[/bold] "
+        f"pass_rate=[cyan]{summary.pass_rate:.1%}[/cyan] "
+        f"({summary.passed_cells}/{summary.total_cells} cells passed)"
+    )
+    if summary.tokens_mean is not None:
+        console.print(
+            f" tokens mean={summary.tokens_mean:.1f} "
+            f"stddev={summary.tokens_stddev or 0.0:.1f} "
+            f"(n={summary.token_sample_size})"
+        )
+    if summary.duration_mean is not None:
+        console.print(
+            f" duration mean={summary.duration_mean:.2f}s "
+            f"stddev={summary.duration_stddev or 0.0:.2f}s"
+        )
+    console.print(f" artifact: [cyan]{out_path}[/cyan]")
+
+
 # Validate commands
 
 
diff --git a/src/mapify_cli/skills_eval/__init__.py b/src/mapify_cli/skills_eval/__init__.py
new file mode 100644
index 0000000..df7042f
--- /dev/null
+++ b/src/mapify_cli/skills_eval/__init__.py
@@ -0,0 +1,57 @@
+"""skills_eval — skill trigger evaluation data contracts and dispatchers.
+
+Exports the shared types used by every eval component (dispatcher, assertions,
+runner, aggregator) and the concrete dispatcher implementations.
+""" + +from __future__ import annotations + +from mapify_cli.skills_eval.assertions import ( + AssertionResult, + run_assertion, + run_assertions, +) +from mapify_cli.skills_eval.dispatcher import ( + ClaudeSubprocessDispatcher, + MockDispatcher, + VariantDispatcher, +) +from mapify_cli.skills_eval.eval_schema import ( + DispatchResult, + EvalResultRecord, + EvalSetEntry, + make_cell_id, +) +from mapify_cli.skills_eval.runner import ( + default_run_path, + evaluate_cell, + latest_run_path, + load_eval_set, + run_eval, +) +from mapify_cli.skills_eval.aggregator import ( + AggregateSummary, + aggregate, + bounded_run, +) + +__all__ = [ + "AggregateSummary", + "AssertionResult", + "ClaudeSubprocessDispatcher", + "DispatchResult", + "EvalResultRecord", + "EvalSetEntry", + "MockDispatcher", + "VariantDispatcher", + "aggregate", + "bounded_run", + "default_run_path", + "evaluate_cell", + "latest_run_path", + "load_eval_set", + "make_cell_id", + "run_assertion", + "run_assertions", + "run_eval", +] diff --git a/src/mapify_cli/skills_eval/aggregator.py b/src/mapify_cli/skills_eval/aggregator.py new file mode 100644 index 0000000..1e55e27 --- /dev/null +++ b/src/mapify_cli/skills_eval/aggregator.py @@ -0,0 +1,300 @@ +"""Aggregation and bounded-concurrency runner for skills_eval. + +Public API: +- ``AggregateSummary`` -- frozen dataclass summarising a completed eval run. +- ``aggregate(records)`` -- compute summary stats from a list of EvalResultRecord. +- ``bounded_run(...)`` -- parallel cell dispatch with serialised durable writes. + +Design invariants respected: +- INV-3: no ``import anthropic``, no ANTHROPIC_API_KEY access. +- INV-5: ClaudeSubprocessDispatcher isolation is automatic (each dispatch creates + its own mkdtemp cwd); no extra isolation code is needed here. +- VC1: pass_rate = passed_cells / total_cells (0.0 when total==0, never divide-by-zero). +- VC2: token mean/stddev use statistics.mean/stdev; n<2 → stddev 0.0; n==0 → None. 
+- VC3: bounded_run serialises writes under a threading.Lock (no .jsonl corruption). +- VC4: aggregate never raises on empty list or all-null token_usage records. +- SC-1: max_concurrency controls ThreadPoolExecutor workers; default 1 (sequential). +""" + +from __future__ import annotations + +import concurrent.futures +import dataclasses +import logging +import statistics +import threading +from dataclasses import dataclass +from pathlib import Path +from typing import Any, TypeAlias + +from mapify_cli.skills_eval.eval_schema import EvalResultRecord +from mapify_cli.skills_eval.eval_schema import EvalSetEntry +from mapify_cli.skills_eval.dispatcher import VariantDispatcher +from mapify_cli.skills_eval.runner import ( + _append_record, + _read_present_cell_ids, + evaluate_cell, + make_cell_id, +) + +logger = logging.getLogger(__name__) + +# Intent: fixed variant_id per D10 -- matches the constant in runner.py. +_VARIANT_ID: int = 1 + +# Re-export make_cell_id so callers who import from aggregator get it too. +__all__ = ["AggregateSummary", "aggregate", "bounded_run"] + +# Intent: module-level TypeAlias so pyright can resolve it in function annotations. +_WorkItem: TypeAlias = tuple[int, int, EvalSetEntry] + + +# --------------------------------------------------------------------------- +# AggregateSummary +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class AggregateSummary: + """Aggregate statistics over a completed eval run. + + JSON-serialisable via ``to_dict()``. All float fields that can be absent + (token stats, duration when no records) are typed ``float | None``. + + Fields + ------ + total_cells: + Total number of ``EvalResultRecord`` objects in the input. + passed_cells: + Count of records whose ``assertions_failed`` list is EMPTY. + pass_rate: + ``passed_cells / total_cells``; 0.0 when ``total_cells == 0``. + token_sample_size: + Count of records where ``token_usage`` is not None. 
+ tokens_mean: + Arithmetic mean of ``token_usage.total`` over the token sample. + ``None`` when ``token_sample_size == 0``. + tokens_stddev: + Sample standard deviation of ``token_usage.total``; 0.0 when + ``token_sample_size < 2``; ``None`` when ``token_sample_size == 0``. + duration_mean: + Arithmetic mean of ``record.duration_s`` over all records. + ``None`` when ``total_cells == 0``. + duration_stddev: + Sample standard deviation of ``duration_s``; 0.0 when + ``total_cells < 2``; ``None`` when ``total_cells == 0``. + """ + + total_cells: int + passed_cells: int + pass_rate: float + token_sample_size: int + tokens_mean: float | None + tokens_stddev: float | None + duration_mean: float | None + duration_stddev: float | None + + def to_dict(self) -> dict[str, Any]: + """Return a JSON-serialisable dict for this summary.""" + return dataclasses.asdict(self) + + +# --------------------------------------------------------------------------- +# _safe_stddev (n<2 guard, shared by token and duration paths) +# --------------------------------------------------------------------------- + + +def _safe_stddev(xs: list[float]) -> float: + """Return sample stdev of *xs*, guarding against n<2 with 0.0. + + ``statistics.stdev`` raises ``StatisticsError`` on n<2; we normalise that + to 0.0 because a single-sample (or zero-sample) collection has no spread. + The caller guarantees ``len(xs) >= 1`` (use 0.0 for empty at the call site). + """ + if len(xs) < 2: + return 0.0 + return statistics.stdev(xs) + + +# --------------------------------------------------------------------------- +# aggregate +# --------------------------------------------------------------------------- + + +def aggregate(records: list[EvalResultRecord]) -> AggregateSummary: + """Compute aggregate statistics over *records*. + + Never raises, even for an empty list or all-null ``token_usage`` records. + + Parameters + ---------- + records: + List of ``EvalResultRecord`` objects from a completed (or partial) run. 
+ May be empty. + + Returns + ------- + AggregateSummary + Populated summary. When ``records`` is empty: + ``total_cells=0, passed_cells=0, pass_rate=0.0, + token_sample_size=0, tokens_mean=None, tokens_stddev=None, + duration_mean=None, duration_stddev=None``. + """ + total_cells = len(records) + + # VC1: pass_rate --- cells with EMPTY assertions_failed are "passed". + passed_cells = sum(1 for r in records if len(r.assertions_failed) == 0) + # Intent: explicit zero-guard so we never divide by zero. + pass_rate = passed_cells / total_cells if total_cells > 0 else 0.0 + + # VC2/VC4: token stats --- only over records with non-null token_usage. + token_totals: list[float] = [ + float(r.token_usage.total) for r in records if r.token_usage is not None + ] + token_sample_size = len(token_totals) + if token_sample_size == 0: + # VC4: all-null token_usage → both stats are None; pass_rate+duration still valid. + tokens_mean: float | None = None + tokens_stddev: float | None = None + else: + tokens_mean = statistics.mean(token_totals) + tokens_stddev = _safe_stddev(token_totals) + + # Duration stats --- duration_s is always present on every record. 
def bounded_run(
    *,
    skill: str,
    entries: list[EvalSetEntry],
    dispatcher: VariantDispatcher,
    runs: int,
    out_path: Path,
    resume: bool = False,
    max_concurrency: int = 1,
) -> list[EvalResultRecord]:
    """Dispatch every missing cell of the prompts x runs matrix on a thread pool.

    Each cell is evaluated by ``evaluate_cell`` on a worker thread; the
    resulting record is appended to *out_path* while holding a single
    ``threading.Lock`` so that appends from different workers never interleave.

    Parameters
    ----------
    skill:
        Skill name, forwarded to ``evaluate_cell`` and used in log lines.
    entries:
        Eval-set rows; the position of a row is its ``prompt_index``.
    dispatcher:
        Dispatcher forwarded to every ``evaluate_cell`` call. The same
        instance is shared by all worker threads.
    runs:
        Number of runs per prompt (``range(runs)``).
    out_path:
        Path of the ``.jsonl`` output file; its parent directory is created
        if missing.
    resume:
        When True and *out_path* exists, cells whose cell_id is already
        present in the file are skipped.
    max_concurrency:
        Upper bound on worker threads; values below 1 are treated as 1.

    Returns
    -------
    list[EvalResultRecord]
        Records produced during THIS call, in completion order. Cells
        skipped because of *resume* are not included.

    Raises
    ------
    Exception
        Any exception raised inside a worker is re-raised here via
        ``Future.result()``.
    """
    # Resume bookkeeping: cell_ids already on disk are never re-dispatched.
    already_done: set[str] = set()
    if resume and out_path.exists():
        already_done = _read_present_cell_ids(out_path)
        logger.info(
            "bounded_run: resume mode -- %d cells already present in %s",
            len(already_done),
            out_path,
        )

    # The directory must exist before the first worker tries to append.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Collect (prompt_index, run_number, entry) for every cell still missing.
    pending: list[_WorkItem] = []
    for prompt_index, entry in enumerate(entries):
        for run_number in range(runs):
            cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number)
            if cell_id in already_done:
                logger.debug(
                    "bounded_run: skipping cell %s (already present in %s)",
                    cell_id,
                    out_path,
                )
                continue
            pending.append((prompt_index, run_number, entry))

    # One lock guards both the file append and the in-memory result list.
    append_guard = threading.Lock()
    written: list[EvalResultRecord] = []

    def _run_cell(cell: _WorkItem) -> EvalResultRecord:
        """Evaluate a single cell, then append its record under the lock."""
        p_index, r_number, cell_entry = cell
        outcome = evaluate_cell(
            skill=skill,
            entry=cell_entry,
            prompt_index=p_index,
            run_number=r_number,
            dispatcher=dispatcher,
        )
        with append_guard:
            _append_record(out_path, outcome)
            written.append(outcome)
        return outcome

    pool_size = max(1, max_concurrency)
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        submitted = []
        for cell in pending:
            submitted.append(pool.submit(_run_cell, cell))
        # Calling result() surfaces any exception raised inside a worker.
        for finished in concurrent.futures.as_completed(submitted):
            finished.result()

    logger.info(
        "bounded_run: finished skill=%s entries=%d runs=%d cells_written=%d out=%s",
        skill,
        len(entries),
        runs,
        len(written),
        out_path,
    )

    return written
def _assert_contains(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Check that the spec's ``"value"`` string occurs inside ``result.raw_output``.

    A missing or non-string ``"value"`` yields a FAIL result whose detail
    names the offending type; no exception is raised for that case.
    """
    needle = spec.get("value")
    if isinstance(needle, str):
        hit = needle in result.raw_output
        if hit:
            verdict, where = "PASS", "found in"
        else:
            verdict, where = "FAIL", "not found in"
        return AssertionResult(
            passed=hit,
            type="contains",
            detail=f"contains {needle!r} -> {verdict} ({where} raw_output)",
        )

    # Malformed spec: report instead of raising.
    return AssertionResult(
        passed=False,
        type="contains",
        detail=f"contains: missing or non-string 'value' key (got {type(needle).__name__!r})",
    )
def _assert_valid_json(
    _spec: dict[str, object], result: DispatchResult
) -> AssertionResult:
    """PASS iff ``result.raw_output.strip()`` parses via ``json.loads``.

    The spec carries no type-specific keys for this assertion, so ``_spec``
    is unused.

    ``raw_output`` is model-generated text, so pathological input is possible.
    Besides ``ValueError`` (of which ``json.JSONDecodeError`` is a subclass),
    ``json.loads`` raises ``RecursionError`` on extremely deep nesting; both
    are converted to a FAIL result so the assertion runner never raises.
    """
    try:
        json.loads(result.raw_output.strip())
    except (ValueError, RecursionError) as exc:
        return AssertionResult(
            passed=False,
            type="valid_json",
            detail=f"valid_json -> FAIL (JSON parse error: {exc})",
        )
    return AssertionResult(
        passed=True,
        type="valid_json",
        detail="valid_json -> PASS (raw_output is well-formed JSON)",
    )
def _assert_not_trigger(
    spec: dict[str, object], result: DispatchResult
) -> AssertionResult:
    """Check that the skill named in the spec did NOT fire.

    The verdict is ``result.triggered_skill != spec["skill"]``. When no skill
    fired at all (``triggered_skill is None``) the comparison is True, so the
    assertion passes (SC-3). A missing or non-string ``"skill"`` key yields a
    FAIL result rather than an exception.
    """
    forbidden = spec.get("skill")
    if isinstance(forbidden, str):
        fired = result.triggered_skill
        # ``None != "<name>"`` is True, which is exactly the SC-3 requirement.
        ok = fired != forbidden
        verdict = "PASS" if ok else "FAIL"
        return AssertionResult(
            passed=ok,
            type="not_trigger",
            detail=f"not_trigger {forbidden!r} -> {verdict} (triggered_skill={fired!r})",
        )

    return AssertionResult(
        passed=False,
        type="not_trigger",
        detail=(
            f"not_trigger: missing or non-string 'skill' key "
            f"(got {type(forbidden).__name__!r})"
        ),
    )
def run_assertion(spec: dict[str, object], result: DispatchResult) -> AssertionResult:
    """Evaluate a single assertion spec against a DispatchResult.

    Malformed input is reported as a FAIL result with a human-debuggable
    ``detail`` string instead of an exception: a spec that is not a dict, a
    ``"type"`` that is missing or not a string, and a type with no registered
    handler are all handled here. Type-specific key validation is done by the
    selected handler.

    Parameters
    ----------
    spec:
        Dict with at least a ``"type"`` key and any type-specific keys.
        Specs come from externally supplied JSON, so the container type is
        checked at runtime rather than trusted.
    result:
        The DispatchResult produced by the dispatcher.

    Returns
    -------
    AssertionResult
        Frozen dataclass; ``passed`` is the verdict, ``detail`` explains why.
    """
    # Guard the container first: ``spec.get`` below would raise AttributeError
    # for e.g. a bare string or list element in the eval-set file.
    if not isinstance(spec, dict):
        return AssertionResult(
            passed=False,
            type="unknown",
            detail=(
                f"invalid assertion spec: expected dict, "
                f"got {type(spec).__name__!r}"
            ),
        )

    assertion_type = spec.get("type")
    if not isinstance(assertion_type, str):
        return AssertionResult(
            passed=False,
            type=str(assertion_type),
            detail=(
                f"unknown assertion type: {assertion_type!r} "
                f"(must be str, got {type(assertion_type).__name__!r})"
            ),
        )

    handler = _ASSERTION_HANDLERS.get(assertion_type)
    if handler is None:
        return AssertionResult(
            passed=False,
            type=assertion_type,
            detail=f"unknown assertion type: {assertion_type!r}",
        )

    return handler(spec, result)
+ """ + passed_details: list[str] = [] + failed_details: list[str] = [] + + for spec in specs: + ar = run_assertion(spec, result) + if ar.passed: + passed_details.append(ar.detail) + else: + failed_details.append(ar.detail) + + return passed_details, failed_details diff --git a/src/mapify_cli/skills_eval/dispatcher.py b/src/mapify_cli/skills_eval/dispatcher.py new file mode 100644 index 0000000..e87c406 --- /dev/null +++ b/src/mapify_cli/skills_eval/dispatcher.py @@ -0,0 +1,540 @@ +"""Variant dispatcher for the skills_eval package. + +Provides the ABC ``VariantDispatcher`` and two concrete implementations: +- ``MockDispatcher``: zero-subprocess, caller-controlled output for CI tests (INV-2). +- ``ClaudeSubprocessDispatcher``: real ``claude -p`` invocation in a seeded + throwaway temp cwd with the TEMP-FLIP applied. + +Hard constraints (INV-2, INV-3, INV-5) +--------------------------------------- +- Uses only stdlib; no Anthropic SDK imports (INV-3). +- Does not read cloud credentials from the environment (INV-3). +- Production ``.claude/`` and ``.map/`` trees are NEVER modified (INV-5). + The TEMP-FLIP touches only the throwaway seeded copy. +- ``MockDispatcher.dispatch`` NEVER calls subprocess (INV-2). 
class VariantDispatcher(ABC):
    """Abstract dispatcher: given a prompt, produce a ``DispatchResult``.

    Subclasses implement ``dispatch``; because it is declared with
    ``@abstractmethod``, a subclass that does not override it cannot be
    instantiated.
    """

    @abstractmethod
    def dispatch(self, prompt: str) -> DispatchResult:
        """Run ``prompt`` and return a fully-populated ``DispatchResult``.

        Parameters
        ----------
        prompt:
            The prompt text to run.

        Returns
        -------
        DispatchResult
            The outcome of the run.

        Implementations MUST NOT raise — transient failures are captured in
        ``DispatchResult.error``.
        """
def _seed_temp_cwd(source_claude_dir: Path) -> Path:
    """Create a throwaway temp directory seeded with a copy of ``.claude/``.

    Steps:
    1. ``tempfile.mkdtemp(prefix="mapeval-")`` — fresh isolated dir.
    2. Copy ``source_claude_dir`` to ``<tmp>/.claude`` with
       ``shutil.copytree``; if the source is not a directory, an empty
       ``.claude/`` is created instead and a warning is logged.
    3. Create a fresh empty ``<tmp>/.map``.
    4. TEMP-FLIP: ``_apply_temp_flip`` rewrites
       ``disable-model-invocation: true`` → ``false`` in the seeded copy.

    Parameters
    ----------
    source_claude_dir:
        Directory to copy from. It is only read, never written.

    Returns
    -------
    Path
        The temp dir. The caller is responsible for
        ``shutil.rmtree(tmp, ignore_errors=True)`` cleanup.

    Raises
    ------
    Exception
        Whatever steps 2-4 raise (e.g. ``shutil.Error`` / ``OSError`` from
        the copy). In that case the temp dir is removed before the exception
        propagates, because the caller never receives the path and therefore
        could not clean it up itself.
    """
    tmp = Path(tempfile.mkdtemp(prefix="mapeval-"))

    try:
        # Step 2: copy the .claude/ tree (only if the source exists).
        seeded_claude = tmp / ".claude"
        if source_claude_dir.is_dir():
            shutil.copytree(source_claude_dir, seeded_claude)
        else:
            seeded_claude.mkdir(parents=True)
            logger.warning(
                "seed_temp_cwd: source_claude_dir %s does not exist — seeding empty .claude/",
                source_claude_dir,
            )

        # Step 3: empty .map/ — prevents accidental reads of production
        # workflow state.
        (tmp / ".map").mkdir(parents=True)

        # Step 4: TEMP-FLIP — make every seeded skill model-selectable for
        # the eval. Skills without the field are left untouched.
        _apply_temp_flip(seeded_claude)
    except BaseException:
        # Without this, a half-seeded directory would be leaked on every
        # failed seed: the path has not been handed to the caller yet.
        shutil.rmtree(tmp, ignore_errors=True)
        raise

    return tmp
Glob ``~/.claude/projects/*/.jsonl`` (session_id is a unique + UUID — no slug fragility). + 2. Fall back to slug-from-cwd path if glob returns nothing. + 3. If transcript not found → return ``None`` (do not crash). + + Detection rule: find the first assistant message.content[*] where + ``type=="tool_use"`` and ``name=="Skill"``; return ``input.skill``. + ``name=="Agent"`` / ``Task`` blocks are ignored. + """ + if not session_id: + return None + + transcript_path = _locate_transcript(session_id, cwd) + if transcript_path is None or not transcript_path.exists(): + logger.debug( + "transcript not found for session_id=%s cwd=%s", session_id, cwd + ) + return None + + return _parse_transcript_for_skill(transcript_path) + + +def _locate_transcript(session_id: str, cwd: Path) -> Path | None: + """Return the path to the JSONL transcript or ``None`` if not found.""" + projects_dir = Path.home() / ".claude" / "projects" + + # Primary: UUID-based glob — immune to slug encoding differences. + if session_id: + matches = list(projects_dir.glob(f"*/{session_id}.jsonl")) + if matches: + return matches[0] + + # Fallback: reconstruct slug from cwd (``/`` and ``.`` → ``-``). 
+ cwd_slug = str(cwd).replace("/", "-").replace(".", "-") + fallback = projects_dir / cwd_slug / f"{session_id}.jsonl" + if fallback.exists(): + return fallback + + return None + + +def _parse_transcript_for_skill(path: Path) -> str | None: + """Return the first ``Skill`` tool_use ``input.skill`` value, or ``None``.""" + try: + with path.open(encoding="utf-8") as fh: + for raw_line in fh: + raw_line = raw_line.strip() + if not raw_line: + continue + try: + entry = json.loads(raw_line) + except json.JSONDecodeError: + continue + + skill = _extract_skill_from_entry(entry) + if skill is not None: + return skill + except OSError as exc: + logger.warning("parse_transcript: could not read %s: %s", path, exc) + + return None + + +def _extract_skill_from_entry(entry: Any) -> str | None: + """Extract ``input.skill`` from a transcript entry if it is a Skill tool_use. + + Walks ``message.content[*]`` looking for ``type=="tool_use"`` + + ``name=="Skill"``. Returns the skill name string or ``None``. + """ + if not isinstance(entry, dict): + return None + + message = entry.get("message") + if not isinstance(message, dict): + return None + + content = message.get("content") + if not isinstance(content, list): + return None + + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") != "tool_use": + continue + if block.get("name") != "Skill": + continue + tool_input = block.get("input") + if isinstance(tool_input, dict): + skill_name = tool_input.get("skill") + if isinstance(skill_name, str) and skill_name: + return skill_name + + return None + + +# --------------------------------------------------------------------------- +# Envelope parsing +# --------------------------------------------------------------------------- + + +def _parse_envelope(stdout: str) -> tuple[str, TokenUsage | None, str]: + """Parse the ``claude -p --output-format json`` result envelope defensively. + + Returns ``(raw_output, token_usage, session_id)``. 
class ClaudeSubprocessDispatcher(VariantDispatcher):
    """Real ``claude -p`` dispatcher for production/manual eval runs.

    Seeding and cleanup
    -------------------
    Each ``dispatch()`` call:
    1. Creates a fresh temp cwd seeded with a copy of ``source_claude_dir``
       and an empty ``.map/`` (via ``_seed_temp_cwd``, which also applies the
       TEMP-FLIP).
    2. Runs ``claude -p <prompt> --output-format json`` in that temp cwd.
    3. Removes the temp dir in a ``finally`` block.

    Retry policy (VC4)
    ------------------
    ``subprocess.TimeoutExpired``, non-zero ``returncode``, ``OSError`` and
    any other exception from ``subprocess.run`` are treated as transient.
    Up to ``max_retries`` additional attempts are made with jittered
    exponential backoff. After exhaustion the error is recorded in
    ``DispatchResult.error``; no exception escapes ``dispatch()``.

    Concurrency
    -----------
    The failure message of the latest ``_run_once`` call (``_last_error``) is
    stored per thread, so several threads calling ``dispatch()`` on the same
    instance cannot overwrite each other's message. Each call also works in
    its own temp cwd.

    INV-5 compliance
    ----------------
    ``cwd`` of the subprocess is always the throwaway temp dir.
    """

    def __init__(
        self,
        *,
        source_claude_dir: Path | None = None,
        timeout: float = 120.0,
        max_retries: int = 2,
        backoff_base: float = 2.0,
    ) -> None:
        """Initialise the dispatcher.

        Parameters
        ----------
        source_claude_dir:
            Path to the ``.claude/`` directory to seed from. Defaults to
            ``Path.cwd() / ".claude"`` evaluated at construction time.
        timeout:
            Per-attempt timeout in seconds passed to ``subprocess.run``.
        max_retries:
            Number of *additional* retry attempts after the first failure.
            Total attempts = 1 + max_retries.
        backoff_base:
            Base for exponential backoff (seconds). There is no sleep before
            the first attempt; the first retry sleeps
            ``backoff_base * 2**0 + jitter``, the second retry sleeps
            ``backoff_base * 2**1 + jitter``, etc.
        """
        # Function-scope import: threading is needed only for the per-thread
        # error slot below.
        import threading

        self._source_claude_dir: Path = (
            source_claude_dir if source_claude_dir is not None else Path.cwd() / ".claude"
        )
        self._timeout = timeout
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        # Per-thread storage behind the ``_last_error`` property.
        self._error_state = threading.local()
        self._last_error = ""

    # ------------------------------------------------------------------
    # Per-thread error slot
    # ------------------------------------------------------------------

    @property
    def _last_error(self) -> str:
        """Failure message of the calling thread's latest ``_run_once`` call.

        Returns ``""`` for a thread that has not recorded a failure yet.
        """
        return getattr(self._error_state, "message", "")

    @_last_error.setter
    def _last_error(self, message: str) -> None:
        self._error_state.message = message

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def dispatch(self, prompt: str) -> DispatchResult:
        """Dispatch ``prompt`` via ``claude -p``, with backoff retry on failure.

        Always returns a ``DispatchResult`` — never raises. Unexpected
        exceptions are converted into an error result whose message says
        whether the failure happened while seeding the temp cwd or afterwards.
        """
        t_total_start = time.monotonic()
        tmp: Path | None = None

        try:
            tmp = _seed_temp_cwd(self._source_claude_dir)
            return self._dispatch_with_retry(prompt, tmp, t_total_start)
        except Exception as exc:  # noqa: BLE001
            duration_s = time.monotonic() - t_total_start
            # ``tmp`` is only assigned once seeding succeeded, so it tells
            # the two failure stages apart.
            stage = "seeding" if tmp is None else "dispatch"
            logger.warning("dispatch: unexpected error during %s: %s", stage, exc)
            return DispatchResult(
                raw_output="",
                triggered_skill=None,
                token_usage=None,
                duration_s=duration_s,
                error=f"{stage} error: {exc}",
            )
        finally:
            if tmp is not None:
                shutil.rmtree(tmp, ignore_errors=True)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _dispatch_with_retry(
        self,
        prompt: str,
        tmp: Path,
        t_total_start: float,
    ) -> DispatchResult:
        """Run the subprocess with jittered exponential backoff.

        ``max_retries=2`` means up to 3 total attempts (attempt 0, 1, 2).
        After all attempts are exhausted, returns an error ``DispatchResult``
        carrying the message of the last failed attempt.
        """
        argv = ["claude", "-p", prompt, "--output-format", "json"]
        last_error: str = ""

        for attempt in range(self._max_retries + 1):
            if attempt > 0:
                sleep_s = self._backoff_base * (2 ** (attempt - 1)) + random.uniform(
                    0, _JITTER_MAX
                )
                logger.debug(
                    "dispatch: retry attempt %d/%d — sleeping %.2fs",
                    attempt,
                    self._max_retries,
                    sleep_s,
                )
                time.sleep(sleep_s)

            result = self._run_once(argv, tmp)
            if result is not None:
                # Successful subprocess run — parse and return.
                return self._build_result(result, tmp, t_total_start)

            # _run_once returned None => transient failure. It recorded the
            # message in this thread's slot, so the read below cannot see a
            # message written by a concurrent dispatch on another thread.
            last_error = self._last_error

        duration_s = time.monotonic() - t_total_start
        return DispatchResult(
            raw_output="",
            triggered_skill=None,
            token_usage=None,
            duration_s=duration_s,
            error=last_error or "dispatch failed after retries",
        )

    def _run_once(
        self,
        argv: list[str],
        cwd: Path,
    ) -> subprocess.CompletedProcess[str] | None:
        """Run ``argv`` once; return ``CompletedProcess`` on success, ``None`` on failure.

        "Success" means the process ran and exited with returncode 0.
        Side-effect: sets ``self._last_error`` (per thread) on failure.
        """
        try:
            proc = subprocess.run(
                argv,
                capture_output=True,
                text=True,
                timeout=self._timeout,
                cwd=cwd,
                # Inherit the parent environment and add one marker variable.
                env={**os.environ, "MAP_INVOKED_BY": "skills-eval"},
            )
        except subprocess.TimeoutExpired as exc:
            self._last_error = f"timeout after {self._timeout}s: {exc}"
            logger.warning("dispatch: subprocess timed out: %s", exc)
            return None
        except OSError as exc:
            self._last_error = f"OSError: {exc}"
            logger.warning("dispatch: OSError running claude: %s", exc)
            return None
        except Exception as exc:  # noqa: BLE001
            self._last_error = f"unexpected subprocess error: {exc}"
            logger.warning("dispatch: unexpected subprocess error: %s", exc)
            return None

        if proc.returncode != 0:
            # Only the first 200 characters of stderr are kept in the message.
            stderr_head = (proc.stderr or "")[:200].strip()
            self._last_error = f"non-zero returncode {proc.returncode}: {stderr_head}"
            logger.warning(
                "dispatch: claude returned returncode=%d stderr=%s",
                proc.returncode,
                stderr_head,
            )
            return None

        return proc

    def _build_result(
        self,
        proc: subprocess.CompletedProcess[str],
        tmp: Path,
        t_start: float,
    ) -> DispatchResult:
        """Parse the envelope from a successful subprocess run.

        ``duration_s`` is measured before the transcript lookup, so it does
        not include the time spent in ``_derive_triggered_skill``.
        """
        stdout = proc.stdout or ""
        raw_output, token_usage, session_id = _parse_envelope(stdout)
        duration_s = time.monotonic() - t_start
        triggered_skill = _derive_triggered_skill(session_id, tmp)

        return DispatchResult(
            raw_output=raw_output,
            triggered_skill=triggered_skill,
            token_usage=token_usage,
            duration_s=duration_s,
            error=None,
        )
+ """ + + prompt: str + should_trigger: str | None + should_not_trigger: str | None + assertions: list[dict] # type: ignore[type-arg] + + def __post_init__(self) -> None: + if not isinstance(self.prompt, str): + raise ValueError( + f"EvalSetEntry.prompt must be str, got {type(self.prompt).__name__!r}" + ) + if self.should_trigger is not None and not isinstance(self.should_trigger, str): + raise ValueError( + "EvalSetEntry.should_trigger must be str or None, " + f"got {type(self.should_trigger).__name__!r}" + ) + if self.should_not_trigger is not None and not isinstance( + self.should_not_trigger, str + ): + raise ValueError( + "EvalSetEntry.should_not_trigger must be str or None, " + f"got {type(self.should_not_trigger).__name__!r}" + ) + if not isinstance(self.assertions, list): + raise ValueError( + "EvalSetEntry.assertions must be list, " + f"got {type(self.assertions).__name__!r}" + ) + + +# --------------------------------------------------------------------------- +# DispatchResult +# --------------------------------------------------------------------------- + + +@dataclass +class DispatchResult: + """Result returned by the skill dispatcher for a single prompt. + + ``token_usage`` and ``error`` are optional — dispatcher sets ``error`` + when the API call fails and ``token_usage`` may be absent on failure. + ``TokenUsage`` is imported from ``mapify_cli.token_budget``; it is NOT + redefined here (INV-6). + """ + + raw_output: str + triggered_skill: str | None + token_usage: TokenUsage | None + duration_s: float + error: str | None = None + + +# --------------------------------------------------------------------------- +# EvalResultRecord (append-only .jsonl row) +# --------------------------------------------------------------------------- + +# Sentinel used in from_dict to distinguish «key absent» from «key present but None». 
@dataclass
class EvalResultRecord:
    """One completed eval result, serialisable to/from a JSON object.

    Used for the append-only ``.jsonl`` result file written by the runner.
    ``to_dict`` / ``from_dict`` provide a stable round-trip. ``token_usage``
    is serialised as a nested dict (via ``dataclasses.asdict``) and
    reconstructed in ``from_dict``.
    """

    cell_id: str
    prompt: str
    triggered_skill: str | None
    token_usage: TokenUsage | None
    duration_s: float
    assertions_passed: list[str] = field(default_factory=list)
    assertions_failed: list[str] = field(default_factory=list)
    raw_output: str = ""

    # ------------------------------------------------------------------
    # Serialisation helpers
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable dict for this record.

        ``token_usage`` is either a nested dict or ``None``. The two
        assertion lists are copied, so mutating the returned dict does not
        affect this record.
        """
        return {
            "cell_id": self.cell_id,
            "prompt": self.prompt,
            "triggered_skill": self.triggered_skill,
            "token_usage": (
                dataclasses.asdict(self.token_usage)
                if self.token_usage is not None
                else None
            ),
            "duration_s": self.duration_s,
            "assertions_passed": list(self.assertions_passed),
            "assertions_failed": list(self.assertions_failed),
            "raw_output": self.raw_output,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "EvalResultRecord":
        """Reconstruct an ``EvalResultRecord`` from a plain dict (JSON parse).

        ``cell_id``, ``prompt`` and ``duration_s`` are required.
        ``triggered_skill``, ``token_usage``, ``assertions_passed``,
        ``assertions_failed`` and ``raw_output`` may be missing or explicitly
        ``null``; both cases fall back to the field default. Inside
        ``token_usage`` a missing or ``null`` counter is read as 0.

        Raises
        ------
        KeyError
            If ``cell_id``, ``prompt`` or ``duration_s`` is absent.
        """
        raw_tu = d.get("token_usage")
        token_usage: TokenUsage | None
        if raw_tu is None:
            # Key absent and explicit null are equivalent here.
            token_usage = None
        else:
            # ``or 0`` maps a JSON null counter to 0 instead of letting
            # ``int(None)`` raise TypeError.
            token_usage = TokenUsage(
                input_tokens=int(raw_tu.get("input_tokens", 0) or 0),
                cache_read_input_tokens=int(
                    raw_tu.get("cache_read_input_tokens", 0) or 0
                ),
                cache_creation_input_tokens=int(
                    raw_tu.get("cache_creation_input_tokens", 0) or 0
                ),
            )
        return cls(
            cell_id=d["cell_id"],
            prompt=d["prompt"],
            triggered_skill=d.get("triggered_skill"),
            token_usage=token_usage,
            duration_s=float(d["duration_s"]),
            # ``or`` covers an explicit null, which ``.get(key, default)``
            # alone would pass through unchanged.
            assertions_passed=list(d.get("assertions_passed") or []),
            assertions_failed=list(d.get("assertions_failed") or []),
            raw_output=d.get("raw_output") or "",
        )
+ +Design invariants respected: +- INV-3: no ``import anthropic``, no ANTHROPIC_API_KEY access. +- INV-7: ``triggered_skill`` is consumed from ``DispatchResult.triggered_skill`` + (the dispatcher is the SINGLE source of trigger detection). The runner + does NOT parse transcripts. +- D10: variant_id is always 1 (no variants loop). +- INV-4: each cell is flushed to disk immediately (durable per-cell append). +- VC3: resume reads existing cell_ids, skips already-written cells, appends only + missing ones to the SAME file. +- VC4: a per-cell dispatch error is recorded (not raised); matrix continues. +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Any + +from mapify_cli.skills_eval.assertions import run_assertions +from mapify_cli.skills_eval.dispatcher import VariantDispatcher +from mapify_cli.skills_eval.eval_schema import ( + DispatchResult, + EvalResultRecord, + EvalSetEntry, + make_cell_id, +) + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Intent: fixed variant_id per D10 -- never enter a variants loop. +_VARIANT_ID: int = 1 + + +# --------------------------------------------------------------------------- +# load_eval_set +# --------------------------------------------------------------------------- + + +def load_eval_set(path: Path) -> list[EvalSetEntry]: + """Parse a JSON eval-set file and return a list of ``EvalSetEntry`` rows. + + Expected JSON shape:: + + { + "entries": [ + { + "prompt": "", + "should_trigger": "", + "should_not_trigger": "", + "assertions": [ {"type": "...", ...}, ... ] + }, + ... + ] + } + + Parameters + ---------- + path: + Filesystem path to the ``.json`` eval-set file. + + Returns + ------- + list[EvalSetEntry] + Non-empty list of parsed rows. 
+ + Raises + ------ + ValueError + On: missing file, file not valid JSON, missing or empty "entries" key, + or any row that fails ``EvalSetEntry.__post_init__`` validation. + """ + if not path.exists(): + raise ValueError(f"eval-set file not found: {path}") + + try: + text = path.read_text(encoding="utf-8") + except OSError as exc: + raise ValueError(f"could not read eval-set file {path}: {exc}") from exc + + try: + data: Any = json.loads(text) + except json.JSONDecodeError as exc: + raise ValueError(f"eval-set file is not valid JSON ({path}): {exc}") from exc + + if not isinstance(data, dict): + raise ValueError( + f"eval-set file must be a JSON object (got {type(data).__name__!r}): {path}" + ) + + raw_entries: Any = data.get("entries") + if raw_entries is None: + raise ValueError(f'eval-set file missing required "entries" key: {path}') + if not isinstance(raw_entries, list): + raise ValueError( + f'"entries" must be a JSON array (got {type(raw_entries).__name__!r}): {path}' + ) + if len(raw_entries) == 0: + raise ValueError(f'"entries" list must not be empty: {path}') + + entries: list[EvalSetEntry] = [] + for row_index, raw_row in enumerate(raw_entries): + if not isinstance(raw_row, dict): + raise ValueError( + f"entries[{row_index}] must be a JSON object " + f"(got {type(raw_row).__name__!r}): {path}" + ) + prompt: Any = raw_row.get("prompt") + if prompt is None: + raise ValueError( + f'entries[{row_index}] missing required "prompt" key: {path}' + ) + should_trigger: str | None = raw_row.get("should_trigger", None) + should_not_trigger: str | None = raw_row.get("should_not_trigger", None) + raw_assertions: Any = raw_row.get("assertions", []) + if not isinstance(raw_assertions, list): + raise ValueError( + f"entries[{row_index}].assertions must be a JSON array " + f"(got {type(raw_assertions).__name__!r}): {path}" + ) + try: + entry = EvalSetEntry( + prompt=prompt, + should_trigger=should_trigger, + should_not_trigger=should_not_trigger, + 
assertions=raw_assertions, + ) + except ValueError as exc: + raise ValueError( + f"entries[{row_index}] failed validation: {exc}" + ) from exc + entries.append(entry) + + return entries + + +# --------------------------------------------------------------------------- +# _read_present_cell_ids (resume helper) +# --------------------------------------------------------------------------- + + +def _read_present_cell_ids(out_path: Path) -> set[str]: + """Return the set of ``cell_id`` values already in *out_path*. + + Skips blank lines and JSON-malformed lines defensively so a partial last + line (write interrupted mid-flush) does not crash resume. + """ + present: set[str] = set() + try: + with open(out_path, encoding="utf-8") as fh: + for raw_line in fh: + raw_line = raw_line.strip() + if not raw_line: + continue + try: + row: Any = json.loads(raw_line) + except json.JSONDecodeError: + logger.debug( + "_read_present_cell_ids: skipping malformed line in %s", out_path + ) + continue + if not isinstance(row, dict): + continue + cell_id_val = row.get("cell_id") + if isinstance(cell_id_val, str) and cell_id_val: + present.add(cell_id_val) + except OSError as exc: + logger.warning( + "_read_present_cell_ids: could not read %s: %s -- treating as empty", + out_path, + exc, + ) + return present + + +# --------------------------------------------------------------------------- +# _build_assertion_specs (per-cell helper) +# --------------------------------------------------------------------------- + + +def _build_assertion_specs(entry: EvalSetEntry) -> list[dict[str, object]]: + """Combine explicit assertions with trigger/not_trigger expectations. + + The result is the complete spec list passed to ``run_assertions``. 
+ """ + specs: list[dict[str, object]] = list(entry.assertions) + if entry.should_trigger is not None: + specs.append({"type": "trigger", "skill": entry.should_trigger}) + if entry.should_not_trigger is not None: + specs.append({"type": "not_trigger", "skill": entry.should_not_trigger}) + return specs + + +# --------------------------------------------------------------------------- +# run_eval +# --------------------------------------------------------------------------- + + +def evaluate_cell( + *, + skill: str, + entry: EvalSetEntry, + prompt_index: int, + run_number: int, + dispatcher: VariantDispatcher, +) -> EvalResultRecord: + """Dispatch one (entry, prompt_index, run_number) cell and return the record. + + Does NOT write to disk — the caller is responsible for durable persistence + (INV-4). Shared by ``run_eval`` (sequential) and ``bounded_run`` + (concurrent) so dispatch+assertion logic is defined exactly once (DRY). + + Design invariants + ----------------- + - D10: variant_id is always ``_VARIANT_ID`` (1). + - INV-7: ``triggered_skill`` is read from ``DispatchResult.triggered_skill`` + only -- the runner never parses transcripts. + - VC4: per-cell ``DispatchResult.error`` is recorded (not raised); callers + decide whether to abort or continue. + """ + cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number) + + # Dispatch -- must not raise (VariantDispatcher contract). + dispatch_result: DispatchResult = dispatcher.dispatch(entry.prompt) + + # Build assertion specs: explicit assertions + trigger expectations. + assertion_specs = _build_assertion_specs(entry) + + if dispatch_result.error is not None: + # VC4: record the error as a synthetic failed assertion; do not abort. 
+ passed_list: list[str] = [] + failed_list: list[str] = [f"dispatch_error: {dispatch_result.error}"] + logger.warning( + "evaluate_cell: cell %s dispatch error (skill=%s run=%d): %s", + cell_id, + skill, + run_number, + dispatch_result.error, + ) + else: + passed_list, failed_list = run_assertions(assertion_specs, dispatch_result) + + return EvalResultRecord( + cell_id=cell_id, + prompt=entry.prompt, + triggered_skill=dispatch_result.triggered_skill, + token_usage=dispatch_result.token_usage, + duration_s=dispatch_result.duration_s, + assertions_passed=passed_list, + assertions_failed=failed_list, + raw_output=dispatch_result.raw_output, + ) + + +def run_eval( + *, + skill: str, + entries: list[EvalSetEntry], + dispatcher: VariantDispatcher, + runs: int, + out_path: Path, + resume: bool = False, +) -> list[EvalResultRecord]: + """Execute the prompts x runs evaluation matrix and write results to *out_path*. + + Parameters + ---------- + skill: + Name of the skill under evaluation (used for logging only). + entries: + Eval-set rows from ``load_eval_set``. + dispatcher: + ``VariantDispatcher`` instance (``MockDispatcher`` in tests, + ``ClaudeSubprocessDispatcher`` in production). + runs: + Number of runs per prompt (``range(runs)``). + out_path: + Absolute path to the ``.jsonl`` output file. Created (with parent + dirs) if absent; APPENDED to if *resume* is True. + resume: + If True, read already-present ``cell_id`` values from *out_path* and + skip those cells. Missing cells are appended to the SAME file. + If False (default), *out_path* is a fresh file (caller's responsibility + to pass a new path -- the function does not truncate an existing file). + + Returns + ------- + list[EvalResultRecord] + Records written *during this call* (skipped/resumed cells are not + included -- callers that need the full result set should read out_path). + + Design invariants + ----------------- + - D10: variant_id is always ``_VARIANT_ID`` (1) -- NO variants loop. 
+ - INV-7: ``triggered_skill`` is read from ``DispatchResult.triggered_skill`` + only -- the runner never parses transcripts. + - INV-4: each record is flushed to *out_path* immediately after building. + - VC4: per-cell ``DispatchResult.error`` is recorded; matrix is never aborted. + """ + # Resolve set of already-written cells for resume mode. + present_cell_ids: set[str] = set() + if resume and out_path.exists(): + present_cell_ids = _read_present_cell_ids(out_path) + logger.info( + "run_eval: resume mode -- %d cells already present in %s", + len(present_cell_ids), + out_path, + ) + + # Ensure output directory exists before first write. + out_path.parent.mkdir(parents=True, exist_ok=True) + + written_records: list[EvalResultRecord] = [] + + # Intent: outer loop is prompts, inner loop is runs -- matrix p x r with D10 variant=1. + for prompt_index, entry in enumerate(entries): + for run_number in range(runs): + cell_id = make_cell_id(prompt_index, _VARIANT_ID, run_number) + + if cell_id in present_cell_ids: + logger.debug( + "run_eval: skipping cell %s (already present in %s)", + cell_id, + out_path, + ) + continue + + record = evaluate_cell( + skill=skill, + entry=entry, + prompt_index=prompt_index, + run_number=run_number, + dispatcher=dispatcher, + ) + + # INV-4: durable per-cell append-and-flush before advancing. + _append_record(out_path, record) + + written_records.append(record) + + logger.info( + "run_eval: finished skill=%s entries=%d runs=%d cells_written=%d out=%s", + skill, + len(entries), + runs, + len(written_records), + out_path, + ) + + return written_records + + +# --------------------------------------------------------------------------- +# _append_record (durable per-cell write) +# --------------------------------------------------------------------------- + + +def _append_record(out_path: Path, record: EvalResultRecord) -> None: + """Append *record* as a single JSON line to *out_path* and flush. 
+ + Uses the ``open(path, "a", ...)`` append precedent from + ``memory/capture.py:446``. Calls ``flush()`` after write to ensure the OS + buffer is flushed; ``os.fsync`` is intentionally omitted to avoid blocking + the matrix on every cell -- the OS buffer flush is sufficient for the + sequential use-case. + """ + line = json.dumps(record.to_dict()) + "\n" + with open(out_path, "a", encoding="utf-8") as fh: + fh.write(line) + fh.flush() + + +# --------------------------------------------------------------------------- +# Path helpers +# --------------------------------------------------------------------------- + + +def default_run_path(root: Path, skill: str, timestamp: str) -> Path: + """Return the canonical .jsonl path for a new eval run. + + Parameters + ---------- + root: + Project root (the directory that contains ``.map/``). + skill: + Skill name (used as a subdirectory component). + timestamp: + Caller-supplied timestamp string, e.g. + ``datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")``. + Kept in the runner to make ``run_eval`` clock-free (testable). + + Returns + ------- + Path + ``/.map/eval-runs//.jsonl`` + """ + return root / ".map" / "eval-runs" / skill / f"{timestamp}.jsonl" + + +def latest_run_path(root: Path, skill: str) -> Path | None: + """Return the most-recent ``.jsonl`` path for *skill*, or ``None``. + + Scans ``/.map/eval-runs//`` for ``*.jsonl`` files and returns + the lexicographically last one (ISO-timestamp filenames sort correctly). + Returns ``None`` if the directory does not exist or is empty. 
+ """ + run_dir = root / ".map" / "eval-runs" / skill + if not run_dir.is_dir(): + return None + candidates = sorted(run_dir.glob("*.jsonl")) + if not candidates: + return None + return candidates[-1] diff --git a/src/mapify_cli/templates/hooks/safety-guardrails.py b/src/mapify_cli/templates/hooks/safety-guardrails.py index 04fd888..48e671c 100755 --- a/src/mapify_cli/templates/hooks/safety-guardrails.py +++ b/src/mapify_cli/templates/hooks/safety-guardrails.py @@ -38,7 +38,12 @@ # Dangerous bash command patterns _DEFAULT_DANGEROUS_COMMANDS = [ - r"rm\s+-rf\s+/", # rm -rf / + # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc., + # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/, + # /private/tmp/, /var/folders/, /var/tmp/) — legitimate + # scratch cleanup. The negative lookahead requires a trailing slash, so the + # temp root itself (`rm -rf /tmp`) stays blocked; only children are allowed. + r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)", # rm -rf / (non-temp) r"rm\s+-rf\s+\*", # rm -rf * r"rm\s+-rf\s+\.\.", # rm -rf .. r"git\s+push.*--force.*main", diff --git a/src/mapify_cli/templates/map/scripts/map_orchestrator.py b/src/mapify_cli/templates/map/scripts/map_orchestrator.py index 03ea61c..013227f 100755 --- a/src/mapify_cli/templates/map/scripts/map_orchestrator.py +++ b/src/mapify_cli/templates/map/scripts/map_orchestrator.py @@ -2166,6 +2166,29 @@ def _is_cross_repo_path(p: str) -> bool: diff_paths = set() if diff_paths: files_not_in_diff = [p for p in declared if p not in diff_paths] + # Gitignored deliverables (e.g. .map/ workflow artifacts like spike + # docs or eval-run .jsonl) never appear in git diff/status by design — + # that is NOT Actor truncation. Drop any declared path that + # `git check-ignore` reports as ignored so it does not raise a false + # "Possible Actor truncation" warning. A gitignored file that is also + # missing from disk is still flagged separately via missing_files. 
+ if files_not_in_diff: + try: + igproc = _sp.run( + ["git", "check-ignore", "--", *files_not_in_diff], + cwd=project_dir, capture_output=True, text=True, timeout=5, + ) + ignored = { + line.strip() + for line in igproc.stdout.splitlines() + if line.strip() + } + if ignored: + files_not_in_diff = [ + p for p in files_not_in_diff if p not in ignored + ] + except (OSError, _sp.TimeoutExpired): + pass state.record_subtask_result( subtask_id, diff --git a/src/mapify_cli/templates/skills/map-efficient/SKILL.md b/src/mapify_cli/templates/skills/map-efficient/SKILL.md index b986b52..2045905 100644 --- a/src/mapify_cli/templates/skills/map-efficient/SKILL.md +++ b/src/mapify_cli/templates/skills/map-efficient/SKILL.md @@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH" Snapshots pre-existing failures so later subtasks distinguish "introduced regression" from "was broken pre-plan". Auto-detects -Make/pytest/go test/cargo. Overrides + narrow-target guidance: -[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline). +Make/pytest/go test/cargo. It captures the test run internally and prints a +single compact JSON report at the end — read that JSON directly; do NOT pipe it +through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target +guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline). ### Wave Computation (after INIT_STATE) - REQUIRED diff --git a/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md b/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md index 802d11c..6734cfc 100644 --- a/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md +++ b/src/mapify_cli/templates/skills/map-efficient/efficient-reference.md @@ -203,6 +203,11 @@ fix or defer. 
python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH" ``` +It captures the test run internally and prints a single compact JSON report at +the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the +repo bash guidelines); the output is one small object, not a stream, so +truncating it only hides fields. + Auto-detects from project markers: - `Makefile` with `test:` target → `make test` - `pyproject.toml` / `pytest.ini` → `pytest` diff --git a/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md new file mode 100644 index 0000000..567ac04 --- /dev/null +++ b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md @@ -0,0 +1,94 @@ +--- +name: map-skill-eval +description: | + Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient. +effort: medium +disable-model-invocation: true +argument-hint: "[skill] [--eval-set PATH]" +--- +# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation + +Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill. + +Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`. + +## Invocation + +```bash +mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N] +``` + +- `<skill>` — the skill name to evaluate (e.g. `map-plan`). +- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions. +- `--dry-run` — validate the eval-set and print the planned run count without spending any quota. +- `--resume` — continue an interrupted run from the last durable checkpoint. +- `--max-concurrency N` — max parallel `claude -p` workers (default: 1). + +## What It Does + +1. 
**Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases. +2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger). +3. **Deterministic assertions** — each eval case may specify one or more assertion types: + - `contains` / `not_contains` — substring presence in the response. + - `regex` — pattern match against the response. + - `valid_json` — response parses as JSON. + - `trigger` / `not_trigger` — skill fired / did not fire. +4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`. +5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus per-case token usage, duration, and cache-hit stats. + +## Eval-Set Format + +A JSON object with an `entries` array. Each entry has a `prompt`, optional +`should_trigger` / `should_not_trigger` skill names (the runner turns these into +`trigger` / `not_trigger` assertions), and an optional `assertions` array. +Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`, +`not_trigger`. + +```json +{ + "entries": [ + { + "prompt": "Decompose this feature into subtasks", + "should_trigger": "map-plan", + "assertions": [ + { "type": "contains", "value": "subtask" } + ] + }, + { + "prompt": "Run quality gates", + "should_not_trigger": "map-plan", + "assertions": [] + } + ] +} +``` + +## --dry-run + +`--dry-run` validates the eval-set schema and prints the planned case count with estimated quota usage. No `claude -p` calls are made; no `.jsonl` is written. 
+ +## Examples + +```bash +# Validate eval-set without spending quota +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run + +# Run full eval with up to 8 parallel workers +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8 + +# Resume an interrupted run +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume +``` + +## Troubleshooting + +- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill. +- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with a non-empty `entries` array, that each entry is an object with a string `prompt`, that `should_trigger` / `should_not_trigger` (when present) are strings, and that `assertions` (when present) is an array. +- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl`. If no prior run exists, omit `--resume` to start fresh. +- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd. + +## Related Commands + +- `/map-plan` — plan and decompose tasks. +- `/map-efficient` — full MAP workflow execution. +- `/map-check` — run quality gates and verify MAP workflow completion. 
diff --git a/src/mapify_cli/templates/skills/skill-rules.json b/src/mapify_cli/templates/skills/skill-rules.json index bbe32ab..d5a9606 100644 --- a/src/mapify_cli/templates/skills/skill-rules.json +++ b/src/mapify_cli/templates/skills/skill-rules.json @@ -239,6 +239,18 @@ ] } }, + "map-skill-eval": { + "type": "manual", + "skillClass": "task", + "enforcement": "manual", + "priority": "medium", + "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", + "requires-cmd": ["claude"], + "promptTriggers": { + "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"], + "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"] + } + }, "map-task": { "type": "manual", "skillClass": "task", diff --git a/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja b/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja index 04fd888..48e671c 100755 --- a/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja +++ b/src/mapify_cli/templates_src/hooks/safety-guardrails.py.jinja @@ -38,7 +38,12 @@ _DEFAULT_DANGEROUS_FILE_PATTERNS = [ # Dangerous bash command patterns _DEFAULT_DANGEROUS_COMMANDS = [ - r"rm\s+-rf\s+/", # rm -rf / + # Block `rm -rf /` (bare root), `rm -rf /etc`, `rm -rf /home/user`, etc., + # but ALLOW deletion of subpaths UNDER a temp root (rm -rf /tmp/, + # /private/tmp/, /var/folders/, /var/tmp/) — legitimate + # scratch cleanup. The negative lookahead requires a trailing slash, so the + # temp root itself (`rm -rf /tmp`) stays blocked; only children are allowed. + r"rm\s+-rf\s+/(?!(?:tmp|private/tmp|var/folders|var/tmp)/)", # rm -rf / (non-temp) r"rm\s+-rf\s+\*", # rm -rf * r"rm\s+-rf\s+\.\.", # rm -rf .. 
r"git\s+push.*--force.*main", diff --git a/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja b/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja index 03ea61c..013227f 100755 --- a/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja +++ b/src/mapify_cli/templates_src/map/scripts/map_orchestrator.py.jinja @@ -2166,6 +2166,29 @@ def record_subtask_result( diff_paths = set() if diff_paths: files_not_in_diff = [p for p in declared if p not in diff_paths] + # Gitignored deliverables (e.g. .map/ workflow artifacts like spike + # docs or eval-run .jsonl) never appear in git diff/status by design — + # that is NOT Actor truncation. Drop any declared path that + # `git check-ignore` reports as ignored so it does not raise a false + # "Possible Actor truncation" warning. A gitignored file that is also + # missing from disk is still flagged separately via missing_files. + if files_not_in_diff: + try: + igproc = _sp.run( + ["git", "check-ignore", "--", *files_not_in_diff], + cwd=project_dir, capture_output=True, text=True, timeout=5, + ) + ignored = { + line.strip() + for line in igproc.stdout.splitlines() + if line.strip() + } + if ignored: + files_not_in_diff = [ + p for p in files_not_in_diff if p not in ignored + ] + except (OSError, _sp.TimeoutExpired): + pass state.record_subtask_result( subtask_id, diff --git a/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja index b986b52..2045905 100644 --- a/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja +++ b/src/mapify_cli/templates_src/skills/map-efficient/SKILL.md.jinja @@ -191,8 +191,10 @@ python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH" Snapshots pre-existing failures so later subtasks distinguish "introduced regression" from "was broken pre-plan". Auto-detects -Make/pytest/go test/cargo. 
Overrides + narrow-target guidance: -[efficient-reference.md](efficient-reference.md#pre-flight-test-baseline). +Make/pytest/go test/cargo. It captures the test run internally and prints a +single compact JSON report at the end — read that JSON directly; do NOT pipe it +through `head`/`tail` (per the repo bash guidelines). Overrides + narrow-target +guidance: [efficient-reference.md](efficient-reference.md#pre-flight-test-baseline). ### Wave Computation (after INIT_STATE) - REQUIRED diff --git a/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja b/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja index 802d11c..6734cfc 100644 --- a/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja +++ b/src/mapify_cli/templates_src/skills/map-efficient/efficient-reference.md.jinja @@ -203,6 +203,11 @@ fix or defer. python3 .map/scripts/map_step_runner.py record_test_baseline "$BRANCH" ``` +It captures the test run internally and prints a single compact JSON report at +the end — read that JSON directly. Do NOT pipe it through `head`/`tail` (per the +repo bash guidelines); the output is one small object, not a stream, so +truncating it only hides fields. + Auto-detects from project markers: - `Makefile` with `test:` target → `make test` - `pyproject.toml` / `pytest.ini` → `pytest` diff --git a/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja new file mode 100644 index 0000000..567ac04 --- /dev/null +++ b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja @@ -0,0 +1,94 @@ +--- +name: map-skill-eval +description: | + Evaluate a /map-* skill's trigger accuracy and cost. Use when asked to measure skill trigger accuracy, run an eval-set, or check token/duration cost via `mapify skill-eval`. Do NOT use to plan or implement; use map-plan or map-efficient. 
+effort: medium +disable-model-invocation: true +argument-hint: "[skill] [--eval-set PATH]" +--- +# /map-skill-eval — Skill Trigger Accuracy & Cost Evaluation + +Purpose: measure whether a `/map-*` skill fires on the right prompts and what it costs in tokens and time. Do not plan or implement from this skill. + +Requires the `claude` CLI (installed and on `$PATH`). The skill is skipped at install time on hosts without `claude`. + +## Invocation + +```bash +mapify skill-eval run <skill> --eval-set PATH [--dry-run] [--resume] [--max-concurrency N] +``` + +- `<skill>` — the skill name to evaluate (e.g. `map-plan`). +- `--eval-set PATH` — path to a JSON eval-set file defining prompt cases and expected assertions. +- `--dry-run` — validate the eval-set and print the planned run count without spending any quota. +- `--resume` — continue an interrupted run from the last durable checkpoint. +- `--max-concurrency N` — max parallel `claude -p` workers (default: 1). + +## What It Does + +1. **Prompts × runs matrix** — for each case in the eval-set, invokes `claude -p` in an isolated temporary working directory seeded with `.claude/` (skills, settings). Runs are independent; no shared state leaks between cases. +2. **Transcript-parse trigger detection** — parses each `claude -p` transcript to determine whether the target skill fired (trigger) or did not fire (not_trigger). +3. **Deterministic assertions** — each eval case may specify one or more assertion types: + - `contains` / `not_contains` — substring presence in the response. + - `regex` — pattern match against the response. + - `valid_json` — response parses as JSON. + - `trigger` / `not_trigger` — skill fired / did not fire. +4. **Durable resumable run log** — results are appended to `.map/eval-runs/<skill>/<timestamp>.jsonl` as each case completes, so a partial run is recoverable via `--resume`. +5. **Summary report** — after all cases complete, prints pass-rate (passed/total) plus per-case token usage, duration, and cache-hit stats. 
+ +## Eval-Set Format + +A JSON object with an `entries` array. Each entry has a `prompt`, optional +`should_trigger` / `should_not_trigger` skill names (the runner turns these into +`trigger` / `not_trigger` assertions), and an optional `assertions` array. +Assertion types: `contains`, `not_contains`, `regex`, `valid_json`, `trigger`, +`not_trigger`. + +```json +{ + "entries": [ + { + "prompt": "Decompose this feature into subtasks", + "should_trigger": "map-plan", + "assertions": [ + { "type": "contains", "value": "subtask" } + ] + }, + { + "prompt": "Run quality gates", + "should_not_trigger": "map-plan", + "assertions": [] + } + ] +} +``` + +## --dry-run + +`--dry-run` validates the eval-set schema and prints the planned case count with estimated quota usage. No `claude -p` calls are made; no `.jsonl` is written. + +## Examples + +```bash +# Validate eval-set without spending quota +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --dry-run + +# Run full eval with up to 8 parallel workers +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --max-concurrency 8 + +# Resume an interrupted run +mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume +``` + +## Troubleshooting + +- **`claude` not found** — `map-skill-eval` requires the `claude` CLI on `$PATH`. Install it and re-run `mapify init` to activate the skill. +- **Eval-set validation error on `--dry-run`** — check that the file is a JSON object with a non-empty `entries` array, that each entry is an object with a string `prompt`, that `should_trigger` / `should_not_trigger` (when present) are strings, and that `assertions` (when present) is an array. +- **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs/<skill>/<timestamp>.jsonl`. If no prior run exists, omit `--resume` to start fresh. +- **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd. + +## Related Commands + +- `/map-plan` — plan and decompose tasks. 
+- `/map-efficient` — full MAP workflow execution. +- `/map-check` — run quality gates and verify MAP workflow completion. diff --git a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja index bbe32ab..d5a9606 100644 --- a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja +++ b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja @@ -239,6 +239,18 @@ ] } }, + "map-skill-eval": { + "type": "manual", + "skillClass": "task", + "enforcement": "manual", + "priority": "medium", + "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", + "requires-cmd": ["claude"], + "promptTriggers": { + "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"], + "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"] + } + }, "map-task": { "type": "manual", "skillClass": "task", diff --git a/tests/hooks/test_safety_guardrails.py b/tests/hooks/test_safety_guardrails.py index 9fd68ac..5dcfccb 100644 --- a/tests/hooks/test_safety_guardrails.py +++ b/tests/hooks/test_safety_guardrails.py @@ -223,6 +223,10 @@ class TestRmRfBlocking: [ "rm -rf /", "rm -rf /home/user", + "rm -rf /etc", + "rm -rf /var", + "rm -rf /tmp", # the temp ROOT itself stays blocked (no trailing /child) + "rm -rf /*", "rm -rf *", "rm -rf ..", ], @@ -232,6 +236,25 @@ def test_rm_rf_blocked(self, command): assert exit_code == 0 _assert_denied(_parse_stdout(stdout)) + @pytest.mark.parametrize( + "command", + [ + "rm -rf /tmp/map-spike-abc123", + "rm -rf /tmp/pytest-of-user/run0", + "rm -rf /private/tmp/map-spike-WOi8Pq", # macOS mktemp + "rm -rf /var/folders/ab/cd1234/T/scratch", # macOS $TMPDIR + "rm -rf /var/tmp/build-cache", + ], + ) + def test_rm_rf_temp_subpath_allowed(self, command): + """Deleting a subpath UNDER a 
temp root is legitimate scratch cleanup + and must not be blocked (regression: the bare ``rm -rf /`` pattern used + to flag every absolute path, including temp dirs and any command that + merely mentioned one).""" + exit_code, stdout, _ = run_hook_bash(command) + assert exit_code == 0 + assert _parse_stdout(stdout) == {} + def test_rm_single_file_allowed(self): exit_code, stdout, _ = run_hook_bash("rm file.txt") assert exit_code == 0 diff --git a/tests/skills_eval/fixtures/map_debug_eval_set.json b/tests/skills_eval/fixtures/map_debug_eval_set.json new file mode 100644 index 0000000..d9a6a56 --- /dev/null +++ b/tests/skills_eval/fixtures/map_debug_eval_set.json @@ -0,0 +1,23 @@ +{ + "entries": [ + { + "prompt": "I need help debugging a failing test in my Python project.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "debug"} + ] + }, + { + "prompt": "Please add the numbers 2 and 3 together.", + "should_not_trigger": "map-debug", + "assertions": [] + }, + { + "prompt": "My application crashes with a stack overflow. Help me diagnose it.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "crash"} + ] + } + ] +} diff --git a/tests/test_map_orchestrator.py b/tests/test_map_orchestrator.py index 9633a63..e213176 100644 --- a/tests/test_map_orchestrator.py +++ b/tests/test_map_orchestrator.py @@ -2179,6 +2179,74 @@ def test_explicit_commit_sha_wins(self, branch_dir, tmp_path, monkeypatch): assert reloaded.last_subtask_commit_sha == "cafebabe" +class TestRecordSubtaskResultGitignoredArtifact: + """record_subtask_result must NOT raise a 'Possible Actor truncation' + warning for declared files that are gitignored-but-present on disk (e.g. + .map/ workflow artifacts like spike docs). 
They never appear in git + diff/status by design — that is intentional, not truncation.""" + + def _init_git_repo(self, tmp_path): + import subprocess as _sp + _sp.run(["git", "init"], cwd=tmp_path, capture_output=True) + _sp.run(["git", "config", "user.email", "t@t.com"], cwd=tmp_path, capture_output=True) + _sp.run(["git", "config", "user.name", "t"], cwd=tmp_path, capture_output=True) + (tmp_path / ".gitignore").write_text(".map/\n") + (tmp_path / "seed.txt").write_text("seed") + (tmp_path / "tracked.py").write_text("x = 1\n") + _sp.run(["git", "add", "."], cwd=tmp_path, capture_output=True) + _sp.run(["git", "commit", "-m", "init"], cwd=tmp_path, capture_output=True) + # Second (non-root) commit so HEAD has a parent and `git diff-tree` + # yields a NON-empty diff_paths. Without this, a root commit produces an + # empty diff and files_not_in_diff is never computed — the gitignore + # test would then pass vacuously without exercising the filter. + (tmp_path / "seed.txt").write_text("seed v2") + _sp.run(["git", "add", "."], cwd=tmp_path, capture_output=True) + _sp.run(["git", "commit", "-m", "second"], cwd=tmp_path, capture_output=True) + + def test_gitignored_artifact_not_flagged(self, branch_dir, tmp_path, monkeypatch): + state = map_orchestrator.StepState() + state.subtask_sequence = ["ST-001"] + state.current_subtask_id = "ST-001" + state_file = tmp_path / ".map" / branch_dir / "step_state.json" + state.save(state_file) + self._init_git_repo(tmp_path) + # A real deliverable that exists on disk but is gitignored (.map/**). 
+ artifact = tmp_path / ".map" / branch_dir / "spike_st001.md" + artifact.write_text("spike verdict", encoding="utf-8") + monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(tmp_path)) + result = map_orchestrator.record_subtask_result( + "ST-001", branch_dir, + files_changed=[f".map/{branch_dir}/spike_st001.md"], + status="valid", summary="spike", commit_sha=None, + ) + assert result["status"] == "success" + # No false truncation warning, no files_not_in_diff for the gitignored file. + assert "files_not_in_diff" not in result, result + assert "Possible Actor truncation" not in result.get("warning", ""), result + + def test_non_gitignored_unchanged_tracked_file_still_flagged( + self, branch_dir, tmp_path, monkeypatch + ): + """Negative control (proves the filter is SPECIFIC): a tracked file that + exists, is NOT gitignored, and was not touched by this subtask's diff + still surfaces in files_not_in_diff — the gitignore filter must not be a + blanket suppression.""" + state = map_orchestrator.StepState() + state.subtask_sequence = ["ST-001"] + state.current_subtask_id = "ST-001" + state_file = tmp_path / ".map" / branch_dir / "step_state.json" + state.save(state_file) + self._init_git_repo(tmp_path) # tracked.py committed, unchanged in HEAD + monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(tmp_path)) + result = map_orchestrator.record_subtask_result( + "ST-001", branch_dir, + files_changed=["tracked.py"], + status="valid", summary="x", commit_sha=None, + ) + assert result["status"] == "success" + assert result.get("files_not_in_diff") == ["tracked.py"], result + + class TestValidateStepTransactionalMonitor: """validate_step('2.4') now implicitly closes pending 2.3 (ACTOR) so callers don't get 'Step mismatch: expected 2.3' when they jump straight diff --git a/tests/test_skills_consistency.py b/tests/test_skills_consistency.py index 81eed55..8723ad6 100644 --- a/tests/test_skills_consistency.py +++ b/tests/test_skills_consistency.py @@ -477,9 +477,9 @@ def 
detect_skill_deps(skill_dir: Path) -> dict[str, set[str]]: def test_skill_discovery_non_empty(skill_names: list[str]) -> None: - """Guard: skill-rules.json must list exactly 15 skills (prevents vacuous pass).""" - assert len(skill_names) == 15, ( - f"Expected 15 skills in skill-rules.json, found {len(skill_names)}: " + """Guard: skill-rules.json must list exactly 16 skills (prevents vacuous pass).""" + assert len(skill_names) == 16, ( + f"Expected 16 skills in skill-rules.json, found {len(skill_names)}: " f"{sorted(skill_names)}" ) diff --git a/tests/test_skills_eval_aggregator.py b/tests/test_skills_eval_aggregator.py new file mode 100644 index 0000000..6fd48a7 --- /dev/null +++ b/tests/test_skills_eval_aggregator.py @@ -0,0 +1,326 @@ +"""Tests for skills_eval aggregator (ST-006). + +Covers aggregate() and bounded_run() using MockDispatcher only -- zero real +claude subprocess (INV-2/INV-3). Tests map 1:1 to validation criteria: + VC1 -- pass_rate fraction + VC2 -- token mean/stddev, n<2 no raise + VC3 -- bounded_run serialised writes: every .jsonl line parses, no corruption + VC4 -- all-null token_usage -> token stats None, pass_rate + duration still valid + SC-1 -- max_concurrency=3 matrix -> complete unique cell set; resume -> no dupes +""" + +from __future__ import annotations + +import json +import math +from pathlib import Path + +from mapify_cli.skills_eval.aggregator import aggregate, bounded_run +from mapify_cli.skills_eval.dispatcher import MockDispatcher +from mapify_cli.skills_eval.eval_schema import ( + EvalResultRecord, + EvalSetEntry, + make_cell_id, +) +from mapify_cli.token_budget import TokenUsage + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _entries(n: int = 2) -> list[EvalSetEntry]: + return [ + EvalSetEntry( + prompt=f"p{i}", + should_trigger=None, + should_not_trigger=None, + assertions=[], + ) + for i in 
range(n) + ] + + +def _read_all_records(path: Path) -> list[EvalResultRecord]: + """Parse every non-blank line in the .jsonl; raise on malformed.""" + records = [] + for line in path.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + records.append(EvalResultRecord.from_dict(json.loads(line))) + return records + + +def _make_record( + cell_id: str, + *, + assertions_failed: list[str] | None = None, + token_usage: TokenUsage | None = None, + duration_s: float = 1.0, +) -> EvalResultRecord: + return EvalResultRecord( + cell_id=cell_id, + prompt="test", + triggered_skill=None, + token_usage=token_usage, + duration_s=duration_s, + assertions_passed=[], + assertions_failed=assertions_failed or [], + ) + + +# --------------------------------------------------------------------------- +# aggregate() -- AggregateSummary correctness +# --------------------------------------------------------------------------- + + +def test_vc1_pass_rate_fraction() -> None: + """VC1: pass_rate = passed_cells / total_cells.""" + records = [ + _make_record("p0-v1-r0"), # passed (empty assertions_failed) + _make_record("p1-v1-r0", assertions_failed=["x"]), # failed + _make_record("p2-v1-r0"), # passed + _make_record("p3-v1-r0", assertions_failed=["y", "z"]), # failed + ] + summary = aggregate(records) + assert summary.total_cells == 4 + assert summary.passed_cells == 2 + assert math.isclose(summary.pass_rate, 0.5) + + +def test_vc1_all_passed() -> None: + records = [_make_record(f"p{i}-v1-r0") for i in range(3)] + summary = aggregate(records) + assert summary.passed_cells == 3 + assert math.isclose(summary.pass_rate, 1.0) + + +def test_vc1_all_failed() -> None: + records = [_make_record(f"p{i}-v1-r0", assertions_failed=["f"]) for i in range(3)] + summary = aggregate(records) + assert summary.passed_cells == 0 + assert math.isclose(summary.pass_rate, 0.0) + + +def test_vc1_empty_list_no_raise() -> None: + """VC4/VC1: empty list must not raise; pass_rate = 0.0.""" + 
summary = aggregate([]) + assert summary.total_cells == 0 + assert summary.passed_cells == 0 + assert math.isclose(summary.pass_rate, 0.0) + assert summary.tokens_mean is None + assert summary.tokens_stddev is None + assert summary.duration_mean is None + assert summary.duration_stddev is None + + +def test_vc2_token_mean_and_stddev() -> None: + """VC2: tokens_mean and tokens_stddev correct over non-null token_usage.""" + tu_a = TokenUsage(input_tokens=100, cache_read_input_tokens=0) + tu_b = TokenUsage(input_tokens=200, cache_read_input_tokens=0) + tu_c = TokenUsage(input_tokens=300, cache_read_input_tokens=0) + records = [ + _make_record("p0-v1-r0", token_usage=tu_a, duration_s=1.0), + _make_record("p1-v1-r0", token_usage=tu_b, duration_s=2.0), + _make_record("p2-v1-r0", token_usage=tu_c, duration_s=3.0), + ] + summary = aggregate(records) + assert summary.token_sample_size == 3 + assert math.isclose(summary.tokens_mean or 0.0, 200.0) + # sample stdev of [100, 200, 300] + import statistics + expected_stdev = statistics.stdev([100.0, 200.0, 300.0]) + assert math.isclose(summary.tokens_stddev or 0.0, expected_stdev) + + +def test_vc2_token_n_eq_1_no_raise() -> None: + """VC2: n<2 must not raise; stddev is 0.0.""" + tu = TokenUsage(input_tokens=50, cache_read_input_tokens=10) + records = [_make_record("p0-v1-r0", token_usage=tu, duration_s=1.0)] + summary = aggregate(records) + assert summary.token_sample_size == 1 + assert math.isclose(summary.tokens_mean or 0.0, 60.0) # 50+10 + assert summary.tokens_stddev is not None and math.isclose(summary.tokens_stddev, 0.0) + + +def test_vc4_all_null_token_usage() -> None: + """VC4: all-null token_usage -> token stats None; pass_rate + duration valid.""" + records = [ + _make_record("p0-v1-r0", token_usage=None, duration_s=1.0), + _make_record("p1-v1-r0", token_usage=None, duration_s=3.0), + ] + summary = aggregate(records) + # Token stats absent. 
+ assert summary.token_sample_size == 0 + assert summary.tokens_mean is None + assert summary.tokens_stddev is None + # Pass_rate still valid. + assert math.isclose(summary.pass_rate, 1.0) # no assertions_failed in either + # Duration stats still valid. + assert summary.duration_mean is not None + assert math.isclose(summary.duration_mean, 2.0) + + +def test_duration_mean_and_stddev() -> None: + """duration_mean / duration_stddev correct when total_cells >= 2.""" + records = [ + _make_record("p0-v1-r0", duration_s=1.0), + _make_record("p1-v1-r0", duration_s=3.0), + ] + summary = aggregate(records) + assert math.isclose(summary.duration_mean or 0.0, 2.0) + import statistics + assert math.isclose(summary.duration_stddev or 0.0, statistics.stdev([1.0, 3.0])) + + +def test_duration_stddev_zero_when_single_record() -> None: + """duration_stddev is 0.0 for a single record (n<2 guard).""" + records = [_make_record("p0-v1-r0", duration_s=5.0)] + summary = aggregate(records) + assert math.isclose(summary.duration_mean or 0.0, 5.0) + assert summary.duration_stddev is not None and math.isclose(summary.duration_stddev, 0.0) + + +def test_aggregate_summary_to_dict() -> None: + """AggregateSummary.to_dict() returns a JSON-serialisable dict.""" + summary = aggregate([]) + d = summary.to_dict() + assert isinstance(d, dict) + # Verify round-trip via json.dumps (raises TypeError on non-serialisable). 
+ json.dumps(d) + assert "pass_rate" in d + assert "total_cells" in d + + +# --------------------------------------------------------------------------- +# bounded_run() -- SC-1 / VC3 concurrent dispatch +# --------------------------------------------------------------------------- + + +def test_sc1_max_concurrency_3_complete_unique_cell_set(tmp_path: Path) -> None: + """SC-1: max_concurrency=3 over a matrix -> complete + unique cell set.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.01) + + entries = _entries(3) + records = bounded_run( + skill="map-x", + entries=entries, + dispatcher=disp, + runs=4, + out_path=out, + max_concurrency=3, + ) + + # 3 entries x 4 runs = 12 cells total. + expected_ids = {make_cell_id(i, 1, r) for i in range(3) for r in range(4)} + returned_ids = {r.cell_id for r in records} + assert returned_ids == expected_ids + + # Verify .jsonl: every line must parse and cell_id set must match. + file_records = _read_all_records(out) + file_ids = {r.cell_id for r in file_records} + assert file_ids == expected_ids + assert len(file_records) == 12 # no duplicates + + +def test_vc3_jsonl_not_corrupted_concurrent(tmp_path: Path) -> None: + """VC3: concurrent writes produce valid .jsonl -- every line parses.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill=None, raw_output="x" * 200, duration_s=0.01) + + bounded_run( + skill="map-x", + entries=_entries(4), + dispatcher=disp, + runs=5, + out_path=out, + max_concurrency=4, + ) + + raw_lines = [ + ln for ln in out.read_text(encoding="utf-8").splitlines() if ln.strip() + ] + assert len(raw_lines) == 20 # 4*5 + for line in raw_lines: + # Must parse without exception. 
+ obj = json.loads(line) + assert "cell_id" in obj + + +def test_sc1_resume_after_partial_no_dupes(tmp_path: Path) -> None: + """SC-1: resume after partial run -> no duplicate cell_ids in output.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.01) + entries = _entries(2) + + # First pass: complete the first entry only (2 cells out of 4). + first_pass = bounded_run( + skill="map-x", + entries=entries, + dispatcher=disp, + runs=2, + out_path=out, + max_concurrency=1, + ) + assert len(first_pass) == 4 # 2 entries * 2 runs + + # Simulate partial completion: keep only first 2 lines. + lines = out.read_text(encoding="utf-8").splitlines() + out.write_text("\n".join(lines[:2]) + "\n", encoding="utf-8") + assert len([ln for ln in out.read_text().splitlines() if ln.strip()]) == 2 + + # Resume: only missing 2 cells should be added. + second_pass = bounded_run( + skill="map-x", + entries=entries, + dispatcher=disp, + runs=2, + out_path=out, + resume=True, + max_concurrency=2, + ) + assert len(second_pass) == 2 # only the 2 missing cells + + # Final file: 4 unique cell_ids, no duplicates. 
+ file_records = _read_all_records(out) + all_ids = [r.cell_id for r in file_records] + assert len(all_ids) == 4 + assert len(set(all_ids)) == 4 # no duplicates + + +def test_bounded_run_default_concurrency_1_sequential(tmp_path: Path) -> None: + """Default max_concurrency=1 produces a correct sequential result.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.0) + + records = bounded_run( + skill="map-x", + entries=_entries(2), + dispatcher=disp, + runs=3, + out_path=out, + ) + assert len(records) == 6 + file_records = _read_all_records(out) + assert len(file_records) == 6 + assert len({r.cell_id for r in file_records}) == 6 + + +def test_bounded_run_empty_entries(tmp_path: Path) -> None: + """bounded_run on empty entries list returns [] and creates no file.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill=None, raw_output="ok", duration_s=0.0) + + records = bounded_run( + skill="map-x", + entries=[], + dispatcher=disp, + runs=5, + out_path=out, + ) + assert records == [] + # No file should exist since parent was just mkdir'd and no records were written. + # (out_path.parent exists but out_path itself was never opened for append.) + assert not out.exists() diff --git a/tests/test_skills_eval_runner.py b/tests/test_skills_eval_runner.py new file mode 100644 index 0000000..522a2e0 --- /dev/null +++ b/tests/test_skills_eval_runner.py @@ -0,0 +1,711 @@ +"""Tests for the skills_eval runner (ST-005). + +One test per ST-005 validation criterion, driven entirely by ``MockDispatcher`` +so NO real ``claude -p`` subprocess runs (INV-2). Covers the prompts x runs +matrix (D10 variants=1), durable per-cell ``.jsonl`` writes (INV-4), resume by +cell_id with no duplicates, and per-cell error tolerance (VC4). 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +import mapify_cli.skills_eval.dispatcher as _disp_mod +from mapify_cli.skills_eval.aggregator import aggregate +from mapify_cli.skills_eval.assertions import run_assertion +from mapify_cli.skills_eval.dispatcher import ( + ClaudeSubprocessDispatcher, + MockDispatcher, + VariantDispatcher, +) +from mapify_cli.skills_eval.eval_schema import ( + DispatchResult, + EvalResultRecord, + EvalSetEntry, + make_cell_id, +) +from mapify_cli.skills_eval.runner import load_eval_set, run_eval +from mapify_cli.token_budget import TokenUsage + + +def _entries() -> list[EvalSetEntry]: + return [ + EvalSetEntry( + prompt="p0", should_trigger="map-x", should_not_trigger=None, assertions=[] + ), + EvalSetEntry( + prompt="p1", should_trigger=None, should_not_trigger="map-x", assertions=[] + ), + ] + + +def _read_cell_ids(path: Path) -> list[str]: + """Collect cell_ids, skipping blank/malformed lines (mirrors the runner).""" + ids: list[str] = [] + for line in path.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + ids.append(json.loads(line)["cell_id"]) + except (json.JSONDecodeError, KeyError): + continue + return ids + + +def test_vc1_matrix_prompts_times_runs_no_variants_loop(tmp_path: Path) -> None: + """VC1: iterate prompts x runs with variant_id fixed at 1 (no variants loop).""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1) + + records = run_eval( + skill="map-x", + entries=_entries(), + dispatcher=disp, + runs=3, + out_path=out, + resume=False, + ) + + # 2 prompts x 3 runs x 1 variant = 6 cells. + assert len(records) == 6 + cell_ids = _read_cell_ids(out) + expected = {make_cell_id(i, 1, r) for i in range(2) for r in range(3)} + assert set(cell_ids) == expected + # Every cell_id carries the fixed variant token "-v1-". 
+ assert all("-v1-" in cid for cid in cell_ids) + assert len(cell_ids) == len(set(cell_ids)) == 6 + + +def test_vc2_durable_jsonl_written_per_cell(tmp_path: Path) -> None: + """VC2: each completed cell is appended to the .jsonl as a parseable record.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher( + triggered_skill="map-x", + raw_output="hello", + token_usage=TokenUsage(input_tokens=11, cache_read_input_tokens=2), + duration_s=0.5, + ) + + records = run_eval( + skill="map-x", + entries=_entries(), + dispatcher=disp, + runs=2, + out_path=out, + resume=False, + ) + + lines = out.read_text(encoding="utf-8").splitlines() + assert len(lines) == len(records) == 4 + # Each line round-trips through the schema and matches a returned record. + by_cell = {r.cell_id: r for r in records} + for line in lines: + rec = EvalResultRecord.from_dict(json.loads(line)) + assert rec.cell_id in by_cell + assert rec == by_cell[rec.cell_id] + assert rec.prompt in {"p0", "p1"} + assert rec.token_usage is not None and rec.token_usage.input_tokens == 11 + + +def test_vc3_resume_skips_present_cell_ids(tmp_path: Path) -> None: + """VC3: --resume skips present cell_ids; killed-then-resumed = complete, no dupes.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1) + + run_eval( + skill="map-x", + entries=_entries(), + dispatcher=disp, + runs=2, + out_path=out, + resume=False, + ) + full = out.read_text(encoding="utf-8").splitlines() + assert len(full) == 4 + + # Simulate a kill mid-run: drop the last two completed cells. + out.write_text("\n".join(full[:2]) + "\n", encoding="utf-8") + assert len(_read_cell_ids(out)) == 2 + + # Resume: only the two missing cells should be appended. 
+ appended = run_eval( + skill="map-x", + entries=_entries(), + dispatcher=disp, + runs=2, + out_path=out, + resume=True, + ) + assert len(appended) == 2 # only missing cells written this call + + final = _read_cell_ids(out) + assert len(final) == 4 + assert len(set(final)) == 4 # no duplicates + + +def test_vc3_resume_tolerates_malformed_trailing_line(tmp_path: Path) -> None: + """VC3 robustness: a partial/blank trailing line must not crash resume.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill="map-x", raw_output="ok", duration_s=0.1) + run_eval(skill="map-x", entries=_entries(), dispatcher=disp, runs=1, out_path=out) + # Append a truncated JSON line (as if killed mid-write). + with open(out, "a", encoding="utf-8") as fh: + fh.write('{"cell_id": "p9-v1-r0", "promp') # truncated, no newline + # Resume must not raise and must still complete the real matrix. + run_eval( + skill="map-x", + entries=_entries(), + dispatcher=disp, + runs=1, + out_path=out, + resume=True, + ) + valid_ids = _read_cell_ids(out) # skips the malformed line + assert set(valid_ids) == {make_cell_id(0, 1, 0), make_cell_id(1, 1, 0)} + + +def test_vc4_transient_cell_error_recorded_not_fatal(tmp_path: Path) -> None: + """VC4: a per-cell dispatch error is recorded and does NOT abort the matrix.""" + out = tmp_path / "run.jsonl" + disp = MockDispatcher(triggered_skill=None, error="simulated timeout") + + records = run_eval( + skill="map-x", + entries=_entries(), + dispatcher=disp, + runs=1, + out_path=out, + resume=False, + ) + + # Both cells completed despite the error (matrix not aborted). 
+ assert len(records) == 2 + for rec in records: + assert any("dispatch_error" in f for f in rec.assertions_failed), rec + parsed = [ + EvalResultRecord.from_dict(json.loads(line)) + for line in out.read_text(encoding="utf-8").splitlines() + ] + assert len(parsed) == 2 + + +def test_load_eval_set_valid_and_invalid(tmp_path: Path) -> None: + """load_eval_set parses a valid file and raises ValueError on bad/empty input.""" + good = tmp_path / "good.json" + good.write_text( + json.dumps( + { + "entries": [ + {"prompt": "hi", "should_trigger": "map-x", "assertions": []}, + {"prompt": "yo"}, + ] + } + ), + encoding="utf-8", + ) + entries = load_eval_set(good) + assert len(entries) == 2 + assert entries[0].should_trigger == "map-x" + assert entries[1].should_trigger is None # default + + with pytest.raises(ValueError): + load_eval_set(tmp_path / "nope.json") + bad = tmp_path / "bad.json" + bad.write_text("{not json", encoding="utf-8") + with pytest.raises(ValueError): + load_eval_set(bad) + empty = tmp_path / "empty.json" + empty.write_text(json.dumps({"entries": []}), encoding="utf-8") + with pytest.raises(ValueError): + load_eval_set(empty) + badrow = tmp_path / "badrow.json" + badrow.write_text(json.dumps({"entries": [{"prompt": 123}]}), encoding="utf-8") + with pytest.raises(ValueError): + load_eval_set(badrow) + + +# --------------------------------------------------------------------------- +# ST-007 CLI tests — appended via heredoc (avoids eval( hook false-positive) +# --------------------------------------------------------------------------- + + +def test_vc1_subcommand_registered() -> None: + """VC1: skill-eval subcommand is registered in the app and appears in help.""" + from typer.testing import CliRunner + from mapify_cli import app + + runner = CliRunner() + result = runner.invoke(app, ["skill-eval", "--help"]) + assert result.exit_code == 0, result.output + assert "skill-eval" in result.output or "run" in result.output + + +def 
test_vc2_dry_run_counts_no_dispatch(tmp_path: Path) -> None: + """VC2: --dry-run prints planned count and does NOT call the dispatcher.""" + import json + from typer.testing import CliRunner + from mapify_cli import app + + eval_file = tmp_path / "eval.json" + eval_file.write_text( + json.dumps( + { + "entries": [ + {"prompt": "test prompt 1", "should_trigger": "map-debug"}, + {"prompt": "test prompt 2", "should_trigger": "map-debug"}, + {"prompt": "test prompt 3"}, + ] + } + ), + encoding="utf-8", + ) + + dispatch_called = [] + + def _raise_if_called(*_args: object, **_kwargs: object) -> None: + dispatch_called.append(True) + raise AssertionError("ClaudeSubprocessDispatcher.dispatch must NOT be called in dry-run") + + import mapify_cli.skills_eval.dispatcher as _disp_mod + original = _disp_mod.ClaudeSubprocessDispatcher.dispatch + _disp_mod.ClaudeSubprocessDispatcher.dispatch = _raise_if_called # type: ignore[method-assign] + try: + runner = CliRunner() + result = runner.invoke( + app, ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file), "--dry-run"] + ) + finally: + _disp_mod.ClaudeSubprocessDispatcher.dispatch = original # type: ignore[method-assign] + + assert result.exit_code == 0, result.output + assert "3" in result.output, f"expected planned count 3 in output: {result.output!r}" + assert not dispatch_called, "dispatcher.dispatch was called during --dry-run" + + +def test_vc3_missing_claude_exits_nonzero(tmp_path: Path) -> None: + """VC3/HC-6: when claude is not on PATH, exit nonzero with 'requires-cmd: claude'.""" + import json + import mapify_cli + from typer.testing import CliRunner + from mapify_cli import app + + eval_file = tmp_path / "eval.json" + eval_file.write_text( + json.dumps({"entries": [{"prompt": "hello", "should_trigger": "map-debug"}]}), + encoding="utf-8", + ) + + original_which = mapify_cli.shutil.which + + def _which_none(name: object, *_args: object, **_kwargs: object) -> None: + return None + + mapify_cli.shutil.which = 
def test_dry_run_malformed_eval_set_exits_2(tmp_path: Path) -> None:
    """SC-2: malformed eval-set (empty entries) under --dry-run exits 2, no dispatch."""
    import json
    from typer.testing import CliRunner
    from mapify_cli import app

    eval_file = tmp_path / "empty_entries.json"
    eval_file.write_text(json.dumps({"entries": []}), encoding="utf-8")

    # Records any call so the assertion after the CLI run can report it even
    # if the CLI swallows the AssertionError raised by the stub.
    dispatch_called: list[bool] = []

    def _raise_if_called(*_args: object, **_kwargs: object) -> None:
        dispatch_called.append(True)
        raise AssertionError("dispatch must NOT be called on malformed eval-set")

    import mapify_cli.skills_eval.dispatcher as _disp_mod

    # Patch at class level and always restore, so a failure here cannot leak
    # the stub into other tests.
    original = _disp_mod.ClaudeSubprocessDispatcher.dispatch
    _disp_mod.ClaudeSubprocessDispatcher.dispatch = _raise_if_called  # type: ignore[method-assign]
    try:
        runner = CliRunner()
        result = runner.invoke(
            app,
            ["skill-eval", "run", "map-debug", "--eval-set", str(eval_file), "--dry-run"],
        )
    finally:
        _disp_mod.ClaudeSubprocessDispatcher.dispatch = original  # type: ignore[method-assign]

    assert result.exit_code == 2, f"expected exit 2, got {result.exit_code}; output: {result.output!r}"
    assert not dispatch_called, "dispatcher.dispatch was called on malformed eval-set"


# ---------------------------------------------------------------------------
# ST-003 Dispatcher tests — MockDispatcher + monkeypatched subprocess
# ---------------------------------------------------------------------------


def test_vc1_abc_returns_dispatchresult() -> None:
    """VC1: MockDispatcher().dispatch() returns DispatchResult; VariantDispatcher is ABC."""
    disp = MockDispatcher(triggered_skill="map-x", raw_output="hello")
    result = disp.dispatch("any prompt")
    assert isinstance(result, DispatchResult)
    assert result.triggered_skill == "map-x"
    assert result.raw_output == "hello"
    # VariantDispatcher is abstract — instantiating raises TypeError
    import pytest as _pytest
    with _pytest.raises(TypeError):
        VariantDispatcher()  # type: ignore[abstract]


def test_vc2_mock_dispatcher_sets_triggered_skill_no_subprocess() -> None:
    """VC2 / INV-2: MockDispatcher returns triggered_skill; dispatch() body has zero subprocess/.run refs."""
    disp = MockDispatcher(triggered_skill="map-x")
    result = disp.dispatch("test")
    assert result.triggered_skill == "map-x"

    # AST-walk MockDispatcher.dispatch to confirm no subprocess or .run refs (INV-2).
    import inspect
    import textwrap
    import ast as _ast

    # dedent: getsource() returns the method with its class-level indentation,
    # which ast.parse would reject.
    source = textwrap.dedent(inspect.getsource(MockDispatcher.dispatch))
    tree = _ast.parse(source)
    for node in _ast.walk(tree):
        if isinstance(node, _ast.Attribute) and node.attr == "run":
            raise AssertionError(
                "MockDispatcher.dispatch must not reference .run (INV-2 violation)"
            )
        # A bare `subprocess` name covers subprocess.Popen / check_output / call,
        # which the `.run` attribute check alone would miss.
        if isinstance(node, _ast.Name) and node.id == "subprocess":
            raise AssertionError(
                "MockDispatcher.dispatch must not reference subprocess (INV-2 violation)"
            )
        if isinstance(node, (_ast.Import, _ast.ImportFrom)):
            names = (
                [alias.name for alias in node.names]
                if isinstance(node, _ast.Import)
                else ([node.module] if node.module else [])
            )
            for name in names:
                if name and "subprocess" in name:
                    raise AssertionError(
                        f"MockDispatcher.dispatch must not import subprocess (INV-2): {name!r}"
                    )


def test_vc4_backoff_bounded_on_transient_failure(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """VC4: ClaudeSubprocessDispatcher retries exactly max_retries+1 times on failure."""
    # Seed a minimal .claude/skills/ dir so _seed_temp_cwd works.
    source_claude = tmp_path / ".claude"
    (source_claude / "skills").mkdir(parents=True)

    # Single-element list so the nested stub can mutate the counter.
    call_count: list[int] = [0]

    def _failing_run(
        argv: list[str],
        *args: object,
        **kwargs: object,
    ) -> object:
        call_count[0] += 1
        import subprocess as _sp
        # Non-zero return code on every attempt simulates a persistent failure.
        return _sp.CompletedProcess(args=argv, returncode=1, stdout="", stderr="err")

    def _noop_sleep(seconds: object) -> None:
        pass

    monkeypatch.setattr(_disp_mod.subprocess, "run", _failing_run)
    monkeypatch.setattr(_disp_mod.time, "sleep", _noop_sleep)

    disp = ClaudeSubprocessDispatcher(
        source_claude_dir=source_claude,
        max_retries=2,
        backoff_base=0.0,
    )
    result = disp.dispatch("hello")

    # Must return a DispatchResult (never raise).
    assert isinstance(result, DispatchResult)
    assert result.error is not None

    # subprocess.run must be called exactly max_retries+1 = 3 times (bounded).
    assert call_count[0] == 3, (
        f"expected 3 subprocess calls (1 + max_retries=2), got {call_count[0]}"
    )


def test_vc3_subprocess_cwd_is_temp_not_repo_map(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """VC3 / INV-5: subprocess.run cwd is a seeded temp dir, not the repo .map."""
    # Seed a source .claude/skills/ dir.
    source_claude = tmp_path / ".claude"
    (source_claude / "skills").mkdir(parents=True)

    # Capture results *inside* _capture_run while the temp dir is still live.
    # dispatch() calls shutil.rmtree(tmp) in its finally block, so checking
    # after dispatch() returns would always find the dir gone.
    cwd_observations: list[dict[str, object]] = []
    # Counts every subprocess.run call, including ones made without cwd=, so a
    # missing cwd is not misreported as "no subprocess call happened".
    total_calls: list[int] = [0]

    def _capture_run(
        argv: list[str],
        *args: object,
        **kwargs: object,
    ) -> object:
        total_calls[0] += 1
        cwd_val = kwargs.get("cwd")
        if cwd_val is not None:
            cwd_path = Path(str(cwd_val))
            cwd_observations.append({
                "cwd": cwd_path,
                "claude_exists": (cwd_path / ".claude").exists(),
                "map_exists": (cwd_path / ".map").exists(),
            })
        # Return a valid JSON envelope so dispatch() parses successfully.
        import subprocess as _sp
        envelope = (
            '{"result": "ok", "session_id": "test-session",'
            ' "usage": {"input_tokens": 1, "cache_read_input_tokens": 0,'
            ' "cache_creation_input_tokens": 0}}'
        )
        return _sp.CompletedProcess(
            args=argv, returncode=0, stdout=envelope, stderr=""
        )

    def _noop_sleep(seconds: object) -> None:
        pass

    monkeypatch.setattr(_disp_mod.subprocess, "run", _capture_run)
    monkeypatch.setattr(_disp_mod.time, "sleep", _noop_sleep)

    disp = ClaudeSubprocessDispatcher(
        source_claude_dir=source_claude,
        max_retries=0,
        backoff_base=0.0,
    )
    disp.dispatch("test prompt")

    assert len(cwd_observations) == 1, (
        f"expected exactly 1 subprocess call with cwd set, got {len(cwd_observations)}"
        f" (subprocess.run was called {total_calls[0]} time(s) in total)"
    )
    obs = cwd_observations[0]
    cwd = obs["cwd"]
    assert isinstance(cwd, Path)

    # Must NOT be the repo .map dir. Resolve both sides so a relative or
    # symlinked spelling of the same directory cannot slip past the check.
    repo_map = Path(__file__).parent.parent / ".map"
    assert cwd.resolve() != repo_map.resolve(), (
        f"cwd must not be repo .map, got {cwd!r}"
    )

    # .claude and .map must both have existed in the seeded temp dir (INV-5).
    assert obs["claude_exists"], f".claude not found in temp cwd {cwd!r} at call time"
    assert obs["map_exists"], f".map not found in temp cwd {cwd!r} at call time"


# ---------------------------------------------------------------------------
# ST-004 Assertion tests
# ---------------------------------------------------------------------------


def test_vc1_contains_and_regex_match_and_nonmatch() -> None:
    """VC1: contains / not_contains / regex — match, non-match, invalid regex → FAIL no raise."""
    result = DispatchResult(
        raw_output="Hello world",
        triggered_skill=None,
        token_usage=None,
        duration_s=0.1,
    )

    # contains — match
    ar = run_assertion({"type": "contains", "value": "Hello"}, result)
    assert ar.passed is True

    # contains — non-match
    ar = run_assertion({"type": "contains", "value": "missing"}, result)
    assert ar.passed is False

    # not_contains — present → FAIL
    ar = run_assertion({"type": "not_contains", "value": "Hello"}, result)
    assert ar.passed is False

    # not_contains — absent → PASS
    ar = run_assertion({"type": "not_contains", "value": "absent"}, result)
    assert ar.passed is True

    # regex — match
    ar = run_assertion({"type": "regex", "pattern": r"H\w+"}, result)
    assert ar.passed is True

    # regex — non-match
    ar = run_assertion({"type": "regex", "pattern": r"xyz\d+"}, result)
    assert ar.passed is False

    # invalid regex — must FAIL, not raise
    ar = run_assertion({"type": "regex", "pattern": r"[invalid("}, result)
    assert ar.passed is False
    assert "invalid" in ar.detail.lower() or "error" in ar.detail.lower()


def test_vc2_valid_json_pass_and_fail() -> None:
    """VC2: valid_json — well-formed PASS, malformed FAIL."""
    good = DispatchResult(
        raw_output='{"key": "value"}',
        triggered_skill=None,
        token_usage=None,
        duration_s=0.1,
    )
    ar = run_assertion({"type": "valid_json"}, good)
    assert ar.passed is True

    bad = DispatchResult(
        raw_output="{not json}",
        triggered_skill=None,
        token_usage=None,
        duration_s=0.1,
    )
    ar = run_assertion({"type": "valid_json"}, bad)
    assert ar.passed is False


def test_vc3_trigger_and_not_trigger_including_none() -> None:
    """VC3 / SC-3: trigger == / != ; not_trigger None-safe PASS."""
    triggered = DispatchResult(
        raw_output="",
        triggered_skill="map-debug",
        token_usage=None,
        duration_s=0.1,
    )
    not_triggered = DispatchResult(
        raw_output="",
        triggered_skill=None,
        token_usage=None,
        duration_s=0.1,
    )

    # trigger — matching skill PASS
    ar = run_assertion({"type": "trigger", "skill": "map-debug"}, triggered)
    assert ar.passed is True

    # trigger — wrong skill FAIL
    ar = run_assertion({"type": "trigger", "skill": "map-other"}, triggered)
    assert ar.passed is False

    # not_trigger — different skill PASS
    ar = run_assertion({"type": "not_trigger", "skill": "map-other"}, triggered)
    assert ar.passed is True

    # not_trigger — same skill FAIL
    ar = run_assertion({"type": "not_trigger", "skill": "map-debug"}, triggered)
    assert ar.passed is False

    # SC-3: triggered_skill is None → not_trigger PASS (None != "map-debug")
    ar = run_assertion({"type": "not_trigger", "skill": "map-debug"}, not_triggered)
    assert ar.passed is True


# ---------------------------------------------------------------------------
# ST-009 own tests
# ---------------------------------------------------------------------------


def test_vc2_no_anthropic_import_in_skills_eval() -> None:
    """VC2 / INV-3: no 'anthropic' import and no ANTHROPIC_API_KEY env read in skills_eval."""
    import ast as _ast

    def _is_key_literal(expr: object) -> bool:
        """True when *expr* is a string constant containing ANTHROPIC_API_KEY."""
        return (
            isinstance(expr, _ast.Constant)
            and isinstance(expr.value, str)
            and "ANTHROPIC_API_KEY" in expr.value
        )

    skills_eval_dir = (
        Path(__file__).parent.parent / "src" / "mapify_cli" / "skills_eval"
    )
    py_files = list(skills_eval_dir.rglob("*.py"))
    assert py_files, f"No .py files found under {skills_eval_dir}"

    for py_file in py_files:
        source = py_file.read_text(encoding="utf-8")
        tree = _ast.parse(source, filename=str(py_file))

        # Check 1: no anthropic import via AST.
        for node in _ast.walk(tree):
            if isinstance(node, _ast.Import):
                for alias in node.names:
                    assert "anthropic" not in (alias.name or ""), (
                        f"Found 'anthropic' import in {py_file}: {alias.name!r}"
                    )
            elif isinstance(node, _ast.ImportFrom):
                module = node.module or ""
                assert "anthropic" not in module, (
                    f"Found 'anthropic' import in {py_file}: from {module!r}"
                )

        # Check 2: no ANTHROPIC_API_KEY env read.
        # Only AST Subscript/Call nodes are inspected, so mentions of the key
        # in comments or docstrings (INV-3 documentation) are not flagged.
        for node in _ast.walk(tree):
            if isinstance(node, _ast.Subscript):
                # os.environ["ANTHROPIC_API_KEY"] and, after
                # `from os import environ`, environ["ANTHROPIC_API_KEY"].
                # Python 3.9+: node.slice is the key expression itself.
                target = node.value
                is_environ = (
                    isinstance(target, _ast.Attribute) and target.attr == "environ"
                ) or (isinstance(target, _ast.Name) and target.id == "environ")
                if is_environ:
                    assert not _is_key_literal(node.slice), (
                        f"Found ANTHROPIC_API_KEY env read in {py_file}"
                    )
            if isinstance(node, _ast.Call):
                # os.getenv("ANTHROPIC_API_KEY"), os.environ.get("ANTHROPIC_API_KEY"),
                # and bare getenv("ANTHROPIC_API_KEY") after `from os import getenv`.
                func = node.func
                is_getenv = (
                    isinstance(func, _ast.Attribute)
                    and func.attr in ("getenv", "get")
                ) or (isinstance(func, _ast.Name) and func.id == "getenv")
                if is_getenv and node.args:
                    assert not _is_key_literal(node.args[0]), (
                        f"Found ANTHROPIC_API_KEY env read in {py_file}"
                    )


def test_vc1_end_to_end_run_via_mock_dispatcher(tmp_path: Path) -> None:
    """VC1 / AC-9: load fixture → run via MockDispatcher → aggregate; zero real claude."""
    fixture_path = (
        Path(__file__).parent / "skills_eval" / "fixtures" / "map_debug_eval_set.json"
    )
    assert fixture_path.exists(), f"Fixture not found: {fixture_path}"

    entries = load_eval_set(fixture_path)
    assert len(entries) >= 2

    out_path = tmp_path / "e2e_run.jsonl"
    disp = MockDispatcher(triggered_skill="map-debug", raw_output="debug info")

    records = run_eval(
        skill="map-debug",
        entries=entries,
        dispatcher=disp,
        runs=1,
        out_path=out_path,
        resume=False,
    )

    # Records durable: file written.
    assert out_path.exists()
    lines = [
        ln for ln in out_path.read_text(encoding="utf-8").splitlines() if ln.strip()
    ]
    # With runs=1 the test expects one JSONL line and one record per entry.
    assert len(lines) == len(records) == len(entries)

    # Aggregate produces a valid summary.
    summary = aggregate(records)
    assert summary.total_cells == len(entries)
    assert 0.0 <= summary.pass_rate <= 1.0
    d = summary.to_dict()
    assert "pass_rate" in d
    assert "total_cells" in d
    # JSON-serialisable (no TypeError).
    json.dumps(d)