diff --git a/.claude/skills/map-plan/SKILL.md b/.claude/skills/map-plan/SKILL.md index 270c024a..ec40bc40 100644 --- a/.claude/skills/map-plan/SKILL.md +++ b/.claude/skills/map-plan/SKILL.md @@ -1,7 +1,7 @@ --- name: map-plan description: | - ARCHITECT phase only: decompose a complex task into atomic subtasks via task-decomposer. Use when starting a feature, refactor, or complex bug fix and you need a plan first. Do NOT use to execute work; use map-task or map-efficient. + ARCHITECT phase only: produce an upfront plan by decomposing a complex task into atomic subtasks with clear dependencies, via task-decomposer. Use when the user asks to plan, create a structured plan, break down, decompose, or stage work — e.g. planning a feature, refactor, migration, API/versioning upgrade, or incremental/phased rollout into smaller independent steps before any code is written. Trigger on phrasing like "plan a…", "create a plan for…", "decompose…into tasks", "break this into steps", "roll out incrementally", or "smaller independent steps". Do NOT use to execute work; use map-task or map-efficient. effort: high argument-hint: "[task description]" --- diff --git a/.claude/skills/map-skill-eval/SKILL.md b/.claude/skills/map-skill-eval/SKILL.md index 567ac04f..1aaacc89 100644 --- a/.claude/skills/map-skill-eval/SKILL.md +++ b/.claude/skills/map-skill-eval/SKILL.md @@ -87,6 +87,60 @@ mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume - **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs//.jsonl`. If no prior run exists, omit `--resume` to start fresh. - **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd. +## Optimize a skill description + +Anti-overfit description optimizer: deterministic 60/40 train/test split, up to N iterations (iteration 0 = baseline = current description). Selects the candidate with the highest held-out TEST pass-rate; an overfit candidate (train pass-rate up, test pass-rate down) is flagged and never selected. + +```bash +mapify skill-eval optimize --eval-set PATH [--iterations N] [--apply] [--open] [--dry-run] +``` + +- `` — skill to optimize (e.g. `map-plan`). +- `--eval-set PATH` — eval-set JSON with `>= 5` entries (a 60/40 split needs `n_test >= 3`; a smaller set exits with code 2, spending zero quota). +- `--iterations N` — maximum optimization iterations (default: 5). Iteration 0 is the baseline. +- `--apply` — patch the winning description into the SKILL.md frontmatter `description:` of `templates_src/skills//SKILL.md.jinja` and re-render so generated trees stay byte-identical; the change is staged, not committed. `skill-rules.json` `description` is NOT auto-patched (update it by hand). Two no-op cases: "No improvement found" (baseline already optimal) and "Winner identical to current". +- `--open` — open the HTML report in the browser after the run (best-effort; never errors the run). +- `--dry-run` — print the planned call budget (iterations × (n_train + n_test) dispatch calls + iterations proposer calls) and `model: default (resolved by claude CLI)`, then exit 0 spending zero quota. + +Writes a durable `OptimizeResult` JSON and an HTML report to `.map/eval-runs//-optimize.json` and `-optimize.html`. + +Default mode is propose-only: nothing outside `.map/` is modified. + +### Examples + +```bash +# Preview quota usage without spending any +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --dry-run + +# Run 3 optimization iterations and open the HTML report +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --iterations 3 --open + +# Run, then auto-apply the winning description if improvement found +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --apply +``` + +## View an optimization report + +Renders the latest (or a specified `--result`) stored `OptimizeResult` JSON as an HTML report. + +```bash +mapify skill-eval view [--result PATH] [--open] +``` + +- `` — skill whose optimization results to view. +- `--result PATH` — path to a specific `*-optimize.json` result file; defaults to the latest in `.map/eval-runs//`. +- `--open` — open the rendered HTML report in the browser. + +### Examples + +```bash +# View the latest optimization report for map-plan +mapify skill-eval view map-plan + +# Open a specific result file in the browser +mapify skill-eval view map-plan --result .map/eval-runs/map-plan/20260601T120000-optimize.json --open +``` + ## Related Commands - `/map-plan` — plan and decompose tasks. diff --git a/.claude/skills/skill-rules.json b/.claude/skills/skill-rules.json index d5a96063..b8e18828 100644 --- a/.claude/skills/skill-rules.json +++ b/.claude/skills/skill-rules.json @@ -244,11 +244,11 @@ "skillClass": "task", "enforcement": "manual", "priority": "medium", - "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", + "description": "Evaluate a /map-* skill's trigger accuracy + cost, OR optimize its description via anti-overfit held-out selection, via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", "requires-cmd": ["claude"], "promptTriggers": { - "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"], - "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"] + "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering","optimize skill","skill optimize","description optimizer","optimize description"], + "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger","(optimize|improve).*(skill).*(description|trigger)"] } }, "map-task": { diff --git a/.map/scripts/map_orchestrator.py b/.map/scripts/map_orchestrator.py index 03ea61cb..013227f2 100755 --- a/.map/scripts/map_orchestrator.py +++ b/.map/scripts/map_orchestrator.py @@ -2166,6 +2166,29 @@ def _is_cross_repo_path(p: str) -> bool: diff_paths = set() if diff_paths: files_not_in_diff = [p for p in declared if p not in diff_paths] + # Gitignored deliverables (e.g. .map/ workflow artifacts like spike + # docs or eval-run .jsonl) never appear in git diff/status by design — + # that is NOT Actor truncation. Drop any declared path that + # `git check-ignore` reports as ignored so it does not raise a false + # "Possible Actor truncation" warning. A gitignored file that is also + # missing from disk is still flagged separately via missing_files. + if files_not_in_diff: + try: + igproc = _sp.run( + ["git", "check-ignore", "--", *files_not_in_diff], + cwd=project_dir, capture_output=True, text=True, timeout=5, + ) + ignored = { + line.strip() + for line in igproc.stdout.splitlines() + if line.strip() + } + if ignored: + files_not_in_diff = [ + p for p in files_not_in_diff if p not in ignored + ] + except (OSError, _sp.TimeoutExpired): + pass state.record_subtask_result( subtask_id, diff --git a/src/mapify_cli/__init__.py b/src/mapify_cli/__init__.py index cf89218e..5d857cce 100644 --- a/src/mapify_cli/__init__.py +++ b/src/mapify_cli/__init__.py @@ -1565,6 +1565,229 @@ def validate_graph( raise typer.Exit(2) +def _open_best_effort(path: Path) -> None: + """Open *path* in the default browser — swallow any error (VC5/SC-2).""" + import webbrowser # lazy import: optional use-path + + try: + webbrowser.open(path.as_uri()) + except Exception: # noqa: BLE001 + pass # SC-2: never errors the run + + +def _read_skill_description(root: Path, skill: str) -> str: + """Return the description: field from SKILL.md frontmatter, or '' on any failure.""" + skill_md = root / ".claude" / "skills" / skill / "SKILL.md" + if not skill_md.exists(): + return "" + try: + from mapify_cli.skill_ir import parse_frontmatter # lazy import + + text = skill_md.read_text(encoding="utf-8") + if not text.startswith("---\n"): + return "" + close = text.find("\n---", 4) + if close == -1: + return "" + frontmatter_text = text[4:close] + parsed = parse_frontmatter(frontmatter_text) + return str(parsed.get("description", "")) + except Exception: # noqa: BLE001 + return "" + + +# --------------------------------------------------------------------------- +# skill-eval optimize +# --------------------------------------------------------------------------- + +_OPTIMIZE_MIN_ENTRIES: int = 5 + + +@skill_eval_app.command("optimize") +def skill_eval_optimize( + skill: str = typer.Argument(..., help="Skill under optimisation, e.g. map-plan"), + eval_set: Optional[Path] = typer.Option( + None, "--eval-set", help="Path to eval-set JSON" + ), + iterations: int = typer.Option( + 5, "--iterations", min=1, help="Total iterations including baseline (default 5)" + ), + apply: bool = typer.Option( + False, "--apply", help="Apply the winning description back to the .jinja source" + ), + open_html: bool = typer.Option( + False, "--open", help="Open the HTML report in the default browser" + ), + dry_run: bool = typer.Option( + False, "--dry-run", help="Print planned call budget; spend nothing, no dispatcher" + ), +) -> None: + """Optimise a skill's trigger description via repeated eval iterations. + + Exit codes: + 0 - Success (or dry-run completed) + 1 - Runtime error (claude not found) + 2 - Validation error (missing --eval-set, malformed eval-set, or < 5 entries) + """ + import json # lazy — keep top-level import time low + + import mapify_cli.skills_eval.runner as _runner + from datetime import timezone + + # 1. --eval-set is required. + if eval_set is None: + console.print("[bold red]Error:[/bold red] provide --eval-set PATH") + raise typer.Exit(2) + + # 2. Load and validate eval-set. + try: + entries = _runner.load_eval_set(eval_set) + except ValueError as exc: + console.print(f"[bold red]Error:[/bold red] {exc}") + raise typer.Exit(2) + + # 3. MIN-SIZE guard — BEFORE dry-run and BEFORE any dispatcher (VC2). + if len(entries) < _OPTIMIZE_MIN_ENTRIES: + console.print( + f"[bold red]Error:[/bold red] eval-set has {len(entries)} " + f"{'entry' if len(entries) == 1 else 'entries'}; " + f"optimize requires >= {_OPTIMIZE_MIN_ENTRIES} entries" + ) + raise typer.Exit(2) + + # 4. DRY-RUN — print budget, exit 0, construct NO dispatcher (VC1). + if dry_run: + from mapify_cli.skills_eval.description_optimizer import ( + _DEFAULT_SEED, + split_train_test, + ) + + train, test = split_train_test(entries, _DEFAULT_SEED) + n_train = len(train) + n_test = len(test) + total_dispatches = iterations * (n_train + n_test) + console.print( + f"[bold]Dry-run:[/bold] " + f"{iterations} x ({n_train}+{n_test}) = [cyan]{total_dispatches}[/cyan] " + f"dispatch calls + [cyan]{iterations}[/cyan] proposer calls" + ) + console.print("model: default (resolved by claude CLI)") + raise typer.Exit(0) + + # 5. CLAUDE CHECK — require claude BEFORE any invocation (VC3). + if shutil.which("claude") is None: + console.print( + "[bold red]Error:[/bold red] requires-cmd: claude — " + "install the claude CLI and ensure it is on PATH" + ) + raise typer.Exit(1) + + # 6. REAL RUN. + import mapify_cli.skills_eval.proposer as _proposer + from mapify_cli.skills_eval.description_optimizer import optimize + from mapify_cli.skills_eval.viewer import render_to_path + + root = Path.cwd() + out_dir = root / ".map" / "eval-runs" / skill + out_dir.mkdir(parents=True, exist_ok=True) + run_ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + + current_description = _read_skill_description(root, skill) + + result = optimize( + skill=skill, + entries=entries, + current_description=current_description, + proposer=_proposer.propose_description, + dispatcher=None, + source_claude_dir=root / ".claude", + out_dir=out_dir, + run_ts=run_ts, + iterations=iterations, + ) + + json_path = out_dir / f"{run_ts}-optimize.json" + html_path = out_dir / f"{run_ts}-optimize.html" + json_path.write_text(json.dumps(result.to_dict(), indent=2), encoding="utf-8") + render_to_path(result, html_path) + + status_label = "no improvement" if result.no_improvement else f"iter {result.winning_iteration}" + winner_iter = next( + (it for it in result.iterations if it.selected), + None, + ) + test_pass_rate = winner_iter.test_pass_rate if winner_iter is not None else 0.0 + console.print( + f"[bold]Optimize complete:[/bold] skill=[bold]{skill}[/bold] " + f"winner=[cyan]{status_label}[/cyan] " + f"test_pass_rate=[cyan]{test_pass_rate:.1%}[/cyan]" + ) + console.print(f" artifact: [cyan]{json_path}[/cyan]") + + if apply: + from mapify_cli.skills_eval.apply_patcher import apply_optimized_description + + apply_optimized_description( + skill=skill, + winner=result.winning_description, + current_description=current_description, + no_improvement=result.no_improvement, + repo_root=root, + stage=True, + ) + + if open_html: + _open_best_effort(html_path) + + +# --------------------------------------------------------------------------- +# skill-eval view +# --------------------------------------------------------------------------- + + +@skill_eval_app.command("view") +def skill_eval_view( + skill: str = typer.Argument(..., help="Skill whose optimization result to view"), + result_path: Optional[Path] = typer.Option( + None, "--result", help="Path to a specific *-optimize.json file" + ), + open_html: bool = typer.Option( + False, "--open", help="Open the HTML report in the default browser" + ), +) -> None: + """Render the latest (or specified) optimize result as an HTML report. + + Exit codes: + 0 - Success + 2 - No optimize result found + """ + import json + + from mapify_cli.skills_eval.eval_schema import OptimizeResult + from mapify_cli.skills_eval.viewer import render_to_path + + out_dir = Path.cwd() / ".map" / "eval-runs" / skill + + if result_path is not None: + path = result_path + else: + candidates = sorted(out_dir.glob("*-optimize.json")) + if not candidates: + console.print( + f"[bold red]Error:[/bold red] no optimize result found under {out_dir}" + ) + raise typer.Exit(2) + path = candidates[-1] + + res = OptimizeResult.from_dict(json.loads(path.read_text(encoding="utf-8"))) + html = path.with_suffix(".html") + render_to_path(res, html) + console.print(f" report: [cyan]{html}[/cyan]") + + if open_html: + _open_best_effort(html) + + def main(): app() diff --git a/src/mapify_cli/skills_eval/apply_patcher.py b/src/mapify_cli/skills_eval/apply_patcher.py new file mode 100644 index 00000000..8439d228 --- /dev/null +++ b/src/mapify_cli/skills_eval/apply_patcher.py @@ -0,0 +1,226 @@ +"""Apply an optimized skill description to the single-source .jinja template. + +Invariants enforced by this module +----------------------------------- +INV-5 / source-untouched: only the specified .jinja file is written; + generated trees are updated via render_repo_trees. +INV-9 / path-safety: target .jinja must resolve under templates_src/; + paths under .git/ are rejected. +VC3 / no-op messages: two distinct messages for the two no-write outcomes; + baseline is NEVER written back (no_improvement guard). +VC4 / fail-loud: patch_skill_description raises ValueError on bad input + with NO partial write; YAML block scalar format enforced. +""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + + +def patch_skill_description(skill_md_path: Path, new_description: str) -> None: + """Rewrite the ``description:`` block scalar in a SKILL.md.jinja frontmatter. + + The file MUST begin with ``---\\n`` and contain a closing ``\\n---``. + The ``description:`` key MUST exist at column 0 in the frontmatter. + + All computation is done in-memory; the file is written ONCE at the end. + On any validation error the file is left completely untouched. + + Args: + skill_md_path: Path to the .jinja file to patch (or any SKILL.md). + new_description: Replacement description text (already .strip()'d). + + Raises: + ValueError: frontmatter missing, description key absent, or path rejected. + FileNotFoundError: skill_md_path does not exist. + """ + # Intent: read-then-compute-then-write-once; no partial write on failure. + text = skill_md_path.read_text(encoding="utf-8") + + # --- Validate frontmatter presence --- + if not text.startswith("---\n"): + raise ValueError( + f"No YAML frontmatter found in {skill_md_path}: " + "file must start with '---\\n'" + ) + # Find closing --- (must have \n--- to be valid) + close_idx = text.find("\n---", 3) + if close_idx == -1: + raise ValueError( + f"No closing '---' found in frontmatter of {skill_md_path}" + ) + # The closing marker: we want the \n--- (possibly \n---\n or \n--- at EOF) + # close_idx points to the '\n' before '---' + frontmatter_end = close_idx + 4 # len('\n---') = 4 + + frontmatter = text[4:close_idx] # content between the two --- + rest_of_file = text[frontmatter_end:] # everything from closing --- onward + + # --- Locate description: key at column 0 --- + fm_lines = frontmatter.split("\n") + desc_line_idx: int | None = None + for i, line in enumerate(fm_lines): + if line.startswith("description:"): + desc_line_idx = i + break + + if desc_line_idx is None: + raise ValueError( + f"No 'description:' key found at column 0 in frontmatter of {skill_md_path}" + ) + + # --- Find the indented body lines that belong to description --- + # A YAML block scalar body includes indented lines AND interior blank lines + # (blank lines are part of a multi-paragraph '|' block), ending at the first + # NON-blank 0-indent line (the next key) or end of frontmatter. + body_start = desc_line_idx + 1 + body_end = body_start # exclusive end index + for i in range(body_start, len(fm_lines)): + line = fm_lines[i] + if line == "" or line.startswith(" ") or line.startswith("\t"): + body_end = i + 1 + else: + break + # Trim TRAILING blank lines back out of the replaced span so a blank separator + # between the description block and the next key is preserved (no whitespace drift). + # Interior blanks are kept because the trim stops at the first non-blank line. + while body_end > body_start and fm_lines[body_end - 1] == "": + body_end -= 1 + + # --- Build the new description block scalar (|- strip chomping) --- + # Use '|-' so YAML-parsed value == new_description exactly (no trailing newline). + new_desc_lines = ["description: |-"] + for desc_line in new_description.split("\n"): + new_desc_lines.append(f" {desc_line}") + + # --- Reassemble frontmatter --- + new_fm_lines = ( + fm_lines[:desc_line_idx] + + new_desc_lines + + fm_lines[body_end:] + ) + new_frontmatter = "\n".join(new_fm_lines) + new_text = "---\n" + new_frontmatter + "\n---" + rest_of_file + + # Intent: single atomic write after all validation passes. + skill_md_path.write_text(new_text, encoding="utf-8") + + +def apply_optimized_description( + *, + skill: str, + winner: str, + current_description: str, + no_improvement: bool, + repo_root: Path, + stage: bool = True, +) -> str: + """Apply ``winner`` description to the skill's single-source .jinja template. + + Prints a user-facing message and returns an outcome code string. + + Outcome codes: + ``'no_improvement'`` — baseline wins; nothing written. + ``'identical'`` — winner == current; nothing written. + ``'applied'`` — .jinja patched and trees re-rendered. + + Args: + skill: Skill name (e.g. ``'map-skill-eval'``). + winner: Candidate description text that won the eval. + current_description: Current description text from the .jinja. + no_improvement: True when baseline beat every candidate on held-out test. + repo_root: Absolute path to the repository root. + stage: If True, stage the patched .jinja + existing gate trees via a scoped + ``git add -- `` after rendering (warn-only on failure; never commits). + + Returns: + Outcome code string. + + Raises: + ValueError: Path safety violation (outside templates_src or under .git/). + FileNotFoundError: .jinja source file does not exist. + """ + # --- VC3: no_improvement guard — NEVER write baseline back --- + if no_improvement: + print( + "No improvement found: current description already optimal on held-out test" + ) + return "no_improvement" + + # --- VC3: winner identical guard --- + if winner == current_description: + print("Winner identical to current description; no file changes") + return "identical" + + # --- Resolve and validate target path BEFORE any filesystem touch (INV-9) --- + # Path-safety runs first so a traversal/.git target is rejected even when the + # file does not exist (resolve() normalises '..' without requiring existence). + templates_src_root = (repo_root / "src" / "mapify_cli" / "templates_src").resolve() + jinja = repo_root / "src" / "mapify_cli" / "templates_src" / "skills" / skill / "SKILL.md.jinja" + resolved = jinja.resolve() + + # Intent: reject paths that escape templates_src or touch .git/ + try: + resolved.relative_to(templates_src_root) + except ValueError: + raise ValueError( + f"Path safety violation: {resolved} is not under {templates_src_root}" + ) from None + + if ".git" in resolved.parts: + raise ValueError(f"Path safety violation: {resolved} is under .git/") + + if not jinja.exists(): + raise FileNotFoundError( + f"SKILL.md.jinja not found for skill '{skill}': {jinja}" + ) + + # --- Patch the .jinja source --- + patch_skill_description(jinja, winner) + + # --- Re-render both providers (INV-5) --- + # Intent: import inline to keep module importable without jinja2 at module load time. + # Pass templates_src_root explicitly so that when repo_root is a temp dir + # the renderer reads from the temp copy, not the installed package source. + from mapify_cli.delivery.template_renderer import render_repo_trees # noqa: PLC0415 + + render_templates_src = repo_root / "src" / "mapify_cli" / "templates_src" + render_repo_trees("claude", repo_root=repo_root, templates_src_root=render_templates_src) + render_repo_trees("codex", repo_root=repo_root, templates_src_root=render_templates_src) + + # --- Optionally stage ONLY the patched source + rendered gate trees (warn-only) --- + # Scoped staging (never `git add -A`): avoids sweeping unrelated dirty files + # (in-progress work, .map/ workflow state) into the caller's next commit. + if stage: + gate_relpaths = ( + ".claude", + ".codex", + "src/mapify_cli/templates", + ".agents/skills", + ) + to_stage: list[str] = [str(jinja.relative_to(repo_root))] + for rel in gate_relpaths: + if (repo_root / rel).exists(): + to_stage.append(rel) + result = subprocess.run( + ["git", "add", "--", *to_stage], + cwd=repo_root, + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + print( + f"Warning: 'git add' failed (exit {result.returncode}): " + f"{result.stderr.strip()}" + ) + + # --- Reminder: skill-rules.json is NOT auto-patched --- + print( + f"Applied new description for '{skill}'. " + "Note: skill-rules.json description is NOT auto-patched — " + "update it by hand or via ST-008." + ) + + return "applied" diff --git a/src/mapify_cli/skills_eval/description_optimizer.py b/src/mapify_cli/skills_eval/description_optimizer.py new file mode 100644 index 00000000..03f352c8 --- /dev/null +++ b/src/mapify_cli/skills_eval/description_optimizer.py @@ -0,0 +1,460 @@ +"""Anti-overfit skill description optimizer. + +Runs N iterations of description proposal + evaluation, selects the candidate +that maximises held-out TEST pass-rate without overfitting to training data. + +Invariants enforced by this module +----------------------------------- +INV-3 / no-anthropic: no ``import anthropic``, no ANTHROPIC_API_KEY access. +INV-2 / no-datetime: no ``import datetime``; caller supplies ``run_ts``. +INV-2 / no-random: no ``import random``; split uses hashlib (deterministic). +INV-2 / clock-free: timestamp comes from caller (``run_ts`` param), never + from ``time.time()`` or ``datetime.now()`` here. +INV-5 / source-untouched: production .claude/ and templates_src/ are NEVER + modified. Each iteration seeds a throwaway temp dir + and cleans it up in a ``finally`` block. +VC1 / anti-overfit: a candidate with train_pass_rate > baseline.train_pass_rate + AND test_pass_rate < baseline.test_pass_rate is flagged + ``overfit=True`` and is NEVER selected as the winner. +""" + +from __future__ import annotations + +import hashlib +import json +import shutil +import tempfile +from pathlib import Path + +from mapify_cli.skills_eval.aggregator import aggregate +from mapify_cli.skills_eval.dispatcher import ClaudeSubprocessDispatcher, VariantDispatcher +from mapify_cli.skills_eval.eval_schema import ( + EvalResultRecord, + EvalSetEntry, + OptimizeIterationRecord, + OptimizeResult, + ProposerFn, +) +from mapify_cli.skills_eval.runner import run_eval + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +_DEFAULT_SEED: int = 1337 + + +# --------------------------------------------------------------------------- +# Deterministic train/test split (VC2 / INV-2 / HC-8) +# --------------------------------------------------------------------------- + + +def split_train_test( + entries: list[EvalSetEntry], + seed: int, +) -> tuple[list[EvalSetEntry], list[EvalSetEntry]]: + """Return a deterministic (train, test) split. + + Pure function: given identical ``entries`` and ``seed`` the result is + always the same. Uses ``hashlib.sha256`` — no ``random``, no ``datetime``. + + Test fraction is ``max(1, round(n * 0.4))`` entries; remainder is train. + """ + n = len(entries) + n_test = max(1, round(n * 0.4)) + # Sort indices by hash so the assignment is deterministic and seed-sensitive. + order = sorted( + range(n), + key=lambda i: hashlib.sha256(f"{seed}:{i}".encode()).digest(), + ) + test_idx: set[int] = set(order[:n_test]) + train = [e for i, e in enumerate(entries) if i not in test_idx] + test = [e for i, e in enumerate(entries) if i in test_idx] + return train, test + + +# --------------------------------------------------------------------------- +# Candidate SKILL.md patching helpers (VC6 / INV-5) +# --------------------------------------------------------------------------- + + +def _set_frontmatter_description(content: str, new_desc: str) -> str: + """Replace the ``description:`` line in YAML frontmatter with ``new_desc``. + + Rules: + - File MUST start with ``---\\n`` (YAML frontmatter). + - Frontmatter MUST contain a line starting with ``description:``. + - Only that single line is replaced; all other keys and the body are + preserved unchanged. + - ``new_desc`` is serialised as a double-quoted YAML scalar so that + embedded colons, quotes, and newlines parse back correctly. + + Raises ``ValueError`` if the preconditions are not met (fail-loud). + """ + if not content.startswith("---\n"): + raise ValueError( + "SKILL.md content does not start with YAML frontmatter ('---\\n')" + ) + + # Locate the closing --- of the frontmatter block. + close_idx = content.find("\n---", 4) + if close_idx == -1: + raise ValueError("SKILL.md frontmatter has no closing '---'") + + frontmatter = content[4:close_idx] # between opening and closing --- + body_after = content[close_idx:] # from \n--- onward (inclusive) + + # Find the description line inside the frontmatter. + fm_lines = frontmatter.split("\n") + desc_line_idx: int | None = None + for idx, line in enumerate(fm_lines): + if line.startswith("description:"): + desc_line_idx = idx + break + + if desc_line_idx is None: + raise ValueError( + "SKILL.md frontmatter does not contain a 'description:' key" + ) + + # Serialise new_desc as a double-quoted YAML scalar so round-trip is safe. + escaped = new_desc.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n") + new_line = f'description: "{escaped}"' + fm_lines[desc_line_idx] = new_line + + new_frontmatter = "\n".join(fm_lines) + return "---\n" + new_frontmatter + body_after + + +def _patch_candidate_skill_md( + cand_claude_dir: Path, + skill: str, + new_desc: str, +) -> None: + """Overwrite the ``description:`` in the candidate's ``SKILL.md``. + + Intent: mutate the throwaway copy only; production files are never touched. + Raises ``FileNotFoundError`` if the skill's SKILL.md is absent. + """ + skill_md = cand_claude_dir / "skills" / skill / "SKILL.md" + if not skill_md.exists(): + raise FileNotFoundError( + f"Candidate SKILL.md not found at {skill_md}; " + "source_claude_dir may be missing the skill." + ) + original = skill_md.read_text(encoding="utf-8") + patched = _set_frontmatter_description(original, new_desc) + skill_md.write_text(patched, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# Token total helper +# --------------------------------------------------------------------------- + + +def _sum_tokens(records: list[EvalResultRecord]) -> int: + """Sum ``token_usage.total`` over records that have token_usage.""" + return sum(r.token_usage.total for r in records if r.token_usage is not None) + + +# --------------------------------------------------------------------------- +# Baseline failing record loader (proposer input helper) +# --------------------------------------------------------------------------- + + +def _load_baseline_failing( + baseline_rec: OptimizeIterationRecord, +) -> list[EvalResultRecord]: + """Load train records from the baseline's .jsonl and return only failing ones. + + Returns an empty list if the path is empty or the file cannot be read + (fail-soft: the proposer receives an empty list rather than crashing). + """ + path_str = baseline_rec.train_jsonl_path + if not path_str: + return [] + + path = Path(path_str) + if not path.exists(): + return [] + + records: list[EvalResultRecord] = [] + try: + with path.open(encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + d = json.loads(line) + rec = EvalResultRecord.from_dict(d) + if rec.assertions_failed: + records.append(rec) + except (json.JSONDecodeError, KeyError, ValueError): + pass + except OSError: + pass + + return records + + +# --------------------------------------------------------------------------- +# Single-iteration runner +# --------------------------------------------------------------------------- + + +def _run_one_iteration( + *, + iteration: int, + candidate_description: str, + skill: str, + train: list[EvalSetEntry], + test: list[EvalSetEntry], + dispatcher: VariantDispatcher | None, + source_claude_dir: Path, + out_dir: Path, + run_ts: str, +) -> OptimizeIterationRecord: + """Seed a throwaway temp dir, patch the description, run train+test. + + Always cleans up the temp dir in ``finally`` (INV-5). + VC3: writes to distinct paths per (iteration, split); resume=False always. + """ + # VC3: distinct paths per (iteration, split) + train_path = out_dir / f"{run_ts}-optimize-iter{iteration}-train.jsonl" + test_path = out_dir / f"{run_ts}-optimize-iter{iteration}-test.jsonl" + + cand_root = Path(tempfile.mkdtemp(prefix="mapeval-candidate-")) + try: + # Seed the candidate .claude/ tree from production source + cand_claude = cand_root / ".claude" + shutil.copytree(source_claude_dir, cand_claude) + + # Patch the description in the throwaway copy only (INV-5) + _patch_candidate_skill_md(cand_claude, skill, candidate_description) + + # Dispatcher: injected (tests) or fresh production dispatcher + if dispatcher is not None: + iter_dispatcher: VariantDispatcher = dispatcher + else: + iter_dispatcher = ClaudeSubprocessDispatcher( + source_claude_dir=cand_claude, + ) + + # VC3: resume=False — always write fresh cells, never resume + train_records = run_eval( + skill=skill, + entries=train, + dispatcher=iter_dispatcher, + runs=1, + out_path=train_path, + resume=False, + ) + test_records = run_eval( + skill=skill, + entries=test, + dispatcher=iter_dispatcher, + runs=1, + out_path=test_path, + resume=False, + ) + finally: + shutil.rmtree(cand_root, ignore_errors=True) + + train_summary = aggregate(train_records) + test_summary = aggregate(test_records) + + return OptimizeIterationRecord( + iteration=iteration, + candidate_description=candidate_description, + train_pass_rate=train_summary.pass_rate, + test_pass_rate=test_summary.pass_rate, + train_tokens_total=_sum_tokens(train_records), + test_tokens_total=_sum_tokens(test_records), + train_jsonl_path=str(train_path), + test_jsonl_path=str(test_path), + ) + + +# --------------------------------------------------------------------------- +# Overfit flagging and selection (VC1 / D4 / INV-4) +# --------------------------------------------------------------------------- + + +def _flag_overfit( + record: OptimizeIterationRecord, + baseline: OptimizeIterationRecord, +) -> None: + """Set ``overfit=True`` in-place when the anti-overfit condition holds. + + Overfit iff: iter > 0, not proposal_failed, + train improved vs baseline AND test regressed vs baseline. + """ + if ( + record.iteration != 0 + and not record.proposal_failed + and record.train_pass_rate > baseline.train_pass_rate + and record.test_pass_rate < baseline.test_pass_rate + ): + record.overfit = True + + +def _select_winner( + iteration_records: list[OptimizeIterationRecord], +) -> tuple[OptimizeIterationRecord, bool]: + """Return ``(winner, no_improvement)`` from the completed iteration list. + + Selection rules (D4): + 1. Strict candidates: non-baseline, not proposal_failed, test_pass_rate + strictly greater than baseline.test_pass_rate. + 2. If no strict candidates: winner = baseline (iter 0), no_improvement=True. + 3. Tie-break: (-test_pass_rate, total_tokens, iteration) ascending. + 4. Full-tie across all iterations: baseline wins, no_improvement=True. + """ + baseline = iteration_records[0] + strict = [ + it + for it in iteration_records[1:] + if not it.proposal_failed and it.test_pass_rate > baseline.test_pass_rate + ] + if not strict: + return baseline, True + + winner = min( + strict, + key=lambda it: ( + -it.test_pass_rate, + it.train_tokens_total + it.test_tokens_total, + it.iteration, + ), + ) + return winner, False + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def optimize( + *, + skill: str, + entries: list[EvalSetEntry], + current_description: str, + proposer: ProposerFn, + dispatcher: VariantDispatcher | None = None, + source_claude_dir: Path, + out_dir: Path, + run_ts: str, + iterations: int = 5, + seed: int = _DEFAULT_SEED, +) -> OptimizeResult: + """Run N-iteration description optimization; return the best candidate. + + Parameters + ---------- + skill: + Name of the skill whose SKILL.md ``description:`` is being optimised. + entries: + Full entry list; split internally into train/test via ``split_train_test``. + current_description: + Baseline description (iteration 0). + proposer: + Callable receiving (current_description, baseline_failing_train_records) + and returning a new candidate string or ``None`` on exhaustion/failure. + dispatcher: + Optional injected dispatcher. ``None`` => production subprocess path. + source_claude_dir: + Production ``.claude/`` directory to seed throwaway candidates from. + out_dir: + Directory for per-iteration ``.jsonl`` result files. + run_ts: + Caller-supplied timestamp string (clock-free invariant: never generated here). + iterations: + Total iterations including baseline (iter 0). Must be >= 1. + seed: + Integer seed for the deterministic split. + + Returns + ------- + OptimizeResult + Full result. ``eval_set_path`` is ``""`` — the CLI owns that field. + + Raises + ------ + ValueError + If ``iterations < 1`` (there must be at least the baseline iteration). + """ + if iterations < 1: + raise ValueError(f"iterations must be >= 1, got {iterations}") + + out_dir.mkdir(parents=True, exist_ok=True) + train, test = split_train_test(entries, seed) + + iteration_records: list[OptimizeIterationRecord] = [] + # Baseline failing train records fed to the proposer from iteration 1 onward. + baseline_failing: list[EvalResultRecord] = [] + + for i in range(iterations): + if i == 0: + # Baseline: patch for uniformity; description = current_description + candidate_description: str = current_description + else: + proposed = proposer(current_description, baseline_failing) + if not proposed: + # VC4: proposal_failed iteration; no run_eval call; loop continues + failed_rec = OptimizeIterationRecord( + iteration=i, + candidate_description=None, + train_pass_rate=0.0, + test_pass_rate=0.0, + proposal_failed=True, + ) + iteration_records.append(failed_rec) + continue + candidate_description = proposed + + rec = _run_one_iteration( + iteration=i, + candidate_description=candidate_description, + skill=skill, + train=train, + test=test, + dispatcher=dispatcher, + source_claude_dir=source_claude_dir, + out_dir=out_dir, + run_ts=run_ts, + ) + + if i == 0: + # Capture baseline failing train records for proposer input + baseline_failing = _load_baseline_failing(rec) + + iteration_records.append(rec) + + # Flag overfit candidates against the baseline + if iteration_records: + baseline_rec = iteration_records[0] + for rec in iteration_records[1:]: + _flag_overfit(rec, baseline_rec) + + winner, no_improvement = _select_winner(iteration_records) + winner.selected = True + + if winner.iteration == 0 or winner.candidate_description is None: + winning_description = current_description + else: + winning_description = winner.candidate_description + + return OptimizeResult( + skill=skill, + eval_set_path="", + seed=seed, + n_train=len(train), + n_test=len(test), + baseline_description=current_description, + winning_description=winning_description, + winning_iteration=winner.iteration, + no_improvement=no_improvement, + iterations=iteration_records, + ) diff --git a/src/mapify_cli/skills_eval/dispatcher.py b/src/mapify_cli/skills_eval/dispatcher.py index e87c4064..e656438a 100644 --- a/src/mapify_cli/skills_eval/dispatcher.py +++ b/src/mapify_cli/skills_eval/dispatcher.py @@ -98,6 +98,37 @@ def dispatch(self, prompt: str) -> DispatchResult: # Seeding helpers (ClaudeSubprocessDispatcher internals) # --------------------------------------------------------------------------- +# Subdir name (under the throwaway eval cwd) handed to the telegram-bridge +# plugin as its state dir — see _eval_subprocess_env. +_NO_TELEGRAM_STATE_DIRNAME = ".map-eval-no-telegram" + + +def _eval_subprocess_env(cwd: Path) -> dict[str, str]: + """Build the environment for an eval ``claude -p`` subprocess. + + Two scoped overrides on top of the inherited environment: + + - ``MAP_INVOKED_BY`` — recursion guard so MAP's own hooks no-op inside the + eval subprocess. + - ``TG_STATE_DIR`` — points the ``telegram-bridge`` plugin's state dir at a + config-less path **inside the throwaway cwd**. The plugin's SessionStart + hook may still inject its "always-listen — run `tg listen`" instruction + (plugin hooks run in a restricted env that does not receive this override), + but the instruction is now **inert**: when the eval ``claude -p`` agent + actually runs ``tg listen`` / ``tg send``, those commands inherit THIS + subprocess env, find no ``config.json`` under ``TG_STATE_DIR``, and exit + immediately (``die("no config.json")``) instead of blocking on the Telegram + long-poll. Without this, ``tg listen`` blocks until the dispatch timeout and + a triggered-skill cell mis-records as a non-trigger. The operator's real + ``~/.claude/telegram`` config is never touched — this is a per-subprocess + override on a path that is removed with the temp cwd. + """ + return { + **os.environ, + "MAP_INVOKED_BY": "skills-eval", + "TG_STATE_DIR": str(cwd / _NO_TELEGRAM_STATE_DIRNAME), + } + def _seed_temp_cwd(source_claude_dir: Path) -> Path: """Create a throwaway temp directory seeded with a copy of ``.claude/``. @@ -490,7 +521,7 @@ def _run_once( text=True, timeout=self._timeout, cwd=cwd, - env={**os.environ, "MAP_INVOKED_BY": "skills-eval"}, + env=_eval_subprocess_env(cwd), ) except subprocess.TimeoutExpired as exc: self._last_error = f"timeout after {self._timeout}s: {exc}" diff --git a/src/mapify_cli/skills_eval/eval_schema.py b/src/mapify_cli/skills_eval/eval_schema.py index a50766e3..d7b5325f 100644 --- a/src/mapify_cli/skills_eval/eval_schema.py +++ b/src/mapify_cli/skills_eval/eval_schema.py @@ -13,7 +13,7 @@ import dataclasses from dataclasses import dataclass, field -from typing import Any +from typing import Any, Callable from mapify_cli.token_budget import TokenUsage @@ -164,6 +164,139 @@ def from_dict(cls, d: dict[str, Any]) -> "EvalResultRecord": ) +# --------------------------------------------------------------------------- +# OptimizeIterationRecord (one iteration row in an optimization run) +# --------------------------------------------------------------------------- + + +@dataclass +class OptimizeIterationRecord: + """One iteration of a skill-description optimization run. + + ``to_dict`` / ``from_dict`` provide a stable round-trip compatible with the + ``_MISSING``-tolerant pattern used by ``EvalResultRecord``. Token totals + are flat ``int`` fields — no ``TokenUsage`` object here (the optimizer + aggregates per-iteration totals from multiple eval runs). + """ + + # --- required (no default) --- + iteration: int + candidate_description: str | None + train_pass_rate: float + test_pass_rate: float + # --- optional / defaulted --- + train_tokens_total: int = 0 + test_tokens_total: int = 0 + selected: bool = False + proposal_failed: bool = False + overfit: bool = False + train_jsonl_path: str = "" + test_jsonl_path: str = "" + + def to_dict(self) -> dict[str, Any]: + return { + "iteration": self.iteration, + "candidate_description": self.candidate_description, + "train_pass_rate": self.train_pass_rate, + "test_pass_rate": self.test_pass_rate, + "train_tokens_total": self.train_tokens_total, + "test_tokens_total": self.test_tokens_total, + "selected": self.selected, + "proposal_failed": self.proposal_failed, + "overfit": self.overfit, + "train_jsonl_path": self.train_jsonl_path, + "test_jsonl_path": self.test_jsonl_path, + } + + @classmethod + def from_dict(cls, d: dict[str, Any]) -> "OptimizeIterationRecord": + """Reconstruct from a plain dict; tolerates absent optional keys.""" + raw_desc = d.get("candidate_description", _MISSING) + candidate_description: str | None = ( + None if (raw_desc is _MISSING or raw_desc is None) else str(raw_desc) + ) + return cls( + iteration=int(d["iteration"]), + candidate_description=candidate_description, + train_pass_rate=float(d["train_pass_rate"]), + test_pass_rate=float(d["test_pass_rate"]), + train_tokens_total=int(d.get("train_tokens_total", 0)), + test_tokens_total=int(d.get("test_tokens_total", 0)), + selected=bool(d.get("selected", False)), + proposal_failed=bool(d.get("proposal_failed", False)), + overfit=bool(d.get("overfit", False)), + train_jsonl_path=str(d.get("train_jsonl_path", "")), + test_jsonl_path=str(d.get("test_jsonl_path", "")), + ) + + +# --------------------------------------------------------------------------- +# OptimizeResult (summary of a full optimization run) +# --------------------------------------------------------------------------- + + +@dataclass +class OptimizeResult: + """Aggregated result of a multi-iteration skill-description optimization. + + ``iterations`` holds the per-iteration records; ``winning_description`` + and ``winning_iteration`` identify the best candidate found. + ``no_improvement`` is ``True`` when no candidate beat the baseline. + """ + + skill: str + eval_set_path: str + seed: int + n_train: int + n_test: int + baseline_description: str + winning_description: str + winning_iteration: int + no_improvement: bool + iterations: list[OptimizeIterationRecord] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return { + "skill": self.skill, + "eval_set_path": self.eval_set_path, + "seed": self.seed, + "n_train": self.n_train, + "n_test": self.n_test, + "baseline_description": self.baseline_description, + "winning_description": self.winning_description, + "winning_iteration": self.winning_iteration, + "no_improvement": self.no_improvement, + "iterations": [it.to_dict() for it in self.iterations], + } + + @classmethod + def from_dict(cls, d: dict[str, Any]) -> "OptimizeResult": + """Reconstruct from a plain dict; tolerates absent ``iterations``.""" + raw_iters = d.get("iterations", []) + iterations = [OptimizeIterationRecord.from_dict(x) for x in raw_iters] + return cls( + skill=str(d["skill"]), + eval_set_path=str(d["eval_set_path"]), + seed=int(d["seed"]), + n_train=int(d["n_train"]), + n_test=int(d["n_test"]), + baseline_description=str(d["baseline_description"]), + winning_description=str(d["winning_description"]), + winning_iteration=int(d["winning_iteration"]), + no_improvement=bool(d["no_improvement"]), + iterations=iterations, + ) + + +# --------------------------------------------------------------------------- +# ProposerFn (type alias for skill-description proposal callables) +# --------------------------------------------------------------------------- + +# A proposer takes the current description and the list of train eval records, +# and returns a new candidate description (or None to signal exhaustion/failure). +ProposerFn = Callable[["str", "list[EvalResultRecord]"], "str | None"] + + # --------------------------------------------------------------------------- # make_cell_id # --------------------------------------------------------------------------- diff --git a/src/mapify_cli/skills_eval/proposer.py b/src/mapify_cli/skills_eval/proposer.py new file mode 100644 index 00000000..d0a9b885 --- /dev/null +++ b/src/mapify_cli/skills_eval/proposer.py @@ -0,0 +1,153 @@ +"""Default ProposerFn implementation for skill-description optimization. + +Calls ``claude -p --output-format json`` as a subprocess and reads +ONLY the ``.result`` field from the JSON envelope. + +Invariants enforced here: +- INV-1 / VC3: argv is a list; untrusted record text is a discrete element. +- INV-2 / HC-7: MAP_INVOKED_BY is set on every subprocess env. +- VC4: all failure modes (non-zero exit, malformed JSON, empty .result, + OSError, TimeoutExpired) return None — never raise. +- No ``import anthropic``, no ANTHROPIC_API_KEY access (INV-1 / HC-2 / AC-10). +- No ``--model`` flag (D2). +""" +from __future__ import annotations + +import json +import os +import subprocess + +from mapify_cli.skills_eval.eval_schema import EvalResultRecord + +# Default subprocess timeout in seconds. +_DEFAULT_TIMEOUT: int = 120 + +# Value set in MAP_INVOKED_BY for every proposer subprocess call. +_MAP_INVOKED_BY_VALUE: str = "skills-eval-proposer" + +# Hard cap on a proposed description's length: the Agent Skills spec maximum for +# the `description` field (1024 chars), which the official skill-creator also +# validates against. A candidate over this is rejected so the eval never proves +# an unshippable description. (Note: Claude Code briefly truncated descriptions +# at 250 chars in v2.1.86, raised to 1536 in v2.1.105, then dropped the +# per-description cap entirely in v2.1.129+ for a usage-ranked listing budget — +# so 250 was a transient cap, not the spec; current Claude Code loads the full +# description for triggering.) +_DEFAULT_MAX_CHARS: int = 1024 + + +def _build_prompt( + current_description: str, + failing_train_records: list[EvalResultRecord], + max_chars: int = _DEFAULT_MAX_CHARS, +) -> str: + """Build an improvement prompt from the current description and failing records. + + The prompt is a discrete argv element, never interpolated into a shell string. + Record content (.prompt, .assertions_failed) is treated as UNTRUSTED text. + """ + lines: list[str] = [ + "You are optimizing the trigger description of a Claude Code skill.", + "", + "Current description:", + current_description.strip(), + "", + "The following eval prompts are currently failing (they should trigger", + "the skill, but do not). For each, the assertions that failed are listed:", + "", + ] + for i, rec in enumerate(failing_train_records, start=1): + lines.append(f"--- Failing record {i} ---") + lines.append(f"Prompt: {rec.prompt}") + if rec.assertions_failed: + lines.append("Assertions failed:") + for assertion in rec.assertions_failed: + lines.append(f" - {assertion}") + lines.append("") + + lines += [ + "Write an improved skill trigger description that would cause the", + "skill to be triggered for the failing prompts above while remaining", + "precise and not overly broad.", + "", + f"HARD LIMIT: the new description MUST be at most {max_chars} characters " + f"(aim for ~{max(0, max_chars - 20)} to be safe). It is shown in a UI that " + "truncates anything longer, so a longer description is unusable no matter " + "how well it triggers. Count characters and stay within the limit.", + "", + "Respond with ONLY the new description text, no preamble, no explanation.", + ] + return "\n".join(lines) + + +def propose_description( + current_description: str, + failing_train_records: list[EvalResultRecord], + max_chars: int = _DEFAULT_MAX_CHARS, +) -> str | None: + """Propose an improved skill description using ``claude -p``. + + Mirrors the subprocess/envelope pattern from ``dispatcher._run_once`` and + ``dispatcher._parse_envelope``. + + The proposal is capped at ``max_chars`` characters (the skill-listing UI + limit): the prompt asks for it, and a candidate that still exceeds the cap + is rejected (``None``) so the optimizer never evaluates an unshippable + description. + + Returns the proposed description text (stripped) on success, or ``None`` + on any failure: + - non-zero subprocess exit code + - subprocess timeout (``subprocess.TimeoutExpired``) + - claude not on PATH (``OSError`` / ``FileNotFoundError``) + - any other unexpected exception + - malformed JSON stdout + - missing or whitespace-only ``.result`` in the JSON envelope + - a proposal longer than ``max_chars`` characters + """ + prompt = _build_prompt(current_description, failing_train_records, max_chars) + + # Intent: argv is always a list; prompt is a discrete element (never shell-interpolated). + argv: list[str] = ["claude", "-p", prompt, "--output-format", "json"] + + try: + proc = subprocess.run( + argv, + capture_output=True, + text=True, + timeout=_DEFAULT_TIMEOUT, + cwd=str(os.getcwd()), + env={**os.environ, "MAP_INVOKED_BY": _MAP_INVOKED_BY_VALUE}, + ) + except subprocess.TimeoutExpired: + return None + except OSError: + # Covers FileNotFoundError (claude not on PATH) and other OS-level errors. + return None + except Exception: # noqa: BLE001 + return None + + if proc.returncode != 0: + return None + + # Intent: parse the JSON envelope defensively — mirror dispatcher._parse_envelope. + try: + parsed = json.loads(proc.stdout) + except (json.JSONDecodeError, ValueError): + return None + + if not isinstance(parsed, dict): + return None + + raw = str(parsed.get("result", "")) + candidate = raw.strip() + if not candidate: + return None + + # Reject an over-limit proposal so the optimizer never evaluates (and a + # later --apply never tries to ship) a description the skill-listing UI + # would truncate. Treated like a proposal failure for that iteration. + if len(candidate) > max_chars: + return None + + return candidate diff --git a/src/mapify_cli/skills_eval/viewer.py b/src/mapify_cli/skills_eval/viewer.py new file mode 100644 index 00000000..1076c488 --- /dev/null +++ b/src/mapify_cli/skills_eval/viewer.py @@ -0,0 +1,224 @@ +"""HTML report renderer for OptimizeResult. + +Pure render module — no dispatch, no shell calls, no anthropic. +INV-6: imports OptimizeResult/OptimizeIterationRecord from eval_schema only. +Security: jinja2.Environment(autoescape=True); candidate_description is +untrusted (claude -p output) and MUST be escaped. +""" + +from __future__ import annotations + +import difflib +from pathlib import Path + +from mapify_cli.skills_eval.eval_schema import OptimizeResult + +# --------------------------------------------------------------------------- +# HTML template (module-level string) +# --------------------------------------------------------------------------- + +_HTML = """\ + + + + + + Skill Optimization Report — {{ result.skill | e }} + + + +

Skill Optimization Report

+
+ Skill: {{ result.skill | e }}  |  + Seed: {{ result.seed }}  |  + Train / Test: {{ result.n_train }} / {{ result.n_test }}  |  + Winning iteration: {{ result.winning_iteration }} +
+ {% if result.no_improvement %} +
No improvement found — baseline retained.
+ {% endif %} + + + + + + + + + + + + + + {% for row in rows %} + + + + + + + + + + {% endfor %} + +
#StatusTrain %Test %Train tokTest tokDescription diff vs prior
{{ row.iteration }} + {% if row.selected %}selected{% endif %} + {% if row.overfit %}overfit{% endif %} + {% if row.proposal_failed %}failed{% endif %} + {{ "%.1f"|format(row.train_pass_rate * 100) }}%{{ "%.1f"|format(row.test_pass_rate * 100) }}%{{ row.train_tokens_total }}{{ row.test_tokens_total }} + {% if row.proposal_failed %} + proposal failed — no candidate + {% elif row.diff_lines %} +
+ {% for dl in row.diff_lines %} + {% if dl.kind == "add" %}{{ dl.text | e }} + {% elif dl.kind == "rem" %}{{ dl.text | e }} + {% elif dl.kind == "sep" %}{{ dl.text | e }} + {% else %}{{ dl.text | e }} + {% endif %} + {% endfor %} +
+ {% else %} +
{{ row.description | e }}
+ {% endif %} +
+ + +""" + + +# --------------------------------------------------------------------------- +# Diff helpers +# --------------------------------------------------------------------------- + + +class _DiffLine: + """One diff line with a kind tag for template branching.""" + + __slots__ = ("kind", "text") + + def __init__(self, kind: str, text: str) -> None: + self.kind = kind + self.text = text + + +def _compute_diff(prior: str, current: str) -> list[_DiffLine]: + """Return unified-diff lines tagged by kind: add / rem / ctx / sep.""" + prior_lines = prior.splitlines(keepends=True) + current_lines = current.splitlines(keepends=True) + diff = list( + difflib.unified_diff(prior_lines, current_lines, lineterm="", n=2) + ) + if not diff: + # identical — show nothing; caller will fall back to plain description + return [] + + result: list[_DiffLine] = [] + for raw in diff: + if raw.startswith("+++") or raw.startswith("---"): + # skip file-header lines produced by unified_diff + continue + if raw.startswith("@@"): + result.append(_DiffLine("sep", raw.rstrip("\n"))) + elif raw.startswith("+"): + result.append(_DiffLine("add", raw[1:].rstrip("\n"))) + elif raw.startswith("-"): + result.append(_DiffLine("rem", raw[1:].rstrip("\n"))) + else: + result.append(_DiffLine("ctx", raw.rstrip("\n"))) + return result + + +# --------------------------------------------------------------------------- +# Row builder +# --------------------------------------------------------------------------- + + +def _build_rows(result: OptimizeResult) -> list[dict[str, object]]: + """Build per-iteration row dicts for template rendering.""" + rows: list[dict[str, object]] = [] + # Always a str: seeded from baseline_description and only ever reassigned to a + # non-None candidate_description below (so the baseline row diffs against itself). + prior_desc: str = result.baseline_description + + for rec in result.iterations: + diff_lines: list[_DiffLine] = [] + description = rec.candidate_description or "" + + if rec.proposal_failed or rec.candidate_description is None: + # No candidate — nothing to diff + pass + elif rec.iteration == 0: + # Baseline iteration has no prior — show the description as-is + pass + else: + diff_lines = _compute_diff(prior_desc, rec.candidate_description) + + row: dict[str, object] = { + "iteration": rec.iteration, + "train_pass_rate": rec.train_pass_rate, + "test_pass_rate": rec.test_pass_rate, + "train_tokens_total": rec.train_tokens_total, + "test_tokens_total": rec.test_tokens_total, + "selected": rec.selected, + "overfit": rec.overfit, + "proposal_failed": rec.proposal_failed, + "diff_lines": diff_lines, + "description": description, + } + rows.append(row) + + # Advance prior only when a valid candidate exists + if rec.candidate_description is not None and not rec.proposal_failed: + prior_desc = rec.candidate_description + + return rows + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def render_html(result: OptimizeResult) -> str: + """Render an OptimizeResult to an HTML string. + + Uses jinja2.Environment(autoescape=True) so candidate_description values + (untrusted claude -p output) are HTML-escaped automatically. + """ + import jinja2 # noqa: PLC0415 + + rows = _build_rows(result) + env = jinja2.Environment(autoescape=True) # SECURITY: autoescape mandatory + tmpl = env.from_string(_HTML) + return tmpl.render(result=result, rows=rows) + + +def render_to_path(result: OptimizeResult, html_path: Path) -> None: + """Render result to HTML and write to html_path.""" + html_path.write_text(render_html(result), encoding="utf-8") diff --git a/src/mapify_cli/templates/skills/map-plan/SKILL.md b/src/mapify_cli/templates/skills/map-plan/SKILL.md index 270c024a..ec40bc40 100644 --- a/src/mapify_cli/templates/skills/map-plan/SKILL.md +++ b/src/mapify_cli/templates/skills/map-plan/SKILL.md @@ -1,7 +1,7 @@ --- name: map-plan description: | - ARCHITECT phase only: decompose a complex task into atomic subtasks via task-decomposer. Use when starting a feature, refactor, or complex bug fix and you need a plan first. Do NOT use to execute work; use map-task or map-efficient. + ARCHITECT phase only: produce an upfront plan by decomposing a complex task into atomic subtasks with clear dependencies, via task-decomposer. Use when the user asks to plan, create a structured plan, break down, decompose, or stage work — e.g. planning a feature, refactor, migration, API/versioning upgrade, or incremental/phased rollout into smaller independent steps before any code is written. Trigger on phrasing like "plan a…", "create a plan for…", "decompose…into tasks", "break this into steps", "roll out incrementally", or "smaller independent steps". Do NOT use to execute work; use map-task or map-efficient. effort: high argument-hint: "[task description]" --- diff --git a/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md index 567ac04f..1aaacc89 100644 --- a/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md +++ b/src/mapify_cli/templates/skills/map-skill-eval/SKILL.md @@ -87,6 +87,60 @@ mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume - **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs//.jsonl`. If no prior run exists, omit `--resume` to start fresh. - **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd. +## Optimize a skill description + +Anti-overfit description optimizer: deterministic 60/40 train/test split, up to N iterations (iteration 0 = baseline = current description). Selects the candidate with the highest held-out TEST pass-rate; an overfit candidate (train pass-rate up, test pass-rate down) is flagged and never selected. + +```bash +mapify skill-eval optimize --eval-set PATH [--iterations N] [--apply] [--open] [--dry-run] +``` + +- `` — skill to optimize (e.g. `map-plan`). +- `--eval-set PATH` — eval-set JSON with `>= 5` entries (a 60/40 split needs `n_test >= 3`; a smaller set exits with code 2, spending zero quota). +- `--iterations N` — maximum optimization iterations (default: 5). Iteration 0 is the baseline. +- `--apply` — patch the winning description into the SKILL.md frontmatter `description:` of `templates_src/skills//SKILL.md.jinja` and re-render so generated trees stay byte-identical; the change is staged, not committed. `skill-rules.json` `description` is NOT auto-patched (update it by hand). Two no-op cases: "No improvement found" (baseline already optimal) and "Winner identical to current". +- `--open` — open the HTML report in the browser after the run (best-effort; never errors the run). +- `--dry-run` — print the planned call budget (iterations × (n_train + n_test) dispatch calls + iterations proposer calls) and `model: default (resolved by claude CLI)`, then exit 0 spending zero quota. + +Writes a durable `OptimizeResult` JSON and an HTML report to `.map/eval-runs//-optimize.json` and `-optimize.html`. + +Default mode is propose-only: nothing outside `.map/` is modified. + +### Examples + +```bash +# Preview quota usage without spending any +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --dry-run + +# Run 3 optimization iterations and open the HTML report +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --iterations 3 --open + +# Run, then auto-apply the winning description if improvement found +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --apply +``` + +## View an optimization report + +Renders the latest (or a specified `--result`) stored `OptimizeResult` JSON as an HTML report. + +```bash +mapify skill-eval view [--result PATH] [--open] +``` + +- `` — skill whose optimization results to view. +- `--result PATH` — path to a specific `*-optimize.json` result file; defaults to the latest in `.map/eval-runs//`. +- `--open` — open the rendered HTML report in the browser. + +### Examples + +```bash +# View the latest optimization report for map-plan +mapify skill-eval view map-plan + +# Open a specific result file in the browser +mapify skill-eval view map-plan --result .map/eval-runs/map-plan/20260601T120000-optimize.json --open +``` + ## Related Commands - `/map-plan` — plan and decompose tasks. diff --git a/src/mapify_cli/templates/skills/skill-rules.json b/src/mapify_cli/templates/skills/skill-rules.json index d5a96063..b8e18828 100644 --- a/src/mapify_cli/templates/skills/skill-rules.json +++ b/src/mapify_cli/templates/skills/skill-rules.json @@ -244,11 +244,11 @@ "skillClass": "task", "enforcement": "manual", "priority": "medium", - "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", + "description": "Evaluate a /map-* skill's trigger accuracy + cost, OR optimize its description via anti-overfit held-out selection, via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", "requires-cmd": ["claude"], "promptTriggers": { - "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"], - "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"] + "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering","optimize skill","skill optimize","description optimizer","optimize description"], + "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger","(optimize|improve).*(skill).*(description|trigger)"] } }, "map-task": { diff --git a/src/mapify_cli/templates_src/skills/map-plan/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-plan/SKILL.md.jinja index 270c024a..ec40bc40 100644 --- a/src/mapify_cli/templates_src/skills/map-plan/SKILL.md.jinja +++ b/src/mapify_cli/templates_src/skills/map-plan/SKILL.md.jinja @@ -1,7 +1,7 @@ --- name: map-plan description: | - ARCHITECT phase only: decompose a complex task into atomic subtasks via task-decomposer. Use when starting a feature, refactor, or complex bug fix and you need a plan first. Do NOT use to execute work; use map-task or map-efficient. + ARCHITECT phase only: produce an upfront plan by decomposing a complex task into atomic subtasks with clear dependencies, via task-decomposer. Use when the user asks to plan, create a structured plan, break down, decompose, or stage work — e.g. planning a feature, refactor, migration, API/versioning upgrade, or incremental/phased rollout into smaller independent steps before any code is written. Trigger on phrasing like "plan a…", "create a plan for…", "decompose…into tasks", "break this into steps", "roll out incrementally", or "smaller independent steps". Do NOT use to execute work; use map-task or map-efficient. effort: high argument-hint: "[task description]" --- diff --git a/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja index 567ac04f..1aaacc89 100644 --- a/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja +++ b/src/mapify_cli/templates_src/skills/map-skill-eval/SKILL.md.jinja @@ -87,6 +87,60 @@ mapify skill-eval run map-plan --eval-set .map/evals/map-plan.json --resume - **Run log not found for `--resume`** — `--resume` looks for the latest `.map/eval-runs//.jsonl`. If no prior run exists, omit `--resume` to start fresh. - **All cases report `not_trigger` unexpectedly** — verify the skill name matches exactly (e.g. `map-plan`, not `map_plan`) and that `.claude/` was seeded correctly in the temp cwd. +## Optimize a skill description + +Anti-overfit description optimizer: deterministic 60/40 train/test split, up to N iterations (iteration 0 = baseline = current description). Selects the candidate with the highest held-out TEST pass-rate; an overfit candidate (train pass-rate up, test pass-rate down) is flagged and never selected. + +```bash +mapify skill-eval optimize --eval-set PATH [--iterations N] [--apply] [--open] [--dry-run] +``` + +- `` — skill to optimize (e.g. `map-plan`). +- `--eval-set PATH` — eval-set JSON with `>= 5` entries (a 60/40 split needs `n_test >= 3`; a smaller set exits with code 2, spending zero quota). +- `--iterations N` — maximum optimization iterations (default: 5). Iteration 0 is the baseline. +- `--apply` — patch the winning description into the SKILL.md frontmatter `description:` of `templates_src/skills//SKILL.md.jinja` and re-render so generated trees stay byte-identical; the change is staged, not committed. `skill-rules.json` `description` is NOT auto-patched (update it by hand). Two no-op cases: "No improvement found" (baseline already optimal) and "Winner identical to current". +- `--open` — open the HTML report in the browser after the run (best-effort; never errors the run). +- `--dry-run` — print the planned call budget (iterations × (n_train + n_test) dispatch calls + iterations proposer calls) and `model: default (resolved by claude CLI)`, then exit 0 spending zero quota. + +Writes a durable `OptimizeResult` JSON and an HTML report to `.map/eval-runs//-optimize.json` and `-optimize.html`. + +Default mode is propose-only: nothing outside `.map/` is modified. + +### Examples + +```bash +# Preview quota usage without spending any +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --dry-run + +# Run 3 optimization iterations and open the HTML report +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --iterations 3 --open + +# Run, then auto-apply the winning description if improvement found +mapify skill-eval optimize map-plan --eval-set .map/evals/map-plan.json --apply +``` + +## View an optimization report + +Renders the latest (or a specified `--result`) stored `OptimizeResult` JSON as an HTML report. + +```bash +mapify skill-eval view [--result PATH] [--open] +``` + +- `` — skill whose optimization results to view. +- `--result PATH` — path to a specific `*-optimize.json` result file; defaults to the latest in `.map/eval-runs//`. +- `--open` — open the rendered HTML report in the browser. + +### Examples + +```bash +# View the latest optimization report for map-plan +mapify skill-eval view map-plan + +# Open a specific result file in the browser +mapify skill-eval view map-plan --result .map/eval-runs/map-plan/20260601T120000-optimize.json --open +``` + ## Related Commands - `/map-plan` — plan and decompose tasks. diff --git a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja index d5a96063..b8e18828 100644 --- a/src/mapify_cli/templates_src/skills/skill-rules.json.jinja +++ b/src/mapify_cli/templates_src/skills/skill-rules.json.jinja @@ -244,11 +244,11 @@ "skillClass": "task", "enforcement": "manual", "priority": "medium", - "description": "Evaluate a /map-* skill's trigger accuracy + cost via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", + "description": "Evaluate a /map-* skill's trigger accuracy + cost, OR optimize its description via anti-overfit held-out selection, via mapify skill-eval (claude -p matrix, deterministic assertions, durable resumable runs).", "requires-cmd": ["claude"], "promptTriggers": { - "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering"], - "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger"] + "keywords": ["map-skill-eval","skill-eval","skill eval","evaluate skill","trigger accuracy","skill triggering","optimize skill","skill optimize","description optimizer","optimize description"], + "intentPatterns": ["map-skill-eval","(eval|evaluate|measure|test).*(skill).*(trigger|fire|cost)","does .* skill trigger","(optimize|improve).*(skill).*(description|trigger)"] } }, "map-task": { diff --git a/tests/skills_eval/fixtures/README.md b/tests/skills_eval/fixtures/README.md new file mode 100644 index 00000000..60666f90 --- /dev/null +++ b/tests/skills_eval/fixtures/README.md @@ -0,0 +1,66 @@ +# Skill Eval Fixtures + +This directory contains JSON eval-set fixtures for the MAP Framework skill evaluation system. + +## Fixture Types + +### Smoke Fixture (do not modify) + +- `map_debug_eval_set.json` — 3-entry smoke fixture used by fast unit tests. + This file exists to validate the `load_eval_set` / `run_eval` pipeline with + minimal prompt count. **Do not add entries or rename it** — tests pin the exact + entry count (3) and any change will break the smoke gate. + +### Optimizer Eval-Sets + +The following fixtures are consumed by `skill-eval optimize` to run the +anti-overfit description optimizer: + +- `map_plan_optimize_eval_set.json` — eval-set for the `map-plan` skill + (task decomposition / feature planning prompts) +- `map_efficient_optimize_eval_set.json` — eval-set for the `map-efficient` skill + (MAP build-loop execution / subtask-apply prompts) +- `map_debug_optimize_eval_set.json` — larger eval-set for the `map-debug` skill + (failure diagnosis / crash investigation prompts, wider variety than the smoke fixture) + +## Why >= 8 Entries per Optimizer Fixture + +The description optimizer uses a deterministic 60/40 train/test split: + +``` +n_test = max(1, round(n * 0.4)) +``` + +With `n = 8` this gives `n_test = 3`, which is the minimum for a meaningful +held-out pass-rate signal. Smaller fixtures yield `n_test <= 2`, making the +held-out score a 0/1 coin-flip that cannot distinguish a good description +candidate from a bad one. The >= 8 floor ensures the optimizer has a statistically +useful held-out set while keeping fixture authoring lightweight. + +## Authoring New Fixtures + +Each JSON file must follow this schema: + +```json +{ + "entries": [ + { + "prompt": "", + "should_trigger": "", + "assertions": [{"type": "contains", "value": ""}] + }, + { + "prompt": "", + "should_not_trigger": "" + } + ] +} +``` + +Rules: +- `prompt` is required on every entry. +- Use `should_trigger` XOR `should_not_trigger` (or neither) per entry — not both. +- `assertions` is optional; when present, values should be lowercase substrings + that genuinely appear in the prompt text. +- Include at least 1-2 `should_not_trigger` negatives to exercise the rejection path. +- Target 8-10 entries total to satisfy the >= 8 optimizer sizing requirement. diff --git a/tests/skills_eval/fixtures/map_debug_optimize_eval_set.json b/tests/skills_eval/fixtures/map_debug_optimize_eval_set.json new file mode 100644 index 00000000..638b1b13 --- /dev/null +++ b/tests/skills_eval/fixtures/map_debug_optimize_eval_set.json @@ -0,0 +1,65 @@ +{ + "entries": [ + { + "prompt": "My Python service crashes with a KeyError on startup. Help me diagnose the root cause.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "crashes"} + ] + }, + { + "prompt": "I have a failing pytest test with a mysterious AssertionError and I cannot figure out why.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "failing"} + ] + }, + { + "prompt": "My CI pipeline is suddenly failing after a dependency upgrade. Help me track down the regression.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "regression"} + ] + }, + { + "prompt": "There is a segmentation fault in my C extension. Walk me through diagnosing the memory error.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "segmentation"} + ] + }, + { + "prompt": "The API returns 500 intermittently under load. Help me investigate the root cause.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "intermittently"} + ] + }, + { + "prompt": "My hook script silently exits without producing any output. I need to debug why.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "debug"} + ] + }, + { + "prompt": "There is a data corruption bug that only manifests in production. Help me isolate the cause.", + "should_trigger": "map-debug" + }, + { + "prompt": "My async task queue stops processing after a few hours with no error logged. Diagnose it.", + "should_trigger": "map-debug", + "assertions": [ + {"type": "contains", "value": "diagnose"} + ] + }, + { + "prompt": "Write a function that reverses a string.", + "should_not_trigger": "map-debug" + }, + { + "prompt": "Summarize the last three commits.", + "should_not_trigger": "map-debug" + } + ] +} diff --git a/tests/skills_eval/fixtures/map_efficient_optimize_eval_set.json b/tests/skills_eval/fixtures/map_efficient_optimize_eval_set.json new file mode 100644 index 00000000..31dfd279 --- /dev/null +++ b/tests/skills_eval/fixtures/map_efficient_optimize_eval_set.json @@ -0,0 +1,55 @@ +{ + "entries": [ + { + "prompt": "Execute the next subtask in the MAP build loop and report back.", + "should_trigger": "map-efficient", + "assertions": [ + {"type": "contains", "value": "execute"} + ] + }, + { + "prompt": "Run the planned subtask ST-003 and apply the code changes.", + "should_trigger": "map-efficient", + "assertions": [ + {"type": "contains", "value": "run"} + ] + }, + { + "prompt": "Implement the database migration step from the current plan.", + "should_trigger": "map-efficient", + "assertions": [ + {"type": "contains", "value": "implement"} + ] + }, + { + "prompt": "Continue with the next wave of subtasks in our feature rollout plan.", + "should_trigger": "map-efficient", + "assertions": [ + {"type": "contains", "value": "continue"} + ] + }, + { + "prompt": "Apply the code changes for the authentication refactor subtask.", + "should_trigger": "map-efficient" + }, + { + "prompt": "Advance the MAP workflow to the next subtask and execute it.", + "should_trigger": "map-efficient", + "assertions": [ + {"type": "contains", "value": "advance"} + ] + }, + { + "prompt": "Complete the current batch of subtasks from our migration blueprint.", + "should_trigger": "map-efficient" + }, + { + "prompt": "What is the capital of France?", + "should_not_trigger": "map-efficient" + }, + { + "prompt": "Explain how hash tables work.", + "should_not_trigger": "map-efficient" + } + ] +} diff --git a/tests/skills_eval/fixtures/map_plan_optimize_eval_set.json b/tests/skills_eval/fixtures/map_plan_optimize_eval_set.json new file mode 100644 index 00000000..d27bafee --- /dev/null +++ b/tests/skills_eval/fixtures/map_plan_optimize_eval_set.json @@ -0,0 +1,55 @@ +{ + "entries": [ + { + "prompt": "I need to decompose the new user authentication feature into atomic subtasks for my team.", + "should_trigger": "map-plan", + "assertions": [ + {"type": "contains", "value": "decompose"} + ] + }, + { + "prompt": "Help me plan a refactor of the database layer into smaller independent steps.", + "should_trigger": "map-plan", + "assertions": [ + {"type": "contains", "value": "plan"} + ] + }, + { + "prompt": "Break down the migration of our monolith to microservices into a sequence of subtasks.", + "should_trigger": "map-plan", + "assertions": [ + {"type": "contains", "value": "migration"} + ] + }, + { + "prompt": "I want to create a structured plan for implementing the new payment integration feature.", + "should_trigger": "map-plan" + }, + { + "prompt": "Decompose the API versioning upgrade project into atomic tasks with clear dependencies.", + "should_trigger": "map-plan", + "assertions": [ + {"type": "contains", "value": "atomic"} + ] + }, + { + "prompt": "Help me break the search feature redesign into manageable subtasks for my sprint.", + "should_trigger": "map-plan" + }, + { + "prompt": "I need a plan for incrementally rolling out the new permissions system without breaking existing users.", + "should_trigger": "map-plan", + "assertions": [ + {"type": "contains", "value": "incrementally"} + ] + }, + { + "prompt": "What is 2 + 2?", + "should_not_trigger": "map-plan" + }, + { + "prompt": "Fix the typo in the README file.", + "should_not_trigger": "map-plan" + } + ] +} diff --git a/tests/test_inv1_no_anthropic_optimize.py b/tests/test_inv1_no_anthropic_optimize.py new file mode 100644 index 00000000..f293fe60 --- /dev/null +++ b/tests/test_inv1_no_anthropic_optimize.py @@ -0,0 +1,107 @@ +"""Guard test: VC1 [INV-1/HC-2/AC-10] — no anthropic import / no ANTHROPIC_API_KEY +in the optimization pipeline modules. + +ST-002: covers proposer.py. +ST-009: extend MODULES list to add optimizer.py, viewer.py, apply_patcher.py + once those modules exist. + +Structure: a single parametrized test over a MODULES list so ST-009 can extend +it with one line per new module. +""" +from __future__ import annotations + +import ast +from pathlib import Path + +import pytest + +# --------------------------------------------------------------------------- +# Module list — extend here in ST-009 +# --------------------------------------------------------------------------- + +_SKILLS_EVAL_ROOT = ( + Path(__file__).parent.parent / "src" / "mapify_cli" / "skills_eval" +) + +MODULES: list[Path] = [ + _SKILLS_EVAL_ROOT / "proposer.py", + _SKILLS_EVAL_ROOT / "description_optimizer.py", + _SKILLS_EVAL_ROOT / "viewer.py", + _SKILLS_EVAL_ROOT / "apply_patcher.py", +] + + +# --------------------------------------------------------------------------- +# Sentinel: guard against an empty MODULES list producing a vacuous green run +# --------------------------------------------------------------------------- + + +def test_modules_list_is_non_empty() -> None: + """Discovery sentinel: MODULES must contain at least one entry.""" + assert MODULES, "MODULES list is empty — guard test would pass vacuously" + for mod in MODULES: + assert mod.exists(), f"Module listed in MODULES does not exist: {mod}" + + +# --------------------------------------------------------------------------- +# Parametrized guard +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("module_path", MODULES, ids=[m.name for m in MODULES]) +def test_vc1_no_anthropic_import(module_path: Path) -> None: + """VC1 [INV-1/HC-2]: no 'import anthropic' / 'from anthropic' in module source.""" + source = module_path.read_text(encoding="utf-8") + tree = ast.parse(source, filename=str(module_path)) + + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + assert "anthropic" not in (alias.name or ""), ( + f"Found 'import anthropic' in {module_path.name}: {alias.name!r}" + ) + elif isinstance(node, ast.ImportFrom): + module_name = node.module or "" + assert "anthropic" not in module_name, ( + f"Found 'from anthropic' import in {module_path.name}: " + f"from {module_name!r}" + ) + + +@pytest.mark.parametrize("module_path", MODULES, ids=[m.name for m in MODULES]) +def test_vc1_no_anthropic_api_key(module_path: Path) -> None: + """VC1 [AC-10]: no ANTHROPIC_API_KEY env access in module source. + + Checks os.environ["ANTHROPIC_API_KEY"], os.environ.get("ANTHROPIC_API_KEY"), + and os.getenv("ANTHROPIC_API_KEY") via AST walk over all Call/Subscript nodes. + """ + source = module_path.read_text(encoding="utf-8") + tree = ast.parse(source, filename=str(module_path)) + + for node in ast.walk(tree): + # os.environ["ANTHROPIC_API_KEY"] + if isinstance(node, ast.Subscript): + if isinstance(node.value, ast.Attribute) and node.value.attr == "environ": + key_node = node.slice + if isinstance(key_node, ast.Constant) and isinstance( + key_node.value, str + ): + assert "ANTHROPIC_API_KEY" not in key_node.value, ( + f"Found ANTHROPIC_API_KEY env subscript in {module_path.name}" + ) + + # os.getenv("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_API_KEY") + if isinstance(node, ast.Call): + func = node.func + is_env_get = isinstance(func, ast.Attribute) and func.attr in ( + "getenv", + "get", + ) + if is_env_get and node.args: + first_arg = node.args[0] + if isinstance(first_arg, ast.Constant) and isinstance( + first_arg.value, str + ): + assert "ANTHROPIC_API_KEY" not in first_arg.value, ( + f"Found ANTHROPIC_API_KEY env read in {module_path.name}" + ) diff --git a/tests/test_skills.py b/tests/test_skills.py index a80e4d2d..86d5f2d8 100644 --- a/tests/test_skills.py +++ b/tests/test_skills.py @@ -307,14 +307,25 @@ def test_descriptions_include_negative_triggers(self, skills_dir, skill_folders) def test_descriptions_fit_claude_skill_listing_limit( self, skills_dir, skill_folders ): - """Descriptions should stay under the 250-char Claude listing limit.""" + """Descriptions must fit the Agent Skills `description` spec limit (1024 chars). + + The 1024-char cap is the documented Agent Skills maximum for the + `description` field (the official skill-creator validates against it too). + It is NOT the old 250-char number: Claude Code truncated descriptions at + 250 in v2.1.86, raised the cap to 1536 in v2.1.105, then removed the + per-description cap in v2.1.129+ (usage-ranked listing budget). So 250 was + a transient version cap; current Claude Code loads the full description + (up to the spec's 1024) for triggering. + Refs: github.com/anthropics/claude-code issues #40121 / #47627; + code.claude.com/docs/en/skills. + """ for folder in skill_folders: skill_file = skills_dir / folder / "SKILL.md" fm = self._parse_frontmatter(skill_file) desc = fm.get("description", "") - assert len(desc) <= 250, ( + assert len(desc) <= 1024, ( f"Skill '{folder}' description is {len(desc)} chars; " - "keep it at or under 250 chars to avoid UI truncation." + "keep it at or under the 1024-char Agent Skills spec limit." ) def test_frontmatter_uses_supported_fields(self, skills_dir, skill_folders): diff --git a/tests/test_skills_eval_apply.py b/tests/test_skills_eval_apply.py new file mode 100644 index 00000000..1cdf29d8 --- /dev/null +++ b/tests/test_skills_eval_apply.py @@ -0,0 +1,428 @@ +"""Tests for apply_patcher.py (ST-006). + +Covers VC1 (frontmatter description patched in rendered .claude/), VC2 (rendered +trees byte-identical after apply), VC3 (two distinct no-op messages), and VC4 +(fail-loud on missing frontmatter / description key / bad paths). + +CRITICAL: tests NEVER patch or render the real repo's templates_src / .claude / +.codex. Every test that touches the render pipeline seeds a TEMP copy of +templates_src at a tmp_path and injects repo_root=tmp_path to all calls. +""" + +from __future__ import annotations + +import shutil +from pathlib import Path + +import pytest + +from mapify_cli.skills_eval.apply_patcher import ( + apply_optimized_description, + patch_skill_description, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# Real repo root — used ONLY for seeding temp copies; never for patching/rendering. +_REAL = Path(__file__).resolve().parents[1] + +# Skill used throughout the tests (must exist in templates_src). +_SKILL = "map-skill-eval" + + +def _seed_templates_src(tmp_path: Path) -> Path: + """Copy the real templates_src tree into tmp_path; return the copy root.""" + src = _REAL / "src" / "mapify_cli" / "templates_src" + dst = tmp_path / "src" / "mapify_cli" / "templates_src" + shutil.copytree(src, dst) + return tmp_path + + +def _jinja_path(tmp_root: Path, skill: str = _SKILL) -> Path: + return ( + tmp_root + / "src" + / "mapify_cli" + / "templates_src" + / "skills" + / skill + / "SKILL.md.jinja" + ) + + +def _read_frontmatter_description(skill_md: Path) -> str: + """Parse the YAML frontmatter via skill_ir.parse_frontmatter and return description.""" + from mapify_cli.skill_ir import parse_frontmatter # noqa: PLC0415 + + text = skill_md.read_text(encoding="utf-8") + # Extract text between the two --- + assert text.startswith("---\n"), f"No frontmatter in {skill_md}" + close_idx = text.find("\n---", 3) + assert close_idx != -1, f"No closing --- in {skill_md}" + fm_text = text[4:close_idx] + parsed = parse_frontmatter(fm_text) + return str(parsed.get("description", "")).strip() + + +# --------------------------------------------------------------------------- +# VC4: patch_skill_description — happy path +# --------------------------------------------------------------------------- + + +def test_vc4_patch_block_scalar_body_replaced(tmp_path: Path) -> None: + """Happy path: description body is replaced; other keys and body are untouched.""" + jinja = _jinja_path(_seed_templates_src(tmp_path)) + original = jinja.read_text(encoding="utf-8") + new_desc = "A brand-new description for testing." + + patch_skill_description(jinja, new_desc) + + patched = jinja.read_text(encoding="utf-8") + # Parse back and confirm + result = _read_frontmatter_description(jinja) + assert result == new_desc, f"Got: {result!r}" + + # Other keys must still be present + assert "effort:" in patched + assert "disable-model-invocation:" in patched + assert "argument-hint:" in patched + + # Body after closing --- is preserved + orig_body_start = original.find("\n---\n") + 5 + orig_body = original[orig_body_start:] + patch_body_start = patched.find("\n---\n") + 5 + patch_body = patched[patch_body_start:] + assert patch_body == orig_body, "Body after frontmatter must be unchanged" + + +def test_vc4_patch_multiline_description_round_trips(tmp_path: Path) -> None: + """A winner with newlines and special chars round-trips to exactly new_desc.""" + jinja = _jinja_path(_seed_templates_src(tmp_path)) + new_desc = ( + "Line one of a multi-line description.\n" + "Line two: disable-model-invocation: true\n" + "Line three: special chars <>&\"'" + ) + + patch_skill_description(jinja, new_desc) + result = _read_frontmatter_description(jinja) + assert result == new_desc, f"Round-trip mismatch.\nGot: {result!r}\nExpected: {new_desc!r}" + + +def test_vc4_patch_uses_block_scalar_strip_chomping(tmp_path: Path) -> None: + """Patched file uses '|-' (strip chomping) so no trailing newline in YAML parse.""" + jinja = _jinja_path(_seed_templates_src(tmp_path)) + new_desc = "Simple description." + + patch_skill_description(jinja, new_desc) + patched = jinja.read_text(encoding="utf-8") + assert "description: |-" in patched, "Must use '|-' block scalar indicator" + + +# --------------------------------------------------------------------------- +# VC4: patch_skill_description — fail-loud, no partial write +# --------------------------------------------------------------------------- + + +def test_vc4_fail_loud_no_frontmatter(tmp_path: Path) -> None: + """ValueError raised and file unchanged when no YAML frontmatter present.""" + bad_file = tmp_path / "SKILL.md" + original_content = "# No frontmatter here\nSome body." + bad_file.write_text(original_content, encoding="utf-8") + + with pytest.raises(ValueError, match="frontmatter"): + patch_skill_description(bad_file, "irrelevant") + + # File must be completely unchanged + assert bad_file.read_text(encoding="utf-8") == original_content + + +def test_vc4_fail_loud_missing_description_key(tmp_path: Path) -> None: + """ValueError raised and file unchanged when description: key is absent.""" + content = "---\nname: test-skill\neffort: low\n---\n# Body\n" + bad_file = tmp_path / "SKILL.md" + bad_file.write_text(content, encoding="utf-8") + + with pytest.raises(ValueError, match="description"): + patch_skill_description(bad_file, "irrelevant") + + assert bad_file.read_text(encoding="utf-8") == content + + +def test_vc4_fail_loud_no_closing_fence(tmp_path: Path) -> None: + """ValueError raised when there is no closing --- in frontmatter.""" + content = "---\nname: test-skill\ndescription: foo\n" + bad_file = tmp_path / "SKILL.md" + bad_file.write_text(content, encoding="utf-8") + + with pytest.raises(ValueError): + patch_skill_description(bad_file, "irrelevant") + + assert bad_file.read_text(encoding="utf-8") == content + + +# --------------------------------------------------------------------------- +# VC4: apply_optimized_description — path safety +# --------------------------------------------------------------------------- + + +def test_vc4_path_outside_templates_src_rejected(tmp_path: Path) -> None: + """ValueError raised when skill would resolve outside templates_src.""" + repo_root = _seed_templates_src(tmp_path) + + # Use path traversal in skill name + with pytest.raises((ValueError, FileNotFoundError)): + apply_optimized_description( + skill="../../evil", + winner="new desc", + current_description="old desc", + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + +def test_vc4_missing_jinja_raises_file_not_found(tmp_path: Path) -> None: + """FileNotFoundError when the skill does not exist in templates_src.""" + repo_root = _seed_templates_src(tmp_path) + + with pytest.raises(FileNotFoundError): + apply_optimized_description( + skill="nonexistent-skill-xyz", + winner="new desc", + current_description="old desc", + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + +def test_vc4_git_path_in_parts_rejected(tmp_path: Path) -> None: + """A target whose resolved path contains '.git' is rejected (under templates_src). + + Path-safety runs BEFORE the existence check, so this raises even though no such + file exists — the '.git' segment is what triggers the ValueError, not relative_to. + """ + repo_root = _seed_templates_src(tmp_path) + + with pytest.raises(ValueError, match=r"\.git"): + apply_optimized_description( + skill=".git/evil", + winner="new desc", + current_description="old desc", + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + +def test_vc4_multiparagraph_block_scalar_fully_replaced(tmp_path: Path) -> None: + """A multi-paragraph '|' block (interior blank lines) is fully replaced. + + The body-scan must consume interior blank lines as part of the block scalar, + so the orphaned second paragraph is gone after patch and the next key survives. + """ + skill_md = tmp_path / "SKILL.md" + skill_md.write_text( + "---\n" + "name: foo\n" + "description: |\n" + " Paragraph one.\n" + "\n" + " Paragraph two after a blank line.\n" + "effort: medium\n" + "---\n" + "# body kept verbatim\n", + encoding="utf-8", + ) + + patch_skill_description(skill_md, "Replacement description.") + + patched = skill_md.read_text(encoding="utf-8") + assert _read_frontmatter_description(skill_md) == "Replacement description." + # The second paragraph must be fully gone (block was consumed, not truncated). + assert "Paragraph two after a blank line." not in patched + assert "Paragraph one." not in patched + # The next key and the body are intact. + assert "effort: medium" in patched + assert "# body kept verbatim" in patched + + +# --------------------------------------------------------------------------- +# VC3: no-op outcomes — two distinct messages +# --------------------------------------------------------------------------- + + +def test_vc3_no_improvement_message_and_no_write( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + """no_improvement=True prints exact message; .jinja content and mtime unchanged.""" + repo_root = _seed_templates_src(tmp_path) + jinja = _jinja_path(repo_root) + original_content = jinja.read_text(encoding="utf-8") + original_mtime = jinja.stat().st_mtime + + result = apply_optimized_description( + skill=_SKILL, + winner="some winner", + current_description="old desc", + no_improvement=True, + repo_root=repo_root, + stage=False, + ) + + captured = capsys.readouterr() + assert result == "no_improvement" + assert "No improvement found: current description already optimal on held-out test" in captured.out + # File must be completely untouched + assert jinja.read_text(encoding="utf-8") == original_content + assert jinja.stat().st_mtime == original_mtime + + +def test_vc3_winner_identical_message_and_no_write( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + """winner == current_description prints exact message; .jinja unchanged.""" + repo_root = _seed_templates_src(tmp_path) + jinja = _jinja_path(repo_root) + original_content = jinja.read_text(encoding="utf-8") + original_mtime = jinja.stat().st_mtime + + current_desc = "Same description text." + + result = apply_optimized_description( + skill=_SKILL, + winner=current_desc, + current_description=current_desc, + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + captured = capsys.readouterr() + assert result == "identical" + assert "Winner identical to current description; no file changes" in captured.out + assert jinja.read_text(encoding="utf-8") == original_content + assert jinja.stat().st_mtime == original_mtime + + +# --------------------------------------------------------------------------- +# VC1 + VC2: full apply with temp repo +# --------------------------------------------------------------------------- + + +def test_vc1_rendered_claude_skill_md_description_equals_winner(tmp_path: Path) -> None: + """VC1: after apply, .claude/skills//SKILL.md frontmatter description == winner.""" + repo_root = _seed_templates_src(tmp_path) + winner = "A completely new optimized description for the skill." + + result = apply_optimized_description( + skill=_SKILL, + winner=winner, + current_description="something different", + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + assert result == "applied" + + # Check the rendered .claude/skills/map-skill-eval/SKILL.md + rendered_skill_md = repo_root / ".claude" / "skills" / _SKILL / "SKILL.md" + assert rendered_skill_md.exists(), f"Expected rendered file: {rendered_skill_md}" + + rendered_desc = _read_frontmatter_description(rendered_skill_md) + assert rendered_desc == winner, ( + f"Rendered description mismatch.\nGot: {rendered_desc!r}\nExpected: {winner!r}" + ) + + +def test_vc2_rendered_trees_byte_identical_after_apply(tmp_path: Path) -> None: + """VC2: diff_rendered_trees returns [] for both providers after apply. + + Both repo_root and templates_src_root point to the temp copy so the check + compares a fresh render (from the patched temp source) against the already- + rendered temp output — they must be byte-identical. + """ + from mapify_cli.delivery.template_renderer import diff_rendered_trees # noqa: PLC0415 + + repo_root = _seed_templates_src(tmp_path) + templates_src_root = repo_root / "src" / "mapify_cli" / "templates_src" + winner = "New description for byte-identity test." + + result = apply_optimized_description( + skill=_SKILL, + winner=winner, + current_description="old description value", + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + assert result == "applied" + + # After render, diff_rendered_trees must return [] (byte-identical). + # Pass templates_src_root so diff_rendered_trees reads from the patched temp + # source rather than the installed package's templates_src. + diffs_claude = diff_rendered_trees( + "claude", repo_root=repo_root, templates_src_root=templates_src_root + ) + assert diffs_claude == [], ( + f"claude trees not byte-identical after apply. Diffs: {diffs_claude}" + ) + + diffs_codex = diff_rendered_trees( + "codex", repo_root=repo_root, templates_src_root=templates_src_root + ) + assert diffs_codex == [], ( + f"codex trees not byte-identical after apply. Diffs: {diffs_codex}" + ) + + +def test_vc1_skill_rules_json_description_unchanged(tmp_path: Path) -> None: + """VC1: skill-rules.json description is NOT modified by apply.""" + repo_root = _seed_templates_src(tmp_path) + + # Find the skill-rules.json.jinja and render initial state + from mapify_cli.delivery.template_renderer import render_repo_trees # noqa: PLC0415 + + render_repo_trees("claude", repo_root=repo_root) + + # Read skill-rules.json BEFORE apply + skill_rules = repo_root / ".claude" / "skills" / "skill-rules.json" + if not skill_rules.exists(): + pytest.skip("skill-rules.json not present in rendered output") + + import json as _json # noqa: PLC0415 + + before = _json.loads(skill_rules.read_text(encoding="utf-8")) + # skills is a dict keyed by skill name, each value is a dict with 'description' etc. + skills_before: dict[str, object] = before.get("skills", {}) + skill_entry_before = skills_before.get(_SKILL, {}) + before_desc = skill_entry_before.get("description") if isinstance(skill_entry_before, dict) else None # type: ignore[union-attr] + + winner = "Totally different description that should not appear in skill-rules.json." + + apply_optimized_description( + skill=_SKILL, + winner=winner, + current_description="some old desc", + no_improvement=False, + repo_root=repo_root, + stage=False, + ) + + after = _json.loads(skill_rules.read_text(encoding="utf-8")) + skills_after: dict[str, object] = after.get("skills", {}) + skill_entry_after = skills_after.get(_SKILL, {}) + after_desc = skill_entry_after.get("description") if isinstance(skill_entry_after, dict) else None # type: ignore[union-attr] + + assert before_desc == after_desc, ( + f"skill-rules.json description was modified by apply!\n" + f"Before: {before_desc!r}\nAfter: {after_desc!r}" + ) + assert winner not in str(after_desc), ( + "Winner text must NOT appear in skill-rules.json after apply" + ) diff --git a/tests/test_skills_eval_cli_optimize.py b/tests/test_skills_eval_cli_optimize.py new file mode 100644 index 00000000..f947cf71 --- /dev/null +++ b/tests/test_skills_eval_cli_optimize.py @@ -0,0 +1,633 @@ +"""Tests for `skill-eval optimize` and `skill-eval view` CLI commands (ST-005). + +Validation criteria covered: +- VC1: `optimize --dry-run` prints budget + model line, exits 0, no dispatcher constructed. +- VC2: eval-set with < 5 entries exits 2 with ">= 5" message, BEFORE any dispatcher. +- VC3: claude absent -> exit 1 with "requires-cmd: claude" before any work. +- VC4: full run writes *-optimize.json + *-optimize.html; `view` renders from stored result. +- VC5: --open with a raising webbrowser does NOT error the run (exit stays 0). +- VC6: full run (no --apply) modifies NOTHING outside .map/; apply_patcher never invoked. + +All tests use MockDispatcher + mock proposer — no real claude invocation (INV-2). +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest +from typer.testing import CliRunner + +from mapify_cli import app +from mapify_cli.skills_eval.dispatcher import MockDispatcher +from mapify_cli.token_budget import TokenUsage + +runner = CliRunner() + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +_SKILL = "test-skill" + + +def _which_present(_cmd: object) -> str: + """shutil.which stub: pretend the requested command is on PATH.""" + del _cmd + return "/usr/bin/claude" + + +def _which_absent(_cmd: object) -> None: + """shutil.which stub: pretend the requested command is NOT on PATH.""" + del _cmd + return None + + +def _raise_no_browser(_url: object) -> None: + """webbrowser.open stub that raises — proves --open swallows the error (SC-2).""" + del _url + raise OSError("no browser") + + +_VALID_FRONTMATTER = """\ +--- +name: test-skill +description: "A test skill for optimize CLI tests" +triggers: + - test trigger phrase +--- +Skill body. +""" + + +def _make_eval_set(tmp_path: Path, n: int, skill: str = _SKILL) -> Path: + """Write a minimal eval-set JSON with *n* entries to tmp_path.""" + entries = [ + {"prompt": f"prompt {i}", "should_trigger": skill} + for i in range(n) + ] + p = tmp_path / "eval_set.json" + p.write_text(json.dumps({"entries": entries}), encoding="utf-8") + return p + + +def _make_claude_dir(tmp_path: Path, skill: str = _SKILL) -> Path: + """Create a minimal .claude/skills//SKILL.md tree under tmp_path.""" + skill_dir = tmp_path / ".claude" / "skills" / skill + skill_dir.mkdir(parents=True, exist_ok=True) + (skill_dir / "SKILL.md").write_text(_VALID_FRONTMATTER, encoding="utf-8") + return tmp_path / ".claude" + + +def _mock_dispatcher_factory(skill: str = _SKILL) -> Any: + """Return a MockDispatcher factory callable that ignores source_claude_dir.""" + + class _FactoryMock(MockDispatcher): + def __init__(self, source_claude_dir: Path | None = None, **_kw: Any) -> None: + del source_claude_dir, _kw # intentionally unused in mock + super().__init__( + triggered_skill=skill, + token_usage=TokenUsage( + input_tokens=10, + cache_read_input_tokens=0, + cache_creation_input_tokens=0, + ), + duration_s=0.01, + ) + + return _FactoryMock + + +def _mock_proposer(current_description: str, failing_records: object) -> str: + del current_description, failing_records # intentionally unused in mock + return "improved candidate description" + + +# --------------------------------------------------------------------------- +# VC1: dry-run prints budget + model line, exits 0, NO dispatcher constructed +# --------------------------------------------------------------------------- + + +def test_vc1_dry_run_prints_budget_and_exits_0(tmp_path: Path) -> None: + """VC1: --dry-run prints dispatch count + proposer count + model line; exit 0.""" + eval_set = _make_eval_set(tmp_path, n=5) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--dry-run", + ], + ) + + assert result.exit_code == 0, f"Expected exit 0, got {result.exit_code}.\nOutput:\n{result.stdout}" + # Budget line: "5 x (N+M) = K dispatch calls + 5 proposer calls" + assert "dispatch calls" in result.stdout, f"Missing budget line.\nOutput:\n{result.stdout}" + assert "proposer calls" in result.stdout, f"Missing proposer call count.\nOutput:\n{result.stdout}" + assert "model: default" in result.stdout, f"Missing model line.\nOutput:\n{result.stdout}" + + +def test_vc1_dry_run_with_5_entries_budget_numbers(tmp_path: Path) -> None: + """VC1: with 5 entries and iterations=3, budget numbers must be consistent.""" + from mapify_cli.skills_eval.description_optimizer import _DEFAULT_SEED, split_train_test + from mapify_cli.skills_eval.runner import load_eval_set + + eval_set = _make_eval_set(tmp_path, n=5) + entries = load_eval_set(eval_set) + train, test = split_train_test(entries, _DEFAULT_SEED) + n_train = len(train) + n_test = len(test) + expected_dispatches = 3 * (n_train + n_test) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--dry-run", + "--iterations", "3", + ], + ) + + assert result.exit_code == 0 + assert str(expected_dispatches) in result.stdout, ( + f"Expected {expected_dispatches} in output.\nOutput:\n{result.stdout}" + ) + assert "3" in result.stdout # iterations count visible + + +def test_vc1_dry_run_constructs_no_dispatcher( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC1: dry-run must NOT construct a ClaudeSubprocessDispatcher.""" + eval_set = _make_eval_set(tmp_path, n=5) + + constructed: list[str] = [] + + class _SentinelDispatcher(MockDispatcher): + def __init__(self, **_kw: Any) -> None: + del _kw # intentionally unused in sentinel + constructed.append("constructed") + super().__init__() + + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _SentinelDispatcher, + ) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--dry-run", + ], + ) + + assert result.exit_code == 0 + assert constructed == [], ( + f"Dispatcher was constructed during dry-run! Calls: {constructed}" + ) + + +# --------------------------------------------------------------------------- +# VC2: < 5 entries -> exit 2 with ">= 5" message, BEFORE any dispatcher +# --------------------------------------------------------------------------- + + +def test_vc2_fewer_than_5_entries_exits_2(tmp_path: Path) -> None: + """VC2: eval-set with 4 entries exits 2 with '>= 5' message.""" + eval_set = _make_eval_set(tmp_path, n=4) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + ], + ) + + assert result.exit_code == 2, ( + f"Expected exit 2 for < 5 entries, got {result.exit_code}.\nOutput:\n{result.stdout}" + ) + assert "5" in result.stdout, ( + f"Expected '>= 5' minimum in error message.\nOutput:\n{result.stdout}" + ) + + +def test_vc2_min_size_before_dispatcher( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC2: dispatcher must NOT be constructed when entries < 5 (even non-dry-run).""" + eval_set = _make_eval_set(tmp_path, n=3) + constructed: list[str] = [] + + class _SentinelDispatcher(MockDispatcher): + def __init__(self, **_kw: Any) -> None: + del _kw # intentionally unused in sentinel + constructed.append("constructed") + super().__init__() + + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _SentinelDispatcher, + ) + # Even if claude were on PATH, dispatcher must not be constructed before min-size check + monkeypatch.setattr("shutil.which", _which_present) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + ], + ) + + assert result.exit_code == 2 + assert constructed == [], ( + f"Dispatcher was constructed before min-size check! Calls: {constructed}" + ) + + +def test_vc2_exactly_5_entries_does_not_exit_2( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC2: boundary — exactly 5 entries must NOT be rejected by the min-size guard.""" + _make_claude_dir(tmp_path) + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr("shutil.which", _which_present) + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _mock_dispatcher_factory(_SKILL), + ) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.propose_description", + _mock_proposer, + ) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--iterations", "1", + ], + ) + + # Must not exit 2 due to the min-size guard (5 >= 5) + assert result.exit_code != 2, ( + f"5 entries should not trigger the min-size guard.\nOutput:\n{result.stdout}" + ) + + +# --------------------------------------------------------------------------- +# VC3: claude absent -> exit 1 with "requires-cmd: claude" +# --------------------------------------------------------------------------- + + +def test_vc3_claude_absent_exits_1( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC3: when claude is not on PATH, exit 1 with 'requires-cmd: claude'.""" + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.setattr("shutil.which", _which_absent) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + ], + ) + + assert result.exit_code == 1, ( + f"Expected exit 1 when claude absent, got {result.exit_code}.\nOutput:\n{result.stdout}" + ) + assert "requires-cmd: claude" in result.stdout, ( + f"Expected 'requires-cmd: claude' in output.\nOutput:\n{result.stdout}" + ) + + +def test_vc3_claude_absent_dry_run_still_exits_0( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC3: dry-run does NOT check for claude; exit 0 even when claude absent.""" + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.setattr("shutil.which", _which_absent) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--dry-run", + ], + ) + + # dry-run exits BEFORE the claude check + assert result.exit_code == 0, ( + f"Dry-run should exit 0 even when claude absent.\nOutput:\n{result.stdout}" + ) + + +# --------------------------------------------------------------------------- +# VC4 + VC6: full run writes JSON + HTML; view renders; nothing outside .map/ changed +# --------------------------------------------------------------------------- + + +def test_vc4_vc6_full_run_writes_artifacts_and_nothing_outside_map( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC4+VC6: full run writes *-optimize.json + *.html under .map/; SKILL.md unchanged.""" + _make_claude_dir(tmp_path) + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr("shutil.which", _which_present) + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _mock_dispatcher_factory(_SKILL), + ) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.propose_description", + _mock_proposer, + ) + + # Snapshot files BEFORE the run (relative to tmp_path, under .claude only) + skill_md_path = tmp_path / ".claude" / "skills" / _SKILL / "SKILL.md" + skill_md_before = skill_md_path.read_text(encoding="utf-8") + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--iterations", "2", + ], + ) + + assert result.exit_code == 0, ( + f"Expected exit 0 for full run.\nOutput:\n{result.stdout}" + ) + + # VC4: *-optimize.json and *-optimize.html must exist under .map/eval-runs// + out_dir = tmp_path / ".map" / "eval-runs" / _SKILL + json_files = list(out_dir.glob("*-optimize.json")) + html_files = list(out_dir.glob("*-optimize.html")) + assert len(json_files) == 1, ( + f"Expected 1 *-optimize.json under {out_dir}, found: {json_files}" + ) + assert len(html_files) == 1, ( + f"Expected 1 *-optimize.html under {out_dir}, found: {html_files}" + ) + + # VC4: JSON must be a valid OptimizeResult + from mapify_cli.skills_eval.eval_schema import OptimizeResult + + stored = OptimizeResult.from_dict(json.loads(json_files[0].read_text(encoding="utf-8"))) + assert stored.skill == _SKILL + + # VC6: all NEW paths must be under .map/; SKILL.md must be byte-unchanged + all_new_paths = [ + p for p in tmp_path.rglob("*") + if p.is_file() and ".map" not in str(p.relative_to(tmp_path)).split("/") + ] + map_new = [ + p for p in tmp_path.rglob("*") + if p.is_file() and str(p.relative_to(tmp_path)).startswith(".map") + ] + # The only files outside .map/ were there before; verify SKILL.md unchanged + assert skill_md_path.read_text(encoding="utf-8") == skill_md_before, ( + "SKILL.md was modified by optimize (no --apply)!" + ) + assert len(map_new) >= 2, ( + f"Expected at least JSON+HTML under .map/, found: {map_new}" + ) + # Confirm no unexpected files outside .map/ (only .claude and eval_set.json expected) + unexpected = [ + p for p in all_new_paths + if not str(p.relative_to(tmp_path)).startswith(".claude") + and p != eval_set + ] + assert unexpected == [], ( + f"Files created outside .map/ and .claude/: {unexpected}" + ) + + +def test_vc4_view_renders_from_stored_result( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC4: `view` loads the stored JSON and renders an HTML report.""" + _make_claude_dir(tmp_path) + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr("shutil.which", _which_present) + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _mock_dispatcher_factory(_SKILL), + ) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.propose_description", + _mock_proposer, + ) + + # Run optimize first to produce the stored result + optimize_result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--iterations", "1", + ], + ) + assert optimize_result.exit_code == 0, ( + f"Optimize failed before view test.\nOutput:\n{optimize_result.stdout}" + ) + + # Now run view + view_result = runner.invoke( + app, + ["skill-eval", "view", _SKILL], + ) + + assert view_result.exit_code == 0, ( + f"Expected exit 0 for view.\nOutput:\n{view_result.stdout}" + ) + # HTML must exist + out_dir = tmp_path / ".map" / "eval-runs" / _SKILL + html_files = list(out_dir.glob("*-optimize.html")) + assert len(html_files) >= 1, ( + f"Expected HTML report to exist after view.\nFiles: {list(out_dir.iterdir())}" + ) + + +def test_vc6_apply_patcher_not_called_without_apply_flag( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC6: without --apply, apply_optimized_description must never be called.""" + _make_claude_dir(tmp_path) + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr("shutil.which", _which_present) + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _mock_dispatcher_factory(_SKILL), + ) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.propose_description", + _mock_proposer, + ) + + # Monkeypatch apply_optimized_description to raise if called + def _should_not_be_called(**_kw: Any) -> str: + del _kw + raise AssertionError("apply_optimized_description was called without --apply!") + + monkeypatch.setattr( + "mapify_cli.skills_eval.apply_patcher.apply_optimized_description", + _should_not_be_called, + ) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--iterations", "1", + ], + ) + + assert result.exit_code == 0, ( + f"Expected exit 0.\nOutput:\n{result.stdout}" + ) + + +# --------------------------------------------------------------------------- +# VC5: --open with raising webbrowser does NOT error the run +# --------------------------------------------------------------------------- + + +def test_vc5_open_swallows_browser_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC5: --open with a raising webbrowser.open does NOT error the run (exit 0).""" + _make_claude_dir(tmp_path) + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr("shutil.which", _which_present) + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _mock_dispatcher_factory(_SKILL), + ) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.propose_description", + _mock_proposer, + ) + monkeypatch.setattr("webbrowser.open", _raise_no_browser) + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--iterations", "1", + "--open", + ], + ) + + assert result.exit_code == 0, ( + f"Expected exit 0 even when webbrowser.open raises.\nOutput:\n{result.stdout}" + ) + + +def test_vc5_view_open_swallows_browser_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """VC5: `view --open` with raising webbrowser does NOT error the run.""" + _make_claude_dir(tmp_path) + eval_set = _make_eval_set(tmp_path, n=5) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr("shutil.which", _which_present) + monkeypatch.setattr( + "mapify_cli.skills_eval.description_optimizer.ClaudeSubprocessDispatcher", + _mock_dispatcher_factory(_SKILL), + ) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.propose_description", + _mock_proposer, + ) + + # Run optimize first + optimize_result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(eval_set), + "--iterations", "1", + ], + ) + assert optimize_result.exit_code == 0 + + monkeypatch.setattr("webbrowser.open", _raise_no_browser) + + view_result = runner.invoke( + app, + ["skill-eval", "view", _SKILL, "--open"], + ) + + assert view_result.exit_code == 0, ( + f"Expected exit 0 for view --open even with raising webbrowser.\nOutput:\n{view_result.stdout}" + ) + + +# --------------------------------------------------------------------------- +# Edge: view with no optimize result -> exit 2 +# --------------------------------------------------------------------------- + + +def test_view_no_result_exits_2( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """view exits 2 when no *-optimize.json found under the skill's eval-runs dir.""" + monkeypatch.chdir(tmp_path) + + result = runner.invoke( + app, + ["skill-eval", "view", _SKILL], + ) + + assert result.exit_code == 2, ( + f"Expected exit 2 when no optimize result found.\nOutput:\n{result.stdout}" + ) + + +# --------------------------------------------------------------------------- +# Edge: optimize --eval-set missing / malformed -> exit 2 +# --------------------------------------------------------------------------- + + +def test_optimize_missing_eval_set_exits_2() -> None: + """optimize without --eval-set exits 2.""" + result = runner.invoke( + app, + ["skill-eval", "optimize", _SKILL], + ) + assert result.exit_code == 2 + + +def test_optimize_malformed_eval_set_exits_2(tmp_path: Path) -> None: + """optimize with malformed eval-set JSON exits 2.""" + bad = tmp_path / "bad.json" + bad.write_text("not json", encoding="utf-8") + + result = runner.invoke( + app, + [ + "skill-eval", "optimize", _SKILL, + "--eval-set", str(bad), + ], + ) + assert result.exit_code == 2 diff --git a/tests/test_skills_eval_dispatcher_env.py b/tests/test_skills_eval_dispatcher_env.py new file mode 100644 index 00000000..4eb9450f --- /dev/null +++ b/tests/test_skills_eval_dispatcher_env.py @@ -0,0 +1,64 @@ +"""Eval subprocess environment: telegram-bridge isolation (no hang on `tg listen`). + +The telegram-bridge plugin's SessionStart hook injects an "always-listen — run +``tg listen``" instruction. If the eval ``claude -p`` agent obeys it, ``tg listen`` +blocks on the Telegram long-poll until the dispatch timeout, and a triggered-skill +cell mis-records as a non-trigger. `_eval_subprocess_env` points ``TG_STATE_DIR`` +at a config-less path inside the throwaway cwd: any ``tg listen`` / ``tg send`` the +agent runs inherits this env, finds no ``config.json``, and exits immediately +(``die``) instead of blocking — neutralising the hang without touching the +operator's real ``~/.claude/telegram`` config. +""" +from __future__ import annotations + +from pathlib import Path + +from mapify_cli.skills_eval.dispatcher import ( + _NO_TELEGRAM_STATE_DIRNAME, + _eval_subprocess_env, +) + + +def test_env_sets_map_invoked_by_guard(tmp_path: Path) -> None: + env = _eval_subprocess_env(tmp_path) + assert env["MAP_INVOKED_BY"] == "skills-eval" + + +def test_env_points_tg_state_dir_under_cwd(tmp_path: Path) -> None: + """TG_STATE_DIR is a path INSIDE the throwaway cwd (cleaned up with it).""" + env = _eval_subprocess_env(tmp_path) + tg_state = Path(env["TG_STATE_DIR"]) + assert tg_state == tmp_path / _NO_TELEGRAM_STATE_DIRNAME + assert tmp_path in tg_state.parents + + +def test_tg_state_dir_has_no_config_so_tg_commands_exit_fast(tmp_path: Path) -> None: + """The telegram gate is ``config.json`` presence; our dir must lack it. + + ``tg.py`` does ``if not os.path.exists(STATE_DIR/config.json): die()`` -> any + ``tg listen`` / ``tg send`` the eval agent runs exits immediately instead of + blocking on the Telegram long-poll. + """ + env = _eval_subprocess_env(tmp_path) + tg_state = Path(env["TG_STATE_DIR"]) + assert not (tg_state / "config.json").exists() + + +def test_env_preserves_inherited_environment(tmp_path: Path, monkeypatch) -> None: + """The override is additive — existing env (e.g. PATH) is preserved.""" + monkeypatch.setenv("SOME_EXISTING_VAR", "keepme") + env = _eval_subprocess_env(tmp_path) + assert env.get("SOME_EXISTING_VAR") == "keepme" + assert "PATH" in env + + +def test_env_does_not_mutate_real_tg_state_dir(tmp_path: Path, monkeypatch) -> None: + """The operator's real TG_STATE_DIR (if set) is overridden ONLY in the returned + dict for the subprocess — os.environ itself is not mutated.""" + monkeypatch.setenv("TG_STATE_DIR", "/real/operator/telegram") + import os + + env = _eval_subprocess_env(tmp_path) + assert env["TG_STATE_DIR"] == str(tmp_path / _NO_TELEGRAM_STATE_DIRNAME) + # The process-wide environ is untouched. + assert os.environ["TG_STATE_DIR"] == "/real/operator/telegram" diff --git a/tests/test_skills_eval_fixtures.py b/tests/test_skills_eval_fixtures.py new file mode 100644 index 00000000..3158316f --- /dev/null +++ b/tests/test_skills_eval_fixtures.py @@ -0,0 +1,94 @@ +"""Tests for skill eval optimizer fixtures (ST-007). + +Validates: +- VC1: the 3 new optimizer fixtures each have >= 8 entries loadable via load_eval_set. +- VC2: each fixture is 60/40-splittable with n_test >= 3. +- VC3: the existing 3-entry smoke fixture is unchanged; README.md exists and + documents the >= 8 sizing rationale. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from mapify_cli.skills_eval.description_optimizer import _DEFAULT_SEED, split_train_test +from mapify_cli.skills_eval.runner import load_eval_set + +_FIXTURES_DIR = Path(__file__).parent / "skills_eval" / "fixtures" + +_NEW_FIXTURES = [ + _FIXTURES_DIR / "map_plan_optimize_eval_set.json", + _FIXTURES_DIR / "map_efficient_optimize_eval_set.json", + _FIXTURES_DIR / "map_debug_optimize_eval_set.json", +] + +_SMOKE_FIXTURE = _FIXTURES_DIR / "map_debug_eval_set.json" + + +# --------------------------------------------------------------------------- +# Discovery guard (fail loudly on path typos before parametrized tests run) +# --------------------------------------------------------------------------- + + +def test_new_fixture_discovery_non_empty() -> None: + """All 3 new fixture paths must exist — catches path typos before parametrize.""" + assert len(_NEW_FIXTURES) == 3, f"Expected 3 fixtures, got {len(_NEW_FIXTURES)}" + missing = [str(p) for p in _NEW_FIXTURES if not p.exists()] + assert not missing, f"Missing fixture files: {missing}" + + +# --------------------------------------------------------------------------- +# VC1: each new fixture has >= 8 entries +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture_path", _NEW_FIXTURES, ids=[p.name for p in _NEW_FIXTURES]) +def test_vc1_new_fixture_has_at_least_8_entries(fixture_path: Path) -> None: + entries = load_eval_set(fixture_path) + assert len(entries) >= 8, ( + f"{fixture_path.name} has only {len(entries)} entries; optimizer requires >= 8" + ) + + +# --------------------------------------------------------------------------- +# VC2: each new fixture is 60/40-splittable with n_test >= 3 +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture_path", _NEW_FIXTURES, ids=[p.name for p in _NEW_FIXTURES]) +def test_vc2_new_fixture_split_yields_n_test_ge_3(fixture_path: Path) -> None: + entries = load_eval_set(fixture_path) + train, test = split_train_test(entries, _DEFAULT_SEED) + assert len(test) >= 3, ( + f"{fixture_path.name}: n_test={len(test)}, expected >= 3 for a meaningful held-out set" + ) + assert len(train) + len(test) == len(entries), ( + f"{fixture_path.name}: split sizes {len(train)}+{len(test)} != total {len(entries)}" + ) + + +# --------------------------------------------------------------------------- +# VC3: smoke fixture unchanged; README.md exists and documents >= 8 rationale +# --------------------------------------------------------------------------- + + +def test_vc3_smoke_fixture_has_exactly_3_entries() -> None: + """The 3-entry smoke fixture must remain untouched.""" + assert _SMOKE_FIXTURE.exists(), f"Smoke fixture missing: {_SMOKE_FIXTURE}" + entries = load_eval_set(_SMOKE_FIXTURE) + assert len(entries) == 3, ( + f"Smoke fixture {_SMOKE_FIXTURE.name} must have exactly 3 entries " + f"(got {len(entries)}); do not modify it" + ) + + +def test_vc3_readme_exists_and_documents_sizing_rationale() -> None: + """README.md must exist and mention the >= 8 sizing rationale.""" + readme = _FIXTURES_DIR / "README.md" + assert readme.exists(), f"README.md not found at {readme}" + content = readme.read_text(encoding="utf-8") + assert ">= 8" in content or "8 entries" in content or "≥ 8" in content, ( + "README.md must document the >= 8 (or '8 entries' / '≥ 8') optimizer sizing rationale" + ) diff --git a/tests/test_skills_eval_optimizer.py b/tests/test_skills_eval_optimizer.py new file mode 100644 index 00000000..6da9638f --- /dev/null +++ b/tests/test_skills_eval_optimizer.py @@ -0,0 +1,542 @@ +"""Tests for description_optimizer.py — VC1, VC4, VC5, VC6. + +All tests use: +- A stateful ScriptedDispatcher (subclasses VariantDispatcher) for zero subprocess calls. +- A function proposer (no claude subprocess). +- A minimal real source tree: tmp/.claude/skills//SKILL.md with valid frontmatter. + +Scenarios covered +----------------- +VC1: overfit candidate (train up, test down vs baseline) → overfit=True, selected=False +VC1: strict-better candidate (test > baseline) → selected=True, no_improvement=False +VC1: full-tie across all iterations → baseline (iter 0) wins, no_improvement=True +VC4: proposer returning None → proposal_failed iteration recorded, loop continues, + baseline still eligible as winner +VC5: OptimizeResult fields populated: candidate_description, pass-rates, token totals, + selected flag; TokenUsage.total verified +VC6: production source .claude/ is never modified; temp dirs cleaned up after optimize +""" + +from __future__ import annotations + +import json +import tempfile +from collections.abc import Sequence +from pathlib import Path + +import pytest + +from mapify_cli.skills_eval.dispatcher import VariantDispatcher +from mapify_cli.skills_eval.eval_schema import ( + DispatchResult, + EvalResultRecord, + EvalSetEntry, + OptimizeResult, +) +from mapify_cli.skills_eval.description_optimizer import ( + _set_frontmatter_description, + optimize, + split_train_test, +) +from mapify_cli.token_budget import TokenUsage + + +# --------------------------------------------------------------------------- +# Helpers: source tree fixture +# --------------------------------------------------------------------------- + +SKILL_NAME = "test-skill" +_BASE_DESC = "base desc" + + +def _make_source_tree(tmp: Path, desc: str = _BASE_DESC) -> Path: + """Create a minimal .claude/skills//SKILL.md under tmp.""" + skill_dir = tmp / ".claude" / "skills" / SKILL_NAME + skill_dir.mkdir(parents=True) + content = f'---\nname: {SKILL_NAME}\ndescription: "{desc}"\n---\n# body\n' + (skill_dir / "SKILL.md").write_text(content, encoding="utf-8") + return tmp / ".claude" + + +def _make_entries(n: int, skill: str = SKILL_NAME) -> list[EvalSetEntry]: + """Create n EvalSetEntry rows that should_trigger the given skill.""" + return [ + EvalSetEntry( + prompt=f"prompt-{i}", + should_trigger=skill, + should_not_trigger=None, + assertions=[], + ) + for i in range(n) + ] + + +# --------------------------------------------------------------------------- +# Stateful scripted dispatcher +# --------------------------------------------------------------------------- + + +class ScriptedDispatcher(VariantDispatcher): + """Returns DispatchResults from a pre-built call script (list pop order). + + Each element in ``script`` is a tuple (triggered_skill, token_total). + When the script is exhausted, returns a passing result for SKILL_NAME. + """ + + def __init__(self, script: Sequence[tuple[str | None, int]]) -> None: + self._script = list(script) + self._call_count = 0 + + def dispatch(self, prompt: str) -> DispatchResult: + del prompt # intentionally unused; scripted mock + if self._script: + triggered, tokens = self._script.pop(0) + else: + triggered = SKILL_NAME + tokens = 10 + self._call_count += 1 + return DispatchResult( + raw_output="", + triggered_skill=triggered, + token_usage=TokenUsage(input_tokens=tokens), + duration_s=0.0, + error=None, + ) + + +# --------------------------------------------------------------------------- +# Proposer helpers +# --------------------------------------------------------------------------- + + +def _make_fixed_proposer(description: str): + """Return a proposer that always returns the given description.""" + def proposer(current_desc: str, failing: list[EvalResultRecord]) -> str: + del current_desc, failing # unused in fixed proposer + return description + return proposer + + +def _make_none_proposer(): + """Return a proposer that always returns None (proposal_failed).""" + def proposer(current_desc: str, failing: list[EvalResultRecord]) -> None: + del current_desc, failing # unused + return None + return proposer + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tmp_src(tmp_path: Path) -> Path: + """tmp_path/.claude/ with a valid SKILL.md.""" + return _make_source_tree(tmp_path / "src") + + +@pytest.fixture +def out_dir(tmp_path: Path) -> Path: + """Separate output directory for .jsonl files.""" + d = tmp_path / "out" + d.mkdir() + return d + + +# --------------------------------------------------------------------------- +# VC1: overfit candidate — train up, test down → overfit=True, selected=False +# --------------------------------------------------------------------------- + + +def test_vc1_overfit_candidate_not_selected(tmp_src: Path, out_dir: Path) -> None: + """VC1 [AC-1][INV-4][HC-4]: overfit=True candidate must NOT be selected.""" + # 5 entries: split_train_test with default seed → 3 train, 2 test (n=5, n_test=2) + entries = _make_entries(5) + + # Dispatcher script design: + # iter 0 (baseline): ALL dispatch calls return triggered=None (no trigger) + # → 3 train fail, 2 test fail → train_pass_rate=0.0, test_pass_rate=0.0 + # iter 1 (candidate): train calls return triggered=SKILL_NAME (pass), + # test calls return triggered=None (fail) + # → train_pass_rate=1.0, test_pass_rate=0.0 + # Overfit: train(1.0) > baseline(0.0) AND test(0.0) < baseline(0.0) is FALSE + # Wait — 0.0 is NOT strictly less than 0.0; overfit needs test < baseline. + # Fix: baseline test = 0.5 (1 of 2 pass), candidate test = 0.0 + # baseline: 3 train=None(fail), 2 test: 1 pass + 1 fail → test=0.5 + # candidate: 3 train=SKILL(pass), 2 test: both None(fail) → test=0.0 + + # We need to figure out the split to know which are train vs test indices. + train_entries, test_entries = split_train_test(entries, 1337) + n_train = len(train_entries) + n_test = len(test_entries) + + # baseline iter 0: n_train fails, (n_test-1) test pass + 1 test fail + # baseline_script is a list of (triggered_skill, token_total) tuples + baseline_script = ( + [(None, 5)] * n_train # train: all fail + + [(SKILL_NAME, 5)] * (n_test - 1) # test: n_test-1 pass + + [(None, 5)] # test: 1 fail + ) + # candidate iter 1: all train pass, all test fail + candidate_script = ( + [(SKILL_NAME, 8)] * n_train # train: all pass + + [(None, 5)] * n_test # test: all fail + ) + + # Combine: baseline first, then candidate + full_script = baseline_script + candidate_script + + dispatcher = ScriptedDispatcher(full_script) + + result = optimize( + skill=SKILL_NAME, + entries=entries, + current_description=_BASE_DESC, + proposer=_make_fixed_proposer("better desc attempt"), + dispatcher=dispatcher, + source_claude_dir=tmp_src, + out_dir=out_dir, + run_ts="20260101T000000Z", + iterations=2, + seed=1337, + ) + + assert isinstance(result, OptimizeResult) + assert len(result.iterations) == 2 + + baseline = result.iterations[0] + candidate = result.iterations[1] + + # Baseline should have partial test pass-rate + assert baseline.test_pass_rate > 0.0, "baseline test_pass_rate should be > 0" + + # Candidate: train improved, test did NOT improve (0.0 < baseline) + assert candidate.train_pass_rate > baseline.train_pass_rate + assert candidate.test_pass_rate < baseline.test_pass_rate + + # VC1: overfit flagged + assert candidate.overfit is True, "overfit candidate must be flagged overfit=True" + assert candidate.selected is False, "overfit candidate must NOT be selected" + + # VC1: baseline is selected (no strict improvement) + assert baseline.selected is True + assert result.no_improvement is True + assert result.winning_iteration == 0 + assert result.winning_description == _BASE_DESC + + +# --------------------------------------------------------------------------- +# VC1: strict-better candidate → selected=True, no_improvement=False +# --------------------------------------------------------------------------- + + +def test_vc1_strict_better_candidate_selected(tmp_src: Path, out_dir: Path) -> None: + """VC1: candidate with test_pass_rate > baseline → selected=True, no_improvement=False.""" + entries = _make_entries(5) + train_entries, test_entries = split_train_test(entries, 1337) + n_train = len(train_entries) + n_test = len(test_entries) + + # baseline: all fail → 0.0 train, 0.0 test + baseline_script = [(None, 5)] * (n_train + n_test) + # candidate: all pass → 1.0 train, 1.0 test (strictly > 0.0) + candidate_script = [(SKILL_NAME, 12)] * (n_train + n_test) + + dispatcher = ScriptedDispatcher(baseline_script + candidate_script) + + result = optimize( + skill=SKILL_NAME, + entries=entries, + current_description=_BASE_DESC, + proposer=_make_fixed_proposer("improved desc"), + dispatcher=dispatcher, + source_claude_dir=tmp_src, + out_dir=out_dir, + run_ts="20260101T000001Z", + iterations=2, + seed=1337, + ) + + assert len(result.iterations) == 2 + candidate = result.iterations[1] + + assert candidate.selected is True + assert result.no_improvement is False + assert result.winning_iteration == 1 + assert result.winning_description == "improved desc" + # VC5: check token totals + assert candidate.train_tokens_total > 0 + assert candidate.test_tokens_total > 0 + # VC5: candidate_description populated + assert candidate.candidate_description == "improved desc" + + +# --------------------------------------------------------------------------- +# VC1 / VC5: full tie across all iterations → baseline wins, no_improvement=True +# --------------------------------------------------------------------------- + + +def test_vc1_full_tie_baseline_wins(tmp_src: Path, out_dir: Path) -> None: + """VC5 [AC-2]: full-tie across all iterations -> baseline selected, no_improvement=True.""" + entries = _make_entries(5) + train_entries, test_entries = split_train_test(entries, 1337) + n_train = len(train_entries) + n_test = len(test_entries) + + # 3 iterations; all produce same result: all fail + per_iter = [(None, 5)] * (n_train + n_test) + dispatcher = ScriptedDispatcher(per_iter * 3) + + result = optimize( + skill=SKILL_NAME, + entries=entries, + current_description=_BASE_DESC, + proposer=_make_fixed_proposer("same rate desc"), + dispatcher=dispatcher, + source_claude_dir=tmp_src, + out_dir=out_dir, + run_ts="20260101T000002Z", + iterations=3, + seed=1337, + ) + + assert result.no_improvement is True + assert result.winning_iteration == 0 + assert result.winning_description == _BASE_DESC + # Baseline is selected + baseline = result.iterations[0] + assert baseline.selected is True + # Others are not selected + for rec in result.iterations[1:]: + assert rec.selected is False, f"iter {rec.iteration} should not be selected" + + +# --------------------------------------------------------------------------- +# VC4: proposer returning None → proposal_failed recorded, baseline eligible +# --------------------------------------------------------------------------- + + +def test_vc4_proposal_failed_continues(tmp_src: Path, out_dir: Path) -> None: + """VC4 [INV-6][HC-3]: proposer returning None => proposal_failed iter; loop continues.""" + entries = _make_entries(4) + train_entries, test_entries = split_train_test(entries, 1337) + n_train = len(train_entries) + n_test = len(test_entries) + + # Only baseline gets dispatched (iter 1 and 2 will be proposal_failed) + baseline_script = [(None, 5)] * (n_train + n_test) + dispatcher = ScriptedDispatcher(baseline_script) + + result = optimize( + skill=SKILL_NAME, + entries=entries, + current_description=_BASE_DESC, + proposer=_make_none_proposer(), + dispatcher=dispatcher, + source_claude_dir=tmp_src, + out_dir=out_dir, + run_ts="20260101T000003Z", + iterations=3, + seed=1337, + ) + + # 3 iterations: iter 0 (baseline) + iter 1 and 2 (proposal_failed) + assert len(result.iterations) == 3 + + baseline = result.iterations[0] + assert not baseline.proposal_failed + + for rec in result.iterations[1:]: + assert rec.proposal_failed is True, f"iter {rec.iteration} should be proposal_failed" + assert rec.candidate_description is None + assert rec.train_pass_rate == 0.0 + assert rec.test_pass_rate == 0.0 + assert rec.train_jsonl_path == "" + assert rec.test_jsonl_path == "" + + # Baseline wins (no strict improvement) + assert result.winning_iteration == 0 + assert result.no_improvement is True + + +# --------------------------------------------------------------------------- +# VC5: OptimizeResult fields populated correctly +# --------------------------------------------------------------------------- + + +def test_vc5_result_fields_populated(tmp_src: Path, out_dir: Path) -> None: + """VC5 [AC-2]: verify all OptimizeResult and OptimizeIterationRecord fields.""" + entries = _make_entries(5) + train_entries, test_entries = split_train_test(entries, 1337) + n_train = len(train_entries) + n_test = len(test_entries) + + # Baseline fails; candidate passes everything + baseline_script = [(None, 3)] * (n_train + n_test) + candidate_script = [(SKILL_NAME, 7)] * (n_train + n_test) + dispatcher = ScriptedDispatcher(baseline_script + candidate_script) + + result = optimize( + skill=SKILL_NAME, + entries=entries, + current_description=_BASE_DESC, + proposer=_make_fixed_proposer("new candidate"), + dispatcher=dispatcher, + source_claude_dir=tmp_src, + out_dir=out_dir, + run_ts="20260101T000004Z", + iterations=2, + seed=1337, + ) + + # Top-level fields + assert result.skill == SKILL_NAME + assert result.seed == 1337 + assert result.n_train == n_train + assert result.n_test == n_test + assert result.baseline_description == _BASE_DESC + assert result.winning_description == "new candidate" + assert result.winning_iteration == 1 + assert result.no_improvement is False + + # Per-iteration record fields (VC5) + candidate = result.iterations[1] + assert candidate.candidate_description == "new candidate" + assert candidate.train_pass_rate == 1.0 + assert candidate.test_pass_rate == 1.0 + # Token totals: n_train * 7 for train, n_test * 7 for test + assert candidate.train_tokens_total == n_train * 7 + assert candidate.test_tokens_total == n_test * 7 + assert candidate.selected is True + assert candidate.overfit is False + assert candidate.proposal_failed is False + + # TokenUsage.total check via explicit construction + tu = TokenUsage(input_tokens=5, cache_read_input_tokens=3, cache_creation_input_tokens=2) + assert tu.total == 10 + + +# --------------------------------------------------------------------------- +# VC6: production source SKILL.md is never modified; temp dirs cleaned up +# --------------------------------------------------------------------------- + + +def test_vc6_production_source_untouched(tmp_path: Path, out_dir: Path) -> None: + """VC6 [INV-7]: source .claude/ is NEVER modified; temp dirs removed after optimize.""" + src_root = tmp_path / "prod_src" + source_claude = _make_source_tree(src_root) + + # Record original state + skill_md = source_claude / "skills" / SKILL_NAME / "SKILL.md" + original_content = skill_md.read_text(encoding="utf-8") + original_mtime = skill_md.stat().st_mtime + + entries = _make_entries(4) + train_entries, test_entries = split_train_test(entries, 1337) + n_train = len(train_entries) + n_test = len(test_entries) + + # Run 2 iterations with a dispatcher that always passes + dispatcher = ScriptedDispatcher( + [(SKILL_NAME, 10)] * (n_train + n_test) * 2 + ) + + import glob + import os + tmpdir = tempfile.gettempdir() + + # Snapshot mapeval-candidate dirs before + before = set(glob.glob(os.path.join(tmpdir, "mapeval-candidate-*"))) + + result = optimize( + skill=SKILL_NAME, + entries=entries, + current_description=_BASE_DESC, + proposer=_make_fixed_proposer("new desc"), + dispatcher=dispatcher, + source_claude_dir=source_claude, + out_dir=out_dir, + run_ts="20260101T000005Z", + iterations=2, + seed=1337, + ) + + # After: mapeval-candidate dirs should all be removed + after = set(glob.glob(os.path.join(tmpdir, "mapeval-candidate-*"))) + new_dirs = after - before + assert not new_dirs, ( + f"Leftover mapeval-candidate-* dirs after optimize: {new_dirs}" + ) + + # VC6: source SKILL.md content is byte-identical + assert skill_md.read_text(encoding="utf-8") == original_content, ( + "Source SKILL.md was modified by optimize()!" + ) + # VC6: mtime unchanged (file was not touched) + assert skill_md.stat().st_mtime == original_mtime, ( + "Source SKILL.md mtime changed — file was written!" + ) + + # Sanity check: optimization ran + assert len(result.iterations) == 2 + + +# --------------------------------------------------------------------------- +# VC6: fail-loud frontmatter patch — raise paths (no partial write) +# --------------------------------------------------------------------------- + + +def test_set_frontmatter_description_raises_without_opening_fence() -> None: + """VC6 fail-loud: content not starting with '---\\n' raises ValueError.""" + with pytest.raises(ValueError): + _set_frontmatter_description("name: x\ndescription: y\n", "new desc") + + +def test_set_frontmatter_description_raises_without_closing_fence() -> None: + """VC6 fail-loud: frontmatter with no closing '---' raises ValueError.""" + with pytest.raises(ValueError): + _set_frontmatter_description("---\nname: x\ndescription: y\n", "new desc") + + +def test_set_frontmatter_description_raises_without_description_key() -> None: + """VC6 fail-loud: frontmatter lacking a 'description:' line raises ValueError.""" + with pytest.raises(ValueError): + _set_frontmatter_description("---\nname: x\n---\n# body\n", "new desc") + + +def test_set_frontmatter_description_roundtrips_tricky_value() -> None: + """VC6: a value with quotes/colon/newline survives a YAML round-trip.""" + tricky = 'has "quotes": a colon\nand a newline' + patched = _set_frontmatter_description( + '---\nname: x\ndescription: "old"\n---\n# body\n', tricky + ) + # The body and other keys are preserved; description re-parses to `tricky`. + assert "name: x" in patched + assert "# body" in patched + fm = patched.split("---\n", 2)[1] + desc_line = next(ln for ln in fm.splitlines() if ln.startswith("description:")) + parsed = json.loads(desc_line[len("description: "):]) # double-quoted YAML == JSON string + assert parsed == tricky + + +def test_optimize_rejects_zero_iterations(tmp_path: Path) -> None: + """Latent-crash guard: iterations < 1 raises ValueError, not IndexError.""" + source_claude = _make_source_tree(tmp_path / "src") + + def _never_called(_cur: str, _recs: list[EvalResultRecord]) -> str | None: + del _cur, _recs + return None + + with pytest.raises(ValueError): + optimize( + skill=SKILL_NAME, + entries=_make_entries(5), + current_description=_BASE_DESC, + proposer=_never_called, + dispatcher=ScriptedDispatcher([]), + source_claude_dir=source_claude, + out_dir=tmp_path / "out", + run_ts="20260101T120000Z", + iterations=0, + ) diff --git a/tests/test_skills_eval_optimizer_isolation.py b/tests/test_skills_eval_optimizer_isolation.py new file mode 100644 index 00000000..6bd9c1f2 --- /dev/null +++ b/tests/test_skills_eval_optimizer_isolation.py @@ -0,0 +1,408 @@ +"""Isolation tests for description_optimizer.py — VC2 and VC3. + +VC2 [INV-3][HC-8]: + - split_train_test is pure/deterministic (same inputs -> identical split). + - Module source contains NO ``import random``, ``import datetime``, + ``datetime.now``, ``time.time``. + +VC3 [INV-8][HC-11][AC-12]: + - An N-iteration run produces N distinct -optimize-iter-{train,test}.jsonl + paths (2*N files). + - Every run_eval call uses resume=False. + - cell_ids from different iterations NEVER share a .jsonl file. +""" + +from __future__ import annotations + +import ast +import json +from pathlib import Path +from unittest.mock import patch + + +from mapify_cli.skills_eval.description_optimizer import ( + _DEFAULT_SEED, + split_train_test, + optimize, +) +from mapify_cli.skills_eval.dispatcher import VariantDispatcher +from mapify_cli.skills_eval.eval_schema import ( + DispatchResult, + EvalResultRecord, + EvalSetEntry, +) +from mapify_cli.token_budget import TokenUsage + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +_SKILL = "test-skill" +_BASE_DESC = "base desc" +_SRC_ROOT = Path(__file__).parent.parent / "src" / "mapify_cli" / "skills_eval" +_OPTIMIZER_SRC = _SRC_ROOT / "description_optimizer.py" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_entries(n: int) -> list[EvalSetEntry]: + return [ + EvalSetEntry( + prompt=f"p{i}", + should_trigger=_SKILL, + should_not_trigger=None, + assertions=[], + ) + for i in range(n) + ] + + +def _make_source_tree(tmp: Path) -> Path: + """Create minimal tmp/.claude/skills//SKILL.md.""" + skill_dir = tmp / ".claude" / "skills" / _SKILL + skill_dir.mkdir(parents=True) + content = f'---\nname: {_SKILL}\ndescription: "{_BASE_DESC}"\n---\n# body\n' + (skill_dir / "SKILL.md").write_text(content, encoding="utf-8") + return tmp / ".claude" + + +def _stub_proposer(current: str, failing: list[EvalResultRecord]) -> str | None: + """Proposer stub for isolation tests — returns a fixed non-None candidate. + + These tests exercise jsonl path isolation and resume=False, not the + candidate text itself, so the inputs are intentionally unused. + """ + del current, failing + return "candidate desc" + + +class _PassDispatcher(VariantDispatcher): + """Dispatcher that always triggers _SKILL.""" + + def dispatch(self, prompt: str) -> DispatchResult: + del prompt + return DispatchResult( + raw_output="", + triggered_skill=_SKILL, + token_usage=TokenUsage(input_tokens=5), + duration_s=0.0, + error=None, + ) + + +# --------------------------------------------------------------------------- +# VC2: split determinism +# --------------------------------------------------------------------------- + + +def test_vc2_split_deterministic_same_seed() -> None: + """VC2: identical inputs and seed → identical split on every call.""" + entries = _make_entries(10) + seed = 42 + + train_a, test_a = split_train_test(entries, seed) + train_b, test_b = split_train_test(entries, seed) + + # Compare by prompt (EvalSetEntry equality is by identity, so compare fields) + assert [e.prompt for e in train_a] == [e.prompt for e in train_b] + assert [e.prompt for e in test_a] == [e.prompt for e in test_b] + + +def test_vc2_split_different_seeds_produce_different_splits() -> None: + """VC2: different seeds should (very likely) produce different splits.""" + entries = _make_entries(10) + train_a, _ = split_train_test(entries, seed=1) + train_b, _ = split_train_test(entries, seed=9999) + # With 10 items it is astronomically unlikely both seeds give the same split + prompts_a = [e.prompt for e in train_a] + prompts_b = [e.prompt for e in train_b] + assert prompts_a != prompts_b, ( + "Different seeds produced identical splits — seeding is broken" + ) + + +def test_vc2_split_sizes() -> None: + """VC2: n_test = max(1, round(n * 0.4)); all entries accounted for.""" + for n in range(1, 15): + entries = _make_entries(n) + train, test = split_train_test(entries, 1337) + expected_test = max(1, round(n * 0.4)) + assert len(test) == expected_test, f"n={n}: expected n_test={expected_test}, got {len(test)}" + assert len(train) + len(test) == n + # No duplicates + all_prompts = [e.prompt for e in train] + [e.prompt for e in test] + assert len(set(all_prompts)) == n + + +# --------------------------------------------------------------------------- +# VC2: module source scan — no forbidden imports +# --------------------------------------------------------------------------- + + +def test_vc2_no_import_random() -> None: + """VC2 [INV-3]: description_optimizer.py must not import random.""" + source = _OPTIMIZER_SRC.read_text(encoding="utf-8") + tree = ast.parse(source) + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + assert alias.name != "random", ( + "description_optimizer.py has 'import random' — violates INV-3" + ) + elif isinstance(node, ast.ImportFrom): + assert node.module != "random", ( + "description_optimizer.py has 'from random import ...' — violates INV-3" + ) + + +def test_vc2_no_import_datetime() -> None: + """VC2 [INV-3]: description_optimizer.py must not import datetime.""" + source = _OPTIMIZER_SRC.read_text(encoding="utf-8") + tree = ast.parse(source) + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + assert alias.name != "datetime", ( + "description_optimizer.py has 'import datetime' — violates INV-3" + ) + elif isinstance(node, ast.ImportFrom): + assert (node.module or "").split(".")[0] != "datetime", ( + "description_optimizer.py has 'from datetime import ...' — violates INV-3" + ) + + +def test_vc2_no_datetime_now_call() -> None: + """VC2: description_optimizer.py must not call datetime.now() (AST check). + + Uses AST walk over Call nodes so docstring/comment prose is not flagged. + """ + source = _OPTIMIZER_SRC.read_text(encoding="utf-8") + tree = ast.parse(source) + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + func = node.func + if not isinstance(func, ast.Attribute) or func.attr != "now": + continue + # datetime.now(...) or datetime.datetime.now(...) + val = func.value + if isinstance(val, ast.Name) and val.id == "datetime": + raise AssertionError( + "description_optimizer.py calls datetime.now() — " + "violates clock-free invariant" + ) + if isinstance(val, ast.Attribute) and val.attr == "datetime": + raise AssertionError( + "description_optimizer.py calls datetime.datetime.now() — " + "violates clock-free invariant" + ) + + +def test_vc2_no_time_time_call() -> None: + """VC2: description_optimizer.py must not call time.time() (AST check). + + Uses AST walk over Call nodes so docstring/comment prose is not flagged. + """ + source = _OPTIMIZER_SRC.read_text(encoding="utf-8") + tree = ast.parse(source) + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + func = node.func + if ( + isinstance(func, ast.Attribute) + and func.attr == "time" + and isinstance(func.value, ast.Name) + and func.value.id == "time" + ): + raise AssertionError( + "description_optimizer.py calls time.time() — " + "violates clock-free invariant" + ) + + +# --------------------------------------------------------------------------- +# VC3: N distinct iter paths, resume=False, no intra-file cell_id duplication +# --------------------------------------------------------------------------- + + +def test_vc3_distinct_jsonl_paths_and_resume_false( + tmp_path: Path, +) -> None: + """VC3 [INV-8][HC-11][AC-12]: N iterations → 2*N distinct files; resume=False always.""" + source_claude = _make_source_tree(tmp_path / "src") + out_dir = tmp_path / "out" + out_dir.mkdir() + + entries = _make_entries(5) + n_iters = 3 + run_ts = "20260101T120000Z" + + # Track run_eval calls to verify resume=False + run_eval_calls: list[dict] = [] + + import mapify_cli.skills_eval.description_optimizer as opt_module + + original_run_eval = opt_module.run_eval + + def capturing_run_eval(**kwargs) -> list: + run_eval_calls.append( + {"out_path": kwargs["out_path"], "resume": kwargs.get("resume", False)} + ) + return original_run_eval(**kwargs) + + dispatcher = _PassDispatcher() + + with patch.object(opt_module, "run_eval", side_effect=capturing_run_eval): + result = optimize( + skill=_SKILL, + entries=entries, + current_description=_BASE_DESC, + proposer=_stub_proposer, + dispatcher=dispatcher, + source_claude_dir=source_claude, + out_dir=out_dir, + run_ts=run_ts, + iterations=n_iters, + seed=_DEFAULT_SEED, + ) + + # VC3: N iterations × 2 splits = 2*N run_eval calls + assert len(run_eval_calls) == n_iters * 2, ( + f"Expected {n_iters * 2} run_eval calls, got {len(run_eval_calls)}" + ) + + # VC3: all resume=False + for c in run_eval_calls: + assert c["resume"] is False, ( + f"run_eval called with resume=True for path {c['out_path']}" + ) + + # VC3: all out_paths are distinct + paths = [c["out_path"] for c in run_eval_calls] + assert len(set(str(p) for p in paths)) == n_iters * 2, ( + f"Duplicate out_paths detected: {paths}" + ) + + # VC3: paths follow the naming convention + for i in range(n_iters): + train_name = f"{run_ts}-optimize-iter{i}-train.jsonl" + test_name = f"{run_ts}-optimize-iter{i}-test.jsonl" + assert any(str(p).endswith(train_name) for p in paths), ( + f"Missing expected file: {train_name}" + ) + assert any(str(p).endswith(test_name) for p in paths), ( + f"Missing expected file: {test_name}" + ) + + # VC3: files exist on disk + for rec in result.iterations: + if not rec.proposal_failed: + assert rec.train_jsonl_path != "" + assert rec.test_jsonl_path != "" + assert Path(rec.train_jsonl_path).exists(), ( + f"train jsonl missing: {rec.train_jsonl_path}" + ) + assert Path(rec.test_jsonl_path).exists(), ( + f"test jsonl missing: {rec.test_jsonl_path}" + ) + + +def test_vc3_no_intra_file_cell_id_duplication(tmp_path: Path) -> None: + """VC3: no single .jsonl file contains a duplicated cell_id.""" + source_claude = _make_source_tree(tmp_path / "src") + out_dir = tmp_path / "out" + out_dir.mkdir() + + entries = _make_entries(6) + n_iters = 3 + + dispatcher = _PassDispatcher() + + result = optimize( + skill=_SKILL, + entries=entries, + current_description=_BASE_DESC, + proposer=_stub_proposer, + dispatcher=dispatcher, + source_claude_dir=source_claude, + out_dir=out_dir, + run_ts="20260101T130000Z", + iterations=n_iters, + seed=_DEFAULT_SEED, + ) + + # For each iter jsonl file, assert no duplicated cell_id + for rec in result.iterations: + if rec.proposal_failed: + continue + for path_str in [rec.train_jsonl_path, rec.test_jsonl_path]: + path = Path(path_str) + assert path.exists() + cell_ids: list[str] = [] + for line in path.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line: + continue + row = json.loads(line) + cell_ids.append(row["cell_id"]) + assert len(cell_ids) == len(set(cell_ids)), ( + f"Duplicate cell_ids in {path}: {cell_ids}" + ) + + +def test_vc3_different_iters_use_different_files(tmp_path: Path) -> None: + """VC3 [AC-12]: each (iter, split) pair writes to a UNIQUE file path. + + Note: cell_ids legitimately repeat across iterations (same prompt evaluated + again in each iteration). The VC3 invariant is about file path uniqueness, + not cell_id uniqueness across files. + """ + source_claude = _make_source_tree(tmp_path / "src") + out_dir = tmp_path / "out" + out_dir.mkdir() + + entries = _make_entries(5) + n_iters = 3 + + dispatcher = _PassDispatcher() + + result = optimize( + skill=_SKILL, + entries=entries, + current_description=_BASE_DESC, + proposer=_stub_proposer, + dispatcher=dispatcher, + source_claude_dir=source_claude, + out_dir=out_dir, + run_ts="20260101T140000Z", + iterations=n_iters, + seed=_DEFAULT_SEED, + ) + + # Collect all file paths from non-failed iterations + all_paths: list[str] = [] + for rec in result.iterations: + if rec.proposal_failed: + continue + all_paths.append(rec.train_jsonl_path) + all_paths.append(rec.test_jsonl_path) + + # Every (iter, split) pair must produce a distinct file path + assert len(all_paths) == len(set(all_paths)), ( + f"Duplicate file paths across iterations: {all_paths}" + ) + + # Each file path must encode its iteration number and split label + for rec in result.iterations: + if rec.proposal_failed: + continue + assert f"iter{rec.iteration}-train" in rec.train_jsonl_path, ( + f"train path missing iter{rec.iteration}: {rec.train_jsonl_path}" + ) + assert f"iter{rec.iteration}-test" in rec.test_jsonl_path, ( + f"test path missing iter{rec.iteration}: {rec.test_jsonl_path}" + ) diff --git a/tests/test_skills_eval_proposer.py b/tests/test_skills_eval_proposer.py new file mode 100644 index 00000000..510c451d --- /dev/null +++ b/tests/test_skills_eval_proposer.py @@ -0,0 +1,367 @@ +"""Tests for mapify_cli.skills_eval.proposer — propose_description(). + +Coverage: +- VC2 [INV-2/HC-7]: MAP_INVOKED_BY is set (non-empty) in subprocess env. +- VC3 [INV-1]: argv is a list; no shell=True; untrusted record text is a + discrete argv element (not interpolated into one shell string). +- VC4 [INV-1]: success returns .result text; returncode!=0 -> None; + malformed JSON -> None; missing .result -> None; whitespace-only .result + -> None; FileNotFoundError/OSError -> None; TimeoutExpired -> None. +""" +from __future__ import annotations + +import json +import subprocess +import types +from typing import Any + +import pytest + +from mapify_cli.skills_eval.eval_schema import EvalResultRecord +from mapify_cli.skills_eval.proposer import _DEFAULT_MAX_CHARS, propose_description + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_record( + prompt: str = "test prompt", + assertions_failed: list[str] | None = None, +) -> EvalResultRecord: + """Construct a minimal EvalResultRecord for test use.""" + return EvalResultRecord( + cell_id="p0-v0-r0", + prompt=prompt, + triggered_skill=None, + token_usage=None, + duration_s=0.0, + assertions_passed=[], + assertions_failed=assertions_failed or ["should_trigger: map-debug"], + raw_output="", + ) + + +def _fake_run( + returncode: int, + stdout: str, + capture: dict[str, Any] | None = None, + *, + raise_exc: BaseException | None = None, +) -> Any: + """Return a monkeypatch-compatible subprocess.run replacement. + + If ``raise_exc`` is provided, the fake raises it instead of returning a + result. ``capture`` dict is populated with the argv and kwargs received. + """ + + def run(argv: list[str], **kwargs: Any) -> types.SimpleNamespace: + if capture is not None: + capture["argv"] = argv + capture["kwargs"] = kwargs + if raise_exc is not None: + raise raise_exc + return types.SimpleNamespace( + returncode=returncode, + stdout=stdout, + stderr="", + ) + + return run + + +# --------------------------------------------------------------------------- +# VC4 — success path +# --------------------------------------------------------------------------- + + +def test_vc4_success_returns_result_text(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: success path returns stripped .result text from JSON envelope.""" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": " improved description "}), cap), + ) + result = propose_description("old desc", [_make_record()]) + assert result == "improved description" + + +def test_vc4_result_is_stripped(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: leading/trailing whitespace is stripped from .result.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "\n new desc\n\t"})), + ) + result = propose_description("old desc", [_make_record()]) + assert result == "new desc" + + +# --------------------------------------------------------------------------- +# VC4 — failure paths +# --------------------------------------------------------------------------- + + +def test_vc4_nonzero_returncode_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: non-zero returncode -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(1, json.dumps({"result": "some output"})), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_malformed_json_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: malformed JSON in stdout -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, "this is not json"), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_missing_result_field_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: JSON envelope with no .result field -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"session_id": "abc123"})), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_whitespace_only_result_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: whitespace-only .result -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": " \n\t "})), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_empty_result_string_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: empty string .result -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": ""})), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_non_dict_json_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: JSON that parses to a non-dict (e.g. a list) -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps(["result", "oops"])), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_file_not_found_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: FileNotFoundError (claude not on PATH) -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, "", raise_exc=FileNotFoundError("claude not found")), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_oserror_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: generic OSError -> None (FileNotFoundError is a subclass).""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, "", raise_exc=OSError("some OS error")), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_vc4_timeout_returns_none(monkeypatch: pytest.MonkeyPatch) -> None: + """VC4: TimeoutExpired -> None.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run( + 0, + "", + raise_exc=subprocess.TimeoutExpired(cmd=["claude"], timeout=120), + ), + ) + assert propose_description("old desc", [_make_record()]) is None + + +# --------------------------------------------------------------------------- +# VC2 — MAP_INVOKED_BY in subprocess env +# --------------------------------------------------------------------------- + + +def test_vc2_map_invoked_by_is_set(monkeypatch: pytest.MonkeyPatch) -> None: + """VC2 [INV-2/HC-7]: MAP_INVOKED_BY is present and non-empty in subprocess env.""" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "new desc"}), cap), + ) + propose_description("old desc", [_make_record()]) + assert "MAP_INVOKED_BY" in cap["kwargs"]["env"], ( + "MAP_INVOKED_BY must be set in subprocess env" + ) + assert cap["kwargs"]["env"]["MAP_INVOKED_BY"], ( + "MAP_INVOKED_BY must be non-empty" + ) + + +# --------------------------------------------------------------------------- +# VC3 — argv is a list, no shell=True, untrusted text is a discrete element +# --------------------------------------------------------------------------- + + +def test_vc3_argv_is_list(monkeypatch: pytest.MonkeyPatch) -> None: + """VC3 [INV-1]: argv passed to subprocess.run must be a list (not a string).""" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "ok"}), cap), + ) + propose_description("old desc", [_make_record()]) + assert isinstance(cap["argv"], list), ( + f"argv must be a list, got {type(cap['argv']).__name__!r}" + ) + + +def test_vc3_no_shell_true(monkeypatch: pytest.MonkeyPatch) -> None: + """VC3 [INV-1]: shell=True must NOT be passed to subprocess.run.""" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "ok"}), cap), + ) + propose_description("old desc", [_make_record()]) + assert cap["kwargs"].get("shell") is not True, ( + "shell=True must not be used — it enables shell injection" + ) + + +def test_vc3_untrusted_text_is_discrete_argv_element( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """VC3 [INV-1]: record .prompt content appears as a discrete argv element. + + The full argv must be a list where the prompt string is a distinct element, + not interpolated together with the claude binary and flags into one string. + This guards against shell-injection via untrusted eval-set content. + """ + untrusted_prompt = "rm -rf /; echo injected" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "new desc"}), cap), + ) + propose_description("old desc", [_make_record(prompt=untrusted_prompt)]) + + argv: list[str] = cap["argv"] + # argv must be a list — not a single joined string + assert isinstance(argv, list), "argv must be a list" + # The untrusted text must NOT be concatenated with the claude binary in one element + for element in argv: + assert not ( + "claude" in element and untrusted_prompt in element + ), ( + f"Untrusted text was interpolated into a combined argv element: {element!r}" + ) + # The argv list must contain at least one element that contains the untrusted text + assert any(untrusted_prompt in element for element in argv), ( + "Untrusted record text must appear somewhere in argv (as a discrete element)" + ) + + +def test_vc3_assertions_failed_text_appears_in_prompt( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """VC3: assertions_failed content from failing records is included in the prompt.""" + failing_assertion = "should_trigger: map-special" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "new desc"}), cap), + ) + propose_description( + "old desc", + [_make_record(assertions_failed=[failing_assertion])], + ) + argv: list[str] = cap["argv"] + combined = " ".join(argv) + assert failing_assertion in combined, ( + "assertions_failed content must be included in the prompt" + ) + + +# --------------------------------------------------------------------------- +# Empty failing records list — edge case +# --------------------------------------------------------------------------- + + +def test_empty_failing_records_still_calls_subprocess( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Edge case: empty failing_train_records list — subprocess still called.""" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "improved"}), cap), + ) + result = propose_description("current desc", []) + assert result == "improved" + assert "argv" in cap, "subprocess.run must be called even with no failing records" + + +# --------------------------------------------------------------------------- +# Length cap — proposals must fit the skill `description` spec limit +# --------------------------------------------------------------------------- + + +def test_default_max_chars_is_spec_limit() -> None: + """The default cap is the Agent Skills `description` spec maximum (1024).""" + assert _DEFAULT_MAX_CHARS == 1024 + + +def test_over_limit_proposal_is_rejected(monkeypatch: pytest.MonkeyPatch) -> None: + """A proposal longer than max_chars is rejected (None) — never shipped.""" + too_long = "x" * (_DEFAULT_MAX_CHARS + 1) + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": too_long})), + ) + assert propose_description("old desc", [_make_record()]) is None + + +def test_at_limit_proposal_is_returned(monkeypatch: pytest.MonkeyPatch) -> None: + """A proposal exactly at max_chars is accepted (boundary is inclusive).""" + at_limit = "y" * _DEFAULT_MAX_CHARS + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": at_limit})), + ) + result = propose_description("old desc", [_make_record()]) + assert result == at_limit + + +def test_custom_max_chars_enforced(monkeypatch: pytest.MonkeyPatch) -> None: + """An explicit max_chars overrides the default for both accept and reject.""" + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "z" * 60})), + ) + assert propose_description("old", [_make_record()], max_chars=50) is None + assert propose_description("old", [_make_record()], max_chars=100) == "z" * 60 + + +def test_prompt_states_the_char_limit(monkeypatch: pytest.MonkeyPatch) -> None: + """The improvement prompt tells claude the hard character limit.""" + cap: dict[str, Any] = {} + monkeypatch.setattr( + "mapify_cli.skills_eval.proposer.subprocess.run", + _fake_run(0, json.dumps({"result": "short desc"}), cap), + ) + propose_description("old desc", [_make_record()], max_chars=250) + # argv = ["claude", "-p", , "--output-format", "json"] + prompt = cap["argv"][2] + assert "250" in prompt + assert "character" in prompt.lower() diff --git a/tests/test_skills_eval_schema.py b/tests/test_skills_eval_schema.py new file mode 100644 index 00000000..eec68917 --- /dev/null +++ b/tests/test_skills_eval_schema.py @@ -0,0 +1,210 @@ +"""Tests for OptimizeIterationRecord, OptimizeResult, and ProposerFn in eval_schema. + +VC1 [AC-7]: to_dict/from_dict round-trip for OptimizeResult (incl. nested iterations). +VC2 [AC-7]: ProposerFn importable and usable as a type alias. +VC3 [AC-7]: OptimizeIterationRecord carries all required fields. +""" + +from __future__ import annotations + +import json + +from mapify_cli.skills_eval.eval_schema import ( + EvalResultRecord, + OptimizeIterationRecord, + OptimizeResult, + ProposerFn, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_iter( + iteration: int, + *, + candidate_description: str | None = "desc-v1", + train_pass_rate: float = 0.8, + test_pass_rate: float = 0.75, + selected: bool = False, + proposal_failed: bool = False, + overfit: bool = False, +) -> OptimizeIterationRecord: + return OptimizeIterationRecord( + iteration=iteration, + candidate_description=candidate_description, + train_pass_rate=train_pass_rate, + test_pass_rate=test_pass_rate, + train_tokens_total=100 * iteration, + test_tokens_total=50 * iteration, + selected=selected, + proposal_failed=proposal_failed, + overfit=overfit, + train_jsonl_path=f"/tmp/train-{iteration}.jsonl", + test_jsonl_path=f"/tmp/test-{iteration}.jsonl", + ) + + +def _make_result() -> OptimizeResult: + return OptimizeResult( + skill="map-plan", + eval_set_path="/evals/map-plan.json", + seed=42, + n_train=10, + n_test=5, + baseline_description="original description", + winning_description="improved description", + winning_iteration=2, + no_improvement=False, + iterations=[ + _make_iter(0, candidate_description=None, proposal_failed=True), + _make_iter(1, train_pass_rate=0.7, test_pass_rate=0.65), + _make_iter(2, train_pass_rate=0.9, test_pass_rate=0.85, selected=True), + ], + ) + + +# --------------------------------------------------------------------------- +# VC1: full round-trip (OptimizeResult + each iteration) +# --------------------------------------------------------------------------- + + +def test_vc1_optimize_result_round_trip() -> None: + """VC1: from_dict(json.loads(json.dumps(r.to_dict()))) == r for OptimizeResult.""" + original = _make_result() + serialized = json.dumps(original.to_dict()) + restored = OptimizeResult.from_dict(json.loads(serialized)) + assert restored == original, f"round-trip mismatch:\n{restored!r}\n!=\n{original!r}" + + +def test_vc1_iteration_round_trip_each() -> None: + """VC1: each OptimizeIterationRecord individually round-trips.""" + result = _make_result() + for it in result.iterations: + restored = OptimizeIterationRecord.from_dict(json.loads(json.dumps(it.to_dict()))) + assert restored == it, f"iteration {it.iteration} round-trip mismatch" + + +def test_vc1_iteration_none_candidate_round_trip() -> None: + """VC1: candidate_description=None survives the JSON round-trip.""" + it = _make_iter(0, candidate_description=None, proposal_failed=True) + restored = OptimizeIterationRecord.from_dict(json.loads(json.dumps(it.to_dict()))) + assert restored.candidate_description is None + assert restored.proposal_failed is True + assert restored == it + + +# --------------------------------------------------------------------------- +# Test 2: from_dict tolerates absent optional keys +# --------------------------------------------------------------------------- + + +def test_from_dict_tolerates_absent_optional_keys_iteration() -> None: + """from_dict on OptimizeIterationRecord applies defaults for absent optional fields.""" + minimal: dict = { + "iteration": 3, + "candidate_description": "minimal", + "train_pass_rate": 0.5, + "test_pass_rate": 0.4, + # omit: selected, proposal_failed, overfit, train/test_jsonl_path, token totals + } + rec = OptimizeIterationRecord.from_dict(minimal) + assert rec.iteration == 3 + assert rec.candidate_description == "minimal" + assert rec.train_pass_rate == 0.5 + assert rec.test_pass_rate == 0.4 + assert rec.train_tokens_total == 0 + assert rec.test_tokens_total == 0 + assert rec.selected is False + assert rec.proposal_failed is False + assert rec.overfit is False + assert rec.train_jsonl_path == "" + assert rec.test_jsonl_path == "" + + +def test_from_dict_tolerates_absent_iterations_in_optimize_result() -> None: + """from_dict on OptimizeResult applies empty list when 'iterations' is absent.""" + minimal: dict = { + "skill": "map-x", + "eval_set_path": "/evals/x.json", + "seed": 0, + "n_train": 5, + "n_test": 3, + "baseline_description": "base", + "winning_description": "base", + "winning_iteration": 0, + "no_improvement": True, + # omit: iterations + } + result = OptimizeResult.from_dict(minimal) + assert result.iterations == [] + assert result.no_improvement is True + + +# --------------------------------------------------------------------------- +# VC2: ProposerFn importable and callable +# --------------------------------------------------------------------------- + + +def test_vc2_proposer_fn_importable_and_callable() -> None: + """VC2: ProposerFn is importable and a matching def is assignable and callable. + + The proposer signature is ``Callable[[str, list[EvalResultRecord]], str | None]``; + both branches (None and str) satisfy the alias. + """ + + def none_proposer(current: str, failing: list[EvalResultRecord]) -> str | None: + del current, failing # unused: this proposer always declines + return None + + p: ProposerFn = none_proposer + assert p("some description", []) is None + + # A proposer that returns a string — exercises both params and EvalResultRecord. + failing_record = EvalResultRecord( + cell_id="p0-v1-r0", + prompt="plan a feature", + triggered_skill=None, + token_usage=None, + duration_s=1.0, + assertions_failed=["expected map-plan to trigger"], + ) + + def echo_proposer(current: str, failing: list[EvalResultRecord]) -> str | None: + return f"improved ({len(failing)} failing): {current}" + + p2: ProposerFn = echo_proposer + assert p2("old", [failing_record]) == "improved (1 failing): old" + + +# --------------------------------------------------------------------------- +# VC3: OptimizeIterationRecord carries all required fields +# --------------------------------------------------------------------------- + + +def test_vc3_iteration_record_has_all_required_fields() -> None: + """VC3: OptimizeIterationRecord carries train/test pass rates, token totals, + selected, overfit flags, and per-iteration jsonl paths.""" + it = _make_iter( + 5, + train_pass_rate=0.88, + test_pass_rate=0.77, + selected=True, + overfit=True, + ) + # pass rates + assert it.train_pass_rate == 0.88 + assert it.test_pass_rate == 0.77 + # token totals (flat ints, no TokenUsage) + assert isinstance(it.train_tokens_total, int) + assert isinstance(it.test_tokens_total, int) + assert it.train_tokens_total == 500 + assert it.test_tokens_total == 250 + # flags + assert it.selected is True + assert it.overfit is True + # per-iteration jsonl paths (so viewer can render without re-parsing) + assert it.train_jsonl_path == "/tmp/train-5.jsonl" + assert it.test_jsonl_path == "/tmp/test-5.jsonl" diff --git a/tests/test_skills_eval_viewer.py b/tests/test_skills_eval_viewer.py new file mode 100644 index 00000000..240a5de5 --- /dev/null +++ b/tests/test_skills_eval_viewer.py @@ -0,0 +1,397 @@ +"""Tests for src/mapify_cli/skills_eval/viewer.py. + +Covers: + VC1 — HTML output has one row per iteration, shows train/test pass-rates, diff content. + VC2 — overfit rows carry a distinct CSS class marker; non-overfit rows do not. + VC3 — viewer imports only eval_schema + stdlib + jinja2; no subprocess/anthropic. + Security — candidate_description with XSS payload is HTML-escaped. + proposal_failed — renders without raising. +""" + +from __future__ import annotations + +import ast +import re +from pathlib import Path + +import pytest + +from mapify_cli.skills_eval.eval_schema import OptimizeIterationRecord, OptimizeResult +from mapify_cli.skills_eval.viewer import render_html, render_to_path + +# --------------------------------------------------------------------------- +# Fixture +# --------------------------------------------------------------------------- + +BASELINE_DESC = "Trigger when the user says 'map' or asks to plan." +DESC_ITER1 = "Trigger when the user says 'map', asks to plan, or mentions workflow." +DESC_ITER2_XSS = "Trigger on map keyword." +# iteration 3: proposal_failed=True, candidate_description=None +# iteration 4: overfit=True + + +@pytest.fixture() +def opt_result() -> OptimizeResult: + """OptimizeResult with >= 3 meaningful iterations: + iter 0 — baseline (iteration=0, selected=False) + iter 1 — normal improvement, selected=True + iter 2 — XSS payload in description, not selected + iter 3 — proposal_failed=True, candidate_description=None + iter 4 — overfit=True (train up, test down) + """ + iterations = [ + OptimizeIterationRecord( + iteration=0, + candidate_description=BASELINE_DESC, + train_pass_rate=0.50, + test_pass_rate=0.50, + train_tokens_total=100, + test_tokens_total=100, + selected=False, + proposal_failed=False, + overfit=False, + ), + OptimizeIterationRecord( + iteration=1, + candidate_description=DESC_ITER1, + train_pass_rate=0.70, + test_pass_rate=0.65, + train_tokens_total=200, + test_tokens_total=200, + selected=True, + proposal_failed=False, + overfit=False, + ), + OptimizeIterationRecord( + iteration=2, + candidate_description=DESC_ITER2_XSS, + train_pass_rate=0.75, + test_pass_rate=0.60, + train_tokens_total=300, + test_tokens_total=300, + selected=False, + proposal_failed=False, + overfit=False, + ), + OptimizeIterationRecord( + iteration=3, + candidate_description=None, + train_pass_rate=0.0, + test_pass_rate=0.0, + train_tokens_total=0, + test_tokens_total=0, + selected=False, + proposal_failed=True, + overfit=False, + ), + OptimizeIterationRecord( + iteration=4, + candidate_description="Trigger on map keyword only.", + train_pass_rate=0.90, + test_pass_rate=0.40, + train_tokens_total=400, + test_tokens_total=400, + selected=False, + proposal_failed=False, + overfit=True, + ), + ] + return OptimizeResult( + skill="map-plan", + eval_set_path="evals/map-plan.json", + seed=42, + n_train=10, + n_test=5, + baseline_description=BASELINE_DESC, + winning_description=DESC_ITER1, + winning_iteration=1, + no_improvement=False, + iterations=iterations, + ) + + +# --------------------------------------------------------------------------- +# VC1 — structure: HTML wrapper, one row per iteration, pass-rates, diff +# --------------------------------------------------------------------------- + + +class TestVC1Structure: + def test_vc1_returns_html_string(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + assert isinstance(html, str) + assert html.lstrip().lower().startswith(" None: + html = render_html(opt_result) + assert " None: + html = render_html(opt_result) + # Each iteration produces exactly one in tbody. + # Count without a class attr; data rows have class="". + # We look for (.*?)", html, re.DOTALL) + assert tbody_match is not None, "No found" + tbody = tbody_match.group(1) + tr_count = len(re.findall(r" None: + html = render_html(opt_result) + # iter 1: 70.0%, iter 4: 90.0% + assert "70.0%" in html + assert "90.0%" in html + + def test_vc1_test_pass_rates_present(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + assert "65.0%" in html + assert "40.0%" in html + + def test_vc1_diff_content_present(self, opt_result: OptimizeResult) -> None: + """Diff output must appear for at least one iteration (iter 1 vs baseline).""" + html = render_html(opt_result) + # difflib will produce +/- lines; they land in diff-add/diff-rem spans + assert "diff-add" in html or "diff-rem" in html + + def test_vc1_header_contains_skill(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + assert "map-plan" in html + + def test_vc1_header_contains_seed(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + assert "42" in html + + def test_vc1_header_contains_n_train_n_test(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + assert "10" in html + assert "5" in html + + def test_vc1_header_contains_winning_iteration(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + assert "1" in html # winning_iteration=1 + + +# --------------------------------------------------------------------------- +# VC2 — overfit rows: marker present ONLY on overfit rows +# --------------------------------------------------------------------------- + + +class TestVC2OverfitHighlight: + """Validate that row-overfit class appears EXACTLY on overfit rows. + + Strategy: count occurrences (which appear as + row-level attributes), and also verify a non-overfit row's does + NOT carry it. The CSS rule .row-overfit in ", html, re.DOTALL) + assert style_match is not None + style_block = style_match.group(1) + # CSS selector uses dot notation — no class="..." in CSS + assert 'class="row-overfit"' not in style_block + + def test_vc2_result_with_no_overfit_has_zero_markers(self) -> None: + """Sanity: no overfit iterations → zero markers.""" + result = OptimizeResult( + skill="test-skill", + eval_set_path="evals/test.json", + seed=0, + n_train=5, + n_test=5, + baseline_description="baseline", + winning_description="baseline", + winning_iteration=0, + no_improvement=True, + iterations=[ + OptimizeIterationRecord( + iteration=0, + candidate_description="baseline", + train_pass_rate=0.5, + test_pass_rate=0.5, + overfit=False, + ) + ], + ) + html = render_html(result) + assert len(re.findall(r']*class="row-overfit"', html)) == 0 + + +# --------------------------------------------------------------------------- +# VC3 — purity: no subprocess, no anthropic in viewer module source +# --------------------------------------------------------------------------- + + +class TestVC3Purity: + _VIEWER_PATH = ( + Path(__file__).parent.parent + / "src" + / "mapify_cli" + / "skills_eval" + / "viewer.py" + ) + + def _get_viewer_source(self) -> str: + return self._VIEWER_PATH.read_text(encoding="utf-8") + + def _get_viewer_imports(self) -> list[str]: + """Return all top-level and lazy import names via AST.""" + source = self._get_viewer_source() + tree = ast.parse(source) + names: list[str] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + names.append(alias.name) + elif isinstance(node, ast.ImportFrom): + if node.module: + names.append(node.module) + return names + + def test_vc3_no_subprocess_import(self) -> None: + source = self._get_viewer_source() + assert "subprocess" not in source + + def test_vc3_no_anthropic_import(self) -> None: + source = self._get_viewer_source() + assert "import anthropic" not in source + + def test_vc3_render_requires_no_claude(self, opt_result: OptimizeResult) -> None: + """render_html must complete without any claude/subprocess call.""" + html = render_html(opt_result) + assert len(html) > 0 + + def test_vc3_viewer_module_imports_only_allowed(self) -> None: + """Allowed top-level imports: __future__, difflib, pathlib, + mapify_cli.skills_eval.eval_schema, jinja2 (lazy). + """ + imports = self._get_viewer_imports() + allowed_prefixes = ( + "__future__", + "difflib", + "pathlib", + "mapify_cli.skills_eval.eval_schema", + "jinja2", + ) + for name in imports: + assert any(name.startswith(p) for p in allowed_prefixes), ( + f"Unexpected import in viewer.py: {name!r}" + ) + + +# --------------------------------------------------------------------------- +# Security — autoescape XSS +# --------------------------------------------------------------------------- + + +class TestSecurity: + def test_xss_payload_is_escaped(self, opt_result: OptimizeResult) -> None: + html = render_html(opt_result) + # The raw " not in html + + +# --------------------------------------------------------------------------- +# proposal_failed graceful render +# --------------------------------------------------------------------------- + + +class TestProposalFailed: + def test_proposal_failed_renders_without_raising( + self, opt_result: OptimizeResult + ) -> None: + html = render_html(opt_result) + # The "proposal failed" text must appear for the failed iteration + assert "proposal failed" in html.lower() + + def test_proposal_failed_no_crash_standalone(self) -> None: + """A result with only a proposal_failed iteration must not raise.""" + result = OptimizeResult( + skill="s", + eval_set_path="e", + seed=1, + n_train=3, + n_test=3, + baseline_description="base", + winning_description="base", + winning_iteration=0, + no_improvement=True, + iterations=[ + OptimizeIterationRecord( + iteration=0, + candidate_description=None, + train_pass_rate=0.0, + test_pass_rate=0.0, + proposal_failed=True, + ) + ], + ) + html = render_html(result) + assert isinstance(html, str) + assert "proposal failed" in html.lower() + + +# --------------------------------------------------------------------------- +# render_to_path +# --------------------------------------------------------------------------- + + +class TestRenderToPath: + def test_render_to_path_writes_file( + self, opt_result: OptimizeResult, tmp_path: Path + ) -> None: + out = tmp_path / "report.html" + render_to_path(opt_result, out) + assert out.exists() + content = out.read_text(encoding="utf-8") + assert " None: + out = tmp_path / "report2.html" + render_to_path(opt_result, out) + assert out.read_text(encoding="utf-8") == render_html(opt_result)