From f64e25dbfb68bba060acb5790cd123158cf22dc5 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Mon, 23 Feb 2026 20:30:41 -0800
Subject: [PATCH 01/21] add config-as-checkpoint, error classification, and
 per-trial writes to eval harness

- store run_config in checkpoints; warn on resume if config mismatches
- classify errors as infra/timeout/genuine; skip genuine failures on resume
- add --retry-all flag to override and re-run genuine failures too
- write per-trial JSON to results/trials/ for ls-level observability
- 13 new tests covering all three features

Entire-Checkpoint: 7fb98ee370d1
---
 eval-harness/lib/cli.py           | 154 +++++++++++++----
 eval-harness/lib/reporter.py      | 122 ++++++++++++--
 eval-harness/lib/stats.py         |   7 +-
 eval-harness/tests/test_resume.py | 263 ++++++++++++++++++++++++++++--
 4 files changed, 484 insertions(+), 62 deletions(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 71f9580..1f459e1 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -15,7 +15,7 @@
 from lib.models import TaskFile
 from lib.task_runner import TaskRunner, TaskResult, Condition, PreValidationCache
 from lib.reporter import Reporter, EvalResults
-from lib.stats import wilson_score_interval, ci_overlap
+from lib.stats import wilson_score_interval
 from lib.git_scanner import GitScanner
 from lib.git_ops import clone_repo, checkout_commit
 from lib.index_cache import IndexCache
@@ -26,14 +26,23 @@
 _print_lock = threading.Lock()
 
 
-def _load_prior_results(json_path: str) -> tuple[set[tuple[str, str]], dict]:
-    """Load prior results JSON file and identify passed (task_id, condition) pairs.
+def _load_prior_results(json_path: str) -> tuple[set[tuple[str, str]], set[tuple[str, str]], dict]:
+    """Load prior results JSON and classify (task_id, condition) pairs.
 
-    A condition is "passed" if success=True and no error field exists at the
-    condition level. Works for both single-run and multi-run formats:
+    Returns:
+        passed: pairs where success=True and no error (carry forward)
+        genuine_failures: pairs that failed due to test failures, not infra
+            (skip by default on resume — retrying won't help)
+        data: raw prior data dict
+
+    Works for both single-run and multi-run formats:
     - Single-run: checks top-level success + absence of error
-    - Multi-run: checks aggregate success (majority pass) — individual run
-      errors don't produce a top-level error field, so the check works as-is
+    - Multi-run: checks aggregate success (majority pass)
+
+    Classification:
+    - passed: success=True, no error → carry forward
+    - infra error: error starts with INFRA_ERROR_PREFIXES → retry
+    - genuine failure: everything else (test failures, timeouts) → skip by default
     """
     try:
         with open(json_path) as f:
@@ -48,6 +57,8 @@ def _load_prior_results(json_path: str) -> tuple[set[tuple[str, str]], dict]:
         raise click.ClickException(f"Invalid results file: 'results' must be a list in {json_path}")
 
     passed = set()
+    genuine_failures = set()
+
     for i, task in enumerate(data["results"]):
         if not isinstance(task, dict) or "task_id" not in task:
             raise click.ClickException(
@@ -58,10 +69,25 @@ def _load_prior_results(json_path: str) -> tuple[set[tuple[str, str]], dict]:
             cond_data = task.get(cond_key)
             if cond_data is None:
                 continue
+
             if cond_data.get("success") is True and "error" not in cond_data:
                 passed.add((task_id, cond_key))
+                continue
+
+            # Not passed — classify as infra error or genuine failure.
+            # Multi-run: if any individual run had an infra error, the
+            # whole condition is worth retrying (more valid runs = better stats).
+            is_infra = _is_infra_error_dict(cond_data)
+            if not is_infra and "runs" in cond_data:
+                is_infra = any(
+                    (r.get("error") or "").startswith(Reporter.INFRA_ERROR_PREFIXES)
+                    for r in cond_data["runs"]
+                )
+
+            if not is_infra:
+                genuine_failures.add((task_id, cond_key))
 
-    return passed, data
+    return passed, genuine_failures, data
 
 
 def _is_infra_error_dict(cond_data: dict) -> bool:
@@ -159,8 +185,9 @@ def _merge_results(new_results: 'EvalResults', prior_data: dict, passed_pairs: s
 def _recompute_summary(merged_results: list[dict]) -> dict:
     """Recompute summary stats from merged result dicts.
 
-    Mirrors Reporter._compute_summary: success rates, Wilson Score CIs
-    for multi-run data, and significance flags via CI overlap.
+    Mirrors Reporter._compute_summary: success rates and Wilson Score CIs
+    for multi-run data. Significance flags (from McNemar) are NOT recomputed
+    here — they are carried forward from the original compilation.
     """
     cond_stats: dict[str, dict] = {
         "none": {"successes": 0, "total": 0, "assigned": 0},
@@ -229,17 +256,10 @@ def itt_rate(stats):
                     "upper": round(ci_upper, 3),
                 }
 
-        # Significance: check CI overlap between none and each treatment
-        none_ci = summary.get("none_ci_90")
-        if none_ci:
-            for treatment in ("flat_llm", "intent_layer"):
-                t_ci = summary.get(f"{treatment}_ci_90")
-                if t_ci:
-                    overlaps = ci_overlap(
-                        (none_ci["lower"], none_ci["upper"]),
-                        (t_ci["lower"], t_ci["upper"]),
-                    )
-                    summary[f"{treatment}_vs_none_significant"] = not overlaps
+        # Note: significance flags are derived from McNemar in the reporter's
+        # _compute_summary. _recompute_summary doesn't have access to raw
+        # TaskResults for McNemar pairing, so significance flags from merged
+        # results are carried forward from the original compilation.
 
     return summary
 
@@ -360,8 +380,10 @@ def scan(repo, output, since, limit, docker_image, setup, test_command, branch):
 @click.option("--repetitions", "-n", default=1,
               help="Number of times to repeat each task/condition pair (default: 1)")
 @click.option("--resume", default=None, type=click.Path(exists=True),
-              help="Prior results JSON — skip passed pairs, re-run failures")
-def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, verbose, clear_cache, no_cache, cache_dir, condition, model, repetitions, resume):
+              help="Prior results JSON — skip passed pairs, re-run infra errors")
+@click.option("--retry-all", is_flag=True,
+              help="With --resume: also retry genuine failures, not just infra errors")
+def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, verbose, clear_cache, no_cache, cache_dir, condition, model, repetitions, resume, retry_all):
     """Run eval on task files."""
     # Validate task files exist
     for task_path in tasks:
@@ -400,14 +422,47 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve
 
     # Filter out passed pairs from prior run
     passed_pairs = set()
+    genuine_fail_pairs = set()
     prior_data = None
     pre_validated_tasks: frozenset[str] = frozenset()
     if resume:
-        passed_pairs, prior_data = _load_prior_results(resume)
+        passed_pairs, genuine_fail_pairs, prior_data = _load_prior_results(resume)
         pre_validated_tasks = _load_pre_validated_tasks(prior_data)
+
+        # Config compatibility check: warn (never abort) if prior run used different settings
+        prior_config = prior_data.get("run_config")
+        if prior_config:
+            mismatches = []
+            current_task_ids = sorted(set(t.id for _, t in all_tasks))
+            prior_task_ids = sorted(prior_config.get("task_ids", []))
+            if current_task_ids != prior_task_ids:
+                mismatches.append(f"tasks: {len(prior_task_ids)} prior vs {len(current_task_ids)} current")
+            if prior_config.get("repetitions") != repetitions:
+                mismatches.append(f"repetitions: {prior_config.get('repetitions')} prior vs {repetitions} current")
+            if prior_config.get("timeout") != timeout:
+                mismatches.append(f"timeout: {prior_config.get('timeout')} prior vs {timeout} current")
+            prior_conds = sorted(prior_config.get("conditions", []))
+            current_conds = sorted(c.value for c in conditions)
+            if prior_conds != current_conds:
+                mismatches.append(f"conditions: {prior_conds} prior vs {current_conds} current")
+            if mismatches:
+                click.echo(f"\u26a0 Config mismatch with prior run ({', '.join(mismatches)})")
+
+        # By default, skip both passed pairs AND genuine failures.
+        # Genuine failures (test ran, code didn't fix it) won't improve on retry.
+        # Only infra errors (harness/Docker/network problems) are retried.
+        # Use --retry-all to also retry genuine failures.
+        skip_pairs = passed_pairs.copy()
+        if not retry_all:
+            skip_pairs |= genuine_fail_pairs
+
         original_len = len(work_queue)
-        work_queue = [item for item in work_queue if (item[1].id, item[2].value) not in passed_pairs]
-        click.echo(f"Resume: {len(passed_pairs)} passed pairs carried forward, {len(work_queue)}/{original_len} to re-run")
+        work_queue = [item for item in work_queue if (item[1].id, item[2].value) not in skip_pairs]
+        # Remaining items are infra-errored or never-run pairs (plus genuine failures under --retry-all).
+
+        click.echo(f"Resume: {len(passed_pairs)} passed (carried forward), "
+                   f"{len(genuine_fail_pairs)} genuine failures ({'retrying' if retry_all else 'skipped'}), "
+                   f"{len(work_queue)} to re-run")
         if pre_validated_tasks:
             click.echo(f"Resume: {len(pre_validated_tasks)} task(s) will skip pre-validation")
 
@@ -521,6 +576,21 @@ def run_single(item):
         budget_threshold = int(preflight_budget["remaining_tokens"] * 0.8)
     budget_warned = False
 
+    # Pre-create reporter for incremental checkpoints.
+    # eval_id assigned now so checkpoint filenames are stable across the run.
+    reporter = Reporter(output)
+    eval_id = datetime.now().strftime("%Y-%m-%d-%H%M%S")
+
+    # Snapshot run config for checkpoint verification on --resume
+    run_config = {
+        "task_ids": sorted(set(t.id for _, t in all_tasks)),
+        "conditions": sorted(c.value for c in conditions),
+        "repetitions": repetitions,
+        "timeout": timeout,
+        "model": model,
+        "task_files": [str(Path(t).resolve()) for t in tasks],
+    }
+
     with ThreadPoolExecutor(max_workers=parallel) as executor:
         futures = {executor.submit(run_single, item): item for item in work_queue}
 
@@ -575,6 +645,20 @@ def run_single(item):
                     for l in tail:
                         click.echo(f"      {l}", err=True)
 
+            # Per-trial result file for ls-level observability
+            try:
+                reporter.write_trial(result)
+            except Exception as e:
+                click.echo(f"  Warning: trial write failed: {e}", err=True)
+
+            # Incremental checkpoint — resume-compatible JSON written after each result
+            try:
+                checkpoint_path = reporter.write_checkpoint(results, eval_id, run_config=run_config)
+                if len(results) == 1:
+                    click.echo(f"  Checkpoint: {checkpoint_path} (updated after each result)")
+            except Exception as e:
+                click.echo(f"  Warning: checkpoint write failed: {e}", err=True)
+
             # Mid-run budget checkpoint (one-time warning)
             if budget_threshold and not budget_warned and preflight_budget:
                 cumulative_tokens = sum(r.input_tokens + r.output_tokens for r in results)
@@ -587,19 +671,31 @@ def run_single(item):
 
     # Generate reports — capture postflight budget in cli (not reporter)
     postflight_budget = get_budget_status()
-    reporter = Reporter(output)
     eval_results = reporter.compile_results(
         results, preflight_budget=preflight_budget, postflight_budget=postflight_budget
     )
+    # Use the pre-assigned eval_id so final report matches the checkpoint lineage
+    eval_results = replace(
+        eval_results,
+        eval_id=eval_id,
+        timestamp=eval_results.timestamp,
+        run_config=run_config,
+    )
 
-    # Merge with prior results if resuming
+    # Merge with prior results if resuming.
+    # Carry forward both passed pairs AND genuine failures (unless --retry-all
+    # was used, in which case genuine failures were re-run and are in new results).
     if prior_data is not None:
-        eval_results = _merge_results(eval_results, prior_data, passed_pairs)
+        carry_forward = passed_pairs | (genuine_fail_pairs if not retry_all else set())
+        eval_results = _merge_results(eval_results, prior_data, carry_forward)
         eval_results.summary["resumed_from"] = prior_data.get("eval_id")
 
     json_path = reporter.write_json(eval_results)
     md_path = reporter.write_markdown(eval_results)
 
+    # Remove checkpoint now that final results are written
+    reporter.remove_checkpoint(eval_id)
+
     click.echo(f"\nResults written to:")
     click.echo(f"  JSON: {json_path}")
     click.echo(f"  Markdown: {md_path}")
diff --git a/eval-harness/lib/reporter.py b/eval-harness/lib/reporter.py
index 36d38f5..9b307c0 100644
--- a/eval-harness/lib/reporter.py
+++ b/eval-harness/lib/reporter.py
@@ -19,6 +19,7 @@ class EvalResults:
     results: list[dict[str, Any]]
     summary: dict[str, Any]
     budget: dict[str, Any] | None = None
+    run_config: dict[str, Any] | None = None
 
 
 class Reporter:
@@ -354,21 +355,23 @@ def median_tokens(task_results: list[TaskResult]) -> int:
                         "upper": round(ci_upper, 3),
                     }
 
-            # Significance: check CI overlap between none and each treatment
-            none_ci = summary.get("none_ci_90")
-            if none_ci:
-                for treatment in ("flat_llm", "intent_layer"):
-                    t_ci = summary.get(f"{treatment}_ci_90")
-                    if t_ci:
-                        overlaps = ci_overlap(
-                            (none_ci["lower"], none_ci["upper"]),
-                            (t_ci["lower"], t_ci["upper"]),
-                        )
-                        summary[f"{treatment}_vs_none_significant"] = not overlaps
+            # Significance: derived from McNemar (paired test), not CI overlap.
+            # CI overlap is a visual heuristic only — it's not a valid test
+            # for paired data and maps unreliably to p-values.
+            # McNemar p-values are populated below in _compute_mcnemar.
 
         # McNemar's paired analysis: compare conditions per (task, rep) pair
         summary["mcnemar"] = self._compute_mcnemar(results)
 
+        # Derive significance flags from McNemar p-values (paired test).
+        # Only for multi-run data — single-run has too few pairs to be meaningful.
+        if has_multi_run:
+            for treatment in ("flat_llm", "intent_layer"):
+                key = f"{treatment}_vs_none"
+                mcnemar_entry = summary["mcnemar"].get(key)
+                if mcnemar_entry and mcnemar_entry["n_discordant"] > 0:
+                    summary[f"{treatment}_vs_none_significant"] = mcnemar_entry["p_value"] < 0.05
+
         return summary
 
     def _compute_mcnemar(self, results: list[TaskResult]) -> dict:
@@ -419,6 +422,86 @@ def _compute_mcnemar(self, results: list[TaskResult]) -> dict:
 
         return mcnemar_results
 
+    def write_checkpoint(
+        self,
+        results: list['TaskResult'],
+        eval_id: str,
+        run_config: dict | None = None,
+    ) -> str:
+        """Write incremental checkpoint after each task result.
+
+        Produces a --resume-compatible JSON file so that a killed run can
+        be continued with the same command plus --resume <checkpoint>.
+        The checkpoint is overwritten after each result, keeping only the
+        latest snapshot.
+
+        run_config: snapshot of the CLI flags (tasks, conditions, reps, timeout)
+        so --resume can detect incompatible configs before mixing results.
+        """
+        checkpoint_path = self.output_dir / f"in-progress-{eval_id}.json"
+        compiled = self.compile_results(results)
+        # Stamp with the pre-assigned eval_id so resume traces lineage
+        data = asdict(compiled)
+        data["eval_id"] = eval_id
+        data["checkpoint"] = True
+        data["completed_runs"] = len(results)
+        if run_config is not None:
+            data["run_config"] = run_config
+
+        # Atomic write: write to tmp, then replace() so overwriting also works on Windows
+        tmp_path = checkpoint_path.with_suffix(f".tmp.{id(results)}")
+        with open(tmp_path, "w") as f:
+            json.dump(data, f, indent=2)
+        tmp_path.replace(checkpoint_path)
+
+        return str(checkpoint_path)
+
+    def remove_checkpoint(self, eval_id: str) -> None:
+        """Delete the in-progress checkpoint for eval_id; a missing file is not an error."""
+        stale_checkpoint = self.output_dir / f"in-progress-{eval_id}.json"
+        stale_checkpoint.unlink(missing_ok=True)
+
+    def write_trial(self, result: 'TaskResult') -> str:
+        """Write a per-trial result file for ls-level observability.
+
+        Creates results/trials/<task_id>-<condition>-r<rep>.json so you
+        can see exactly which trials completed by listing a directory.
+        Each file is small (~1KB) and written atomically.
+        """
+        trials_dir = self.output_dir / "trials"
+        trials_dir.mkdir(parents=True, exist_ok=True)
+
+        filename = f"{result.task_id}-{result.condition.value}-r{result.rep}.json"
+        trial_path = trials_dir / filename
+
+        data = {
+            "task_id": result.task_id,
+            "condition": result.condition.value,
+            "rep": result.rep,
+            "success": result.success,
+            "wall_clock_seconds": result.wall_clock_seconds,
+            "input_tokens": result.input_tokens,
+            "output_tokens": result.output_tokens,
+            "tool_calls": result.tool_calls,
+            "lines_changed": result.lines_changed,
+        }
+        if result.error:
+            data["error"] = result.error
+            data["error_class"] = (
+                "infra" if result.error.startswith(self.INFRA_ERROR_PREFIXES)
+                else "timeout" if result.error.startswith("[timeout]")
+                else "genuine"
+            )
+
+        # Atomic write (replace() overwrites an existing trial file on every OS)
+        import os
+        tmp_path = trial_path.with_suffix(f".tmp.{os.getpid()}")
+        with open(tmp_path, "w") as f:
+            json.dump(data, f, indent=2)
+        tmp_path.replace(trial_path)
+
+        return str(trial_path)
+
     def write_json(self, results: EvalResults) -> str:
         """Write results to JSON file."""
         path = self.output_dir / f"{results.eval_id}.json"
@@ -490,16 +573,20 @@ def write_markdown(self, results: EvalResults) -> str:
         # Significance flags
         if has_cis:
             lines.append("")
+            mcnemar_data = summary.get("mcnemar", {})
             for treatment, display_name in [
                 ("flat_llm", "Flat LLM"),
                 ("intent_layer", "Intent Layer"),
             ]:
                 sig_key = f"{treatment}_vs_none_significant"
-                if sig_key in summary:
+                mcnemar_entry = mcnemar_data.get(f"{treatment}_vs_none")
+                if sig_key in summary and mcnemar_entry:
+                    p = mcnemar_entry["p_value"]
+                    n_disc = mcnemar_entry["n_discordant"]
                     if summary[sig_key]:
-                        lines.append(f"- **{display_name} vs None:** significant (non-overlapping CIs)")
+                        lines.append(f"- **{display_name} vs None:** significant (McNemar p={p:.3f}, {n_disc} discordant pairs)")
                     else:
-                        lines.append(f"- **{display_name} vs None:** not significant (overlapping CIs)")
+                        lines.append(f"- **{display_name} vs None:** not significant (McNemar p={p:.3f}, {n_disc} discordant pairs)")
 
             # CI width as variance proxy
             widths = []
@@ -543,7 +630,8 @@ def write_markdown(self, results: EvalResults) -> str:
             task_id = r["task_id"]
             deltas = r.get("deltas", {})
 
-            # Pre-compute per-task CI comparison for IL vs none
+            # Per-task CI comparison for IL vs none (visual heuristic only —
+            # aggregate significance comes from McNemar in the summary)
             none_data = r.get("none")
             il_data = r.get("intent_layer")
             il_vs_none = ""
@@ -558,8 +646,8 @@ def write_markdown(self, results: EvalResults) -> str:
                         (none_ci["lower"], none_ci["upper"]),
                         (il_ci["lower"], il_ci["upper"]),
                     )
-                    sig_label = "overlap" if overlaps else "sig."
-                    il_vs_none = f"{diff:+.0%} ({sig_label})"
+                    ci_label = "CIs overlap" if overlaps else "CIs disjoint"
+                    il_vs_none = f"{diff:+.0%} ({ci_label})"
 
             for cond_key in ("none", "flat_llm", "intent_layer"):
                 cond_data = r.get(cond_key)
diff --git a/eval-harness/lib/stats.py b/eval-harness/lib/stats.py
index 560d331..750a70c 100644
--- a/eval-harness/lib/stats.py
+++ b/eval-harness/lib/stats.py
@@ -102,10 +102,11 @@ def wilson_score_interval(
 
 
 def ci_overlap(ci_a: tuple[float, float], ci_b: tuple[float, float]) -> bool:
-    """Check if two confidence intervals overlap.
+    """Check if two confidence intervals overlap (visual heuristic only).
 
-    Returns True if the intervals share any range. Non-overlapping CIs
-    at 90% confidence suggest a statistically meaningful difference.
+    Returns True if the intervals share any range. Used for dashboard
+    display, NOT as a significance test. For significance decisions, use
+    McNemar's exact test (paired data) or Fisher's exact test (unpaired).
     """
     return ci_a[0] <= ci_b[1] and ci_b[0] <= ci_a[1]
 
diff --git a/eval-harness/tests/test_resume.py b/eval-harness/tests/test_resume.py
index 4ea57d5..963bbf1 100644
--- a/eval-harness/tests/test_resume.py
+++ b/eval-harness/tests/test_resume.py
@@ -9,7 +9,8 @@
 import pytest
 
 from lib.cli import _load_prior_results, _merge_results, _recompute_summary, _is_infra_error_dict
-from lib.reporter import EvalResults
+from lib.reporter import EvalResults, Reporter
+from lib.task_runner import TaskResult, Condition
 
 
 def _write_json(data: dict) -> str:
@@ -116,7 +117,7 @@ def test_identifies_passed_pairs(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, data = _load_prior_results(path)
+        passed, genuine, data = _load_prior_results(path)
 
         assert ("task-1", "none") in passed
         assert ("task-1", "flat_llm") not in passed
@@ -132,9 +133,10 @@ def test_excludes_infra_errors(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert len(passed) == 0
+        assert len(genuine) == 0  # infra errors are neither passed nor genuine
 
     def test_validates_structure(self):
         path = _write_json({"bad": "data"})
@@ -173,7 +175,7 @@ def test_handles_null_conditions(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert passed == {("task-1", "none")}
 
@@ -186,7 +188,7 @@ def test_multi_run_passing_is_carried_forward(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert ("task-1", "none") in passed
         assert ("task-1", "flat_llm") not in passed
@@ -201,7 +203,7 @@ def test_multi_run_failing_not_carried_forward(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert len(passed) == 0
 
@@ -214,7 +216,7 @@ def test_mixed_single_and_multi_run(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert ("task-1", "none") in passed
         assert ("task-1", "flat_llm") not in passed
@@ -234,7 +236,7 @@ def test_multiple_tasks(self):
              "deltas": {}},
         ])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert len(passed) == 3  # all 3 conditions of task-1
         assert all(tid == "task-1" for tid, _ in passed)
@@ -321,9 +323,12 @@ def test_multi_run_produces_cis(self):
         assert "upper" in none_ci
         assert 0 <= none_ci["lower"] <= none_ci["upper"] <= 1
 
-        # Significance flags should exist
-        assert "flat_llm_vs_none_significant" in summary
-        assert "intent_layer_vs_none_significant" in summary
+        # Significance flags are NOT produced by _recompute_summary —
+        # they require McNemar pairing from raw TaskResults, which only
+        # the reporter's _compute_summary has access to. Flags from
+        # merged results are carried forward from the original compilation.
+        assert "flat_llm_vs_none_significant" not in summary
+        assert "intent_layer_vs_none_significant" not in summary
 
 
 # --- _merge_results tests ---
@@ -628,7 +633,7 @@ def test_genuine_failure_not_carried_forward(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert ("task-1", "none") not in passed
         assert ("task-1", "flat_llm") in passed
@@ -648,7 +653,239 @@ def test_success_with_error_field_not_carried(self):
             "deltas": {},
         }])
         path = _write_json(prior)
-        passed, _ = _load_prior_results(path)
+        passed, genuine, _ = _load_prior_results(path)
 
         assert ("task-1", "none") not in passed
         assert ("task-1", "flat_llm") in passed
+
+
+# --- Checkpoint tests ---
+
+class TestCheckpoint:
+    def _make_task_result(self, task_id, condition, success, rep=0):
+        return TaskResult(
+            task_id=task_id, condition=condition, success=success,
+            test_output="ok" if success else "fail",
+            wall_clock_seconds=100, input_tokens=500, output_tokens=200,
+            tool_calls=10, lines_changed=5, files_touched=["a.py"], rep=rep,
+        )
+
+    def test_checkpoint_is_resume_compatible(self):
+        """A checkpoint file can be loaded by _load_prior_results."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            results = [
+                self._make_task_result("task-1", Condition.NONE, True),
+                self._make_task_result("task-1", Condition.FLAT_LLM, False),
+            ]
+            checkpoint_path = reporter.write_checkpoint(results, "test-001")
+
+            passed, genuine, data = _load_prior_results(checkpoint_path)
+            assert ("task-1", "none") in passed
+            assert ("task-1", "flat_llm") not in passed
+            assert data["checkpoint"] is True
+            assert data["completed_runs"] == 2
+
+    def test_checkpoint_accumulates(self):
+        """Each checkpoint overwrites the previous, growing the result set."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            results = [self._make_task_result("task-1", Condition.NONE, True)]
+            reporter.write_checkpoint(results, "test-002")
+
+            results.append(self._make_task_result("task-1", Condition.FLAT_LLM, True))
+            checkpoint_path = reporter.write_checkpoint(results, "test-002")
+
+            passed, genuine, data = _load_prior_results(checkpoint_path)
+            assert ("task-1", "none") in passed
+            assert ("task-1", "flat_llm") in passed
+            assert data["completed_runs"] == 2
+
+    def test_checkpoint_includes_run_config(self):
+        """Checkpoint stores run_config for resume validation."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            results = [self._make_task_result("task-1", Condition.NONE, True)]
+            config = {"task_ids": ["task-1"], "conditions": ["none"], "repetitions": 5, "timeout": 1800}
+            checkpoint_path = reporter.write_checkpoint(results, "test-cfg", run_config=config)
+
+            with open(checkpoint_path) as f:
+                data = json.load(f)
+            assert data["run_config"] == config
+
+    def test_remove_checkpoint(self):
+        """remove_checkpoint deletes the in-progress file."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            results = [self._make_task_result("task-1", Condition.NONE, True)]
+            path = reporter.write_checkpoint(results, "test-003")
+            assert Path(path).exists()
+
+            reporter.remove_checkpoint("test-003")
+            assert not Path(path).exists()
+
+    def test_remove_checkpoint_idempotent(self):
+        """remove_checkpoint doesn't error if file already gone."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            reporter.remove_checkpoint("nonexistent-id")  # should not raise
+
+
+# --- Error classification tests ---
+
+class TestErrorClassification:
+    def test_infra_errors_are_retryable(self):
+        """Infra errors should NOT be in genuine_failures (they get retried)."""
+        prior = _make_prior([{
+            "task_id": "task-1",
+            "none": _infra_error_condition(),
+            "flat_llm": _passing_condition(),
+            "intent_layer": _infra_error_condition(),
+            "deltas": {},
+        }])
+        path = _write_json(prior)
+        passed, genuine, _ = _load_prior_results(path)
+
+        assert ("task-1", "flat_llm") in passed
+        # Infra errors: not passed, not genuine → retried
+        assert ("task-1", "none") not in passed
+        assert ("task-1", "none") not in genuine
+        assert ("task-1", "intent_layer") not in passed
+        assert ("task-1", "intent_layer") not in genuine
+
+    def test_genuine_failures_classified(self):
+        """Test failures and timeouts are genuine failures (skipped on resume)."""
+        prior = _make_prior([{
+            "task_id": "task-1",
+            "none": _genuine_failure_condition(),
+            "flat_llm": _failing_condition(),  # timeout
+            "intent_layer": _passing_condition(),
+            "deltas": {},
+        }])
+        path = _write_json(prior)
+        passed, genuine, _ = _load_prior_results(path)
+
+        assert ("task-1", "intent_layer") in passed
+        assert ("task-1", "none") in genuine
+        assert ("task-1", "flat_llm") in genuine
+
+    def test_mixed_infra_and_genuine(self):
+        """Mixed scenario: passed + infra + genuine all classified correctly."""
+        prior = _make_prior([{
+            "task_id": "task-1",
+            "none": _passing_condition(),         # passed
+            "flat_llm": _infra_error_condition(),  # infra → retry
+            "intent_layer": _genuine_failure_condition(),  # genuine → skip
+            "deltas": {},
+        }])
+        path = _write_json(prior)
+        passed, genuine, _ = _load_prior_results(path)
+
+        assert passed == {("task-1", "none")}
+        assert genuine == {("task-1", "intent_layer")}
+        # flat_llm infra error: not in either set → retried
+
+    def test_multi_run_with_infra_in_runs_is_retryable(self):
+        """Multi-run condition with infra error in individual runs → retry."""
+        multi_with_infra = {
+            "success_rate": 0.5, "success": True, "successes": 1,
+            "total_valid_runs": 2,
+            "runs": [
+                {"success": True, "test_output": "ok", "wall_clock_seconds": 10,
+                 "input_tokens": 100, "output_tokens": 50, "tool_calls": 5,
+                 "lines_changed": 3, "files_touched": ["a.py"]},
+                {"success": False, "test_output": "", "wall_clock_seconds": 0,
+                 "input_tokens": 0, "output_tokens": 0, "tool_calls": 0,
+                 "lines_changed": 0, "files_touched": [],
+                 "error": "[worker-crash] OOM killed"},
+            ],
+        }
+        prior = _make_prior([{
+            "task_id": "task-1",
+            "none": multi_with_infra,
+            "flat_llm": None,
+            "intent_layer": None,
+            "deltas": {},
+        }])
+        path = _write_json(prior)
+        passed, genuine, _ = _load_prior_results(path)
+
+        # Has infra error in runs → retryable, not genuine failure
+        assert ("task-1", "none") not in genuine
+
+
+# --- Trial file tests ---
+
+class TestWriteTrial:
+    def test_writes_trial_file(self):
+        """write_trial creates a per-trial JSON file."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            result = TaskResult(
+                task_id="fix-bug", condition=Condition.INTENT_LAYER, success=True,
+                test_output="ok", wall_clock_seconds=42.5, input_tokens=300,
+                output_tokens=150, tool_calls=8, lines_changed=3,
+                files_touched=["a.py"], rep=2,
+            )
+            path = reporter.write_trial(result)
+
+            assert Path(path).exists()
+            assert path.endswith("fix-bug-intent_layer-r2.json")
+            with open(path) as f:
+                data = json.load(f)
+            assert data["task_id"] == "fix-bug"
+            assert data["condition"] == "intent_layer"
+            assert data["rep"] == 2
+            assert data["success"] is True
+            assert "error" not in data
+
+    def test_trial_file_includes_error_class(self):
+        """Failed trials include error classification."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            result = TaskResult(
+                task_id="fix-bug", condition=Condition.NONE, success=False,
+                test_output="", wall_clock_seconds=0, input_tokens=0,
+                output_tokens=0, tool_calls=0, lines_changed=0,
+                files_touched=[], rep=0,
+                error="[pre-validation] test already passes",
+            )
+            path = reporter.write_trial(result)
+
+            with open(path) as f:
+                data = json.load(f)
+            assert data["error_class"] == "infra"
+
+    def test_trial_timeout_classified(self):
+        """Timeout errors are classified as 'timeout'."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            result = TaskResult(
+                task_id="fix-bug", condition=Condition.FLAT_LLM, success=False,
+                test_output="", wall_clock_seconds=300, input_tokens=0,
+                output_tokens=0, tool_calls=0, lines_changed=0,
+                files_touched=[], rep=0,
+                error="[timeout] Claude timed out after 300.0s",
+            )
+            path = reporter.write_trial(result)
+
+            with open(path) as f:
+                data = json.load(f)
+            assert data["error_class"] == "timeout"
+
+    def test_trial_genuine_failure_classified(self):
+        """Genuine test failures are classified as 'genuine'."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            reporter = Reporter(tmpdir)
+            result = TaskResult(
+                task_id="fix-bug", condition=Condition.NONE, success=False,
+                test_output="FAILED 3 tests", wall_clock_seconds=45,
+                input_tokens=500, output_tokens=200, tool_calls=12,
+                lines_changed=8, files_touched=["a.py"], rep=0,
+                error="tests failed with exit code 1",
+            )
+            path = reporter.write_trial(result)
+
+            with open(path) as f:
+                data = json.load(f)
+            assert data["error_class"] == "genuine"

From 4aae36a01ae8fe96ce4357a495d682f575aca1c2 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Mon, 23 Feb 2026 21:56:28 -0800
Subject: [PATCH 02/21] default eval model to sonnet for reproducibility

Entire-Checkpoint: 5384ed119f2a
---
 eval-harness/lib/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 1f459e1..82a1b00 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -375,8 +375,8 @@ def scan(repo, output, since, limit, docker_image, setup, test_command, branch):
 @click.option("--condition", "-c", multiple=True,
               type=click.Choice(["none", "flat_llm", "intent_layer"]),
               help="Conditions to run (default: all three)")
-@click.option("--model", default=None,
-              help="Claude model to use (e.g., claude-sonnet-4-5-20250929)")
+@click.option("--model", default="sonnet",
+              help="Claude model to use (default: sonnet)")
 @click.option("--repetitions", "-n", default=1,
               help="Number of times to repeat each task/condition pair (default: 1)")
 @click.option("--resume", default=None, type=click.Path(exists=True),

From b9810180db1cdf0a61d53c97bd2da8c9ed482952 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Mon, 23 Feb 2026 22:00:49 -0800
Subject: [PATCH 03/21] bump default parallelism from 2 to 8 workers

Entire-Checkpoint: 81278d45e27d
---
 eval-harness/lib/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 82a1b00..681dd91 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -362,7 +362,7 @@ def scan(repo, output, since, limit, docker_image, setup, test_command, branch):
 
 @main.command()
 @click.option("--tasks", "-t", multiple=True, required=True, help="Task YAML files")
-@click.option("--parallel", "-p", default=2, help="Number of parallel workers")
+@click.option("--parallel", "-p", default=8, help="Number of parallel workers")
 @click.option("--category", type=click.Choice(["simple_fix", "targeted_refactor", "complex_fix"]))
 @click.option("--output", "-o", default="results", help="Output directory")
 @click.option("--keep-workspaces", is_flag=True, help="Don't cleanup workspaces")
@@ -719,7 +719,7 @@ def run_single(item):
 
 @main.command()
 @click.option("--tasks", "-t", multiple=True, required=True, help="Task YAML files")
-@click.option("--parallel", "-p", default=2, help="Number of parallel workers")
+@click.option("--parallel", "-p", default=8, help="Number of parallel workers")
 @click.option("--timeout", default=300, help="Pre-validation timeout in seconds (default: 300)")
 @click.option("--verbose", "-v", is_flag=True, help="Show detailed progress")
 def validate(tasks, parallel, timeout, verbose):

From d5bc4159e6e96f2a4b355ac7c3903b5e382ce8e1 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 10:35:01 -0800
Subject: [PATCH 04/21] add circuit breaker to skip remaining reps after
 repeated failures

Pre-validation failures trip at task level (all conditions share
Docker setup), other failures trip at task+condition level.
Threshold of 2 accounts for in-flight parallel workers.

Entire-Checkpoint: b2252e9cb3d3
---
 eval-harness/lib/cli.py | 54 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 681dd91..9bf1d68 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -555,8 +555,52 @@ def _warmup_one(item):
                         click.echo(f"  {cond_str}: warmup failed - {e}", err=True)
                         click.echo(f"    (task runs will retry with their own timeout)", err=True)
 
+    # Circuit breaker: skip remaining reps once a task (or task+condition) keeps failing.
+    # Pre-validation failures trip at task level (all conditions share Docker
+    # setup), other failures trip at task+condition level.
+    _cb_counts: dict[tuple[str, ...], int] = {}
+    _cb_tripped: set[tuple[str, ...]] = set()
+    _cb_lock = threading.Lock()
+    CB_THRESHOLD = 2  # trip once a key accumulates this many failures (not necessarily consecutive)
+
+    def _cb_record(task_id: str, condition: str, error: str) -> bool:
+        """Record a failure. Returns True if newly tripped."""
+        with _cb_lock:
+            if "[pre-validation]" in error:
+                key = (task_id,)  # task-level — affects all conditions
+            else:
+                key = (task_id, condition)
+            _cb_counts[key] = _cb_counts.get(key, 0) + 1
+            if _cb_counts[key] >= CB_THRESHOLD:
+                newly = key not in _cb_tripped
+                _cb_tripped.add(key)
+                return newly
+        return False
+
+    def _cb_is_tripped(task_id: str, condition: str) -> bool:
+        with _cb_lock:
+            return (task_id,) in _cb_tripped or (task_id, condition) in _cb_tripped
+
     def run_single(item):
         repo, task, condition, rep = item
+
+        # Skip if circuit breaker already tripped for this task/condition
+        if _cb_is_tripped(task.id, condition.value):
+            return TaskResult(
+                task_id=task.id,
+                condition=condition,
+                success=False,
+                test_output="",
+                wall_clock_seconds=0,
+                input_tokens=0,
+                output_tokens=0,
+                tool_calls=0,
+                lines_changed=0,
+                files_touched=[],
+                rep=rep,
+                error=f"[circuit-breaker] skipped — repeated failures for this task"
+            )
+
         runner = TaskRunner(
             repo,
             str(workspaces_dir),
@@ -568,7 +612,15 @@ def run_single(item):
             claude_timeout=timeout,
             skip_pre_validation_for=pre_validated_tasks,
         )
-        return runner.run(task, condition, model=model, rep=rep)
+        result = runner.run(task, condition, model=model, rep=rep)
+
+        if result.error:
+            newly_tripped = _cb_record(task.id, condition.value, result.error)
+            if newly_tripped:
+                scope = task.id if "[pre-validation]" in result.error else f"{task.id}/{condition.value}"
+                click.echo(f"  \u26a1 Circuit breaker tripped for {scope} — skipping remaining reps")
+
+        return result
 
     # Mid-run budget tracking state
     budget_threshold = None

From a65cdde6c409b49047a46d746ad507b98d101ab3 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 10:44:24 -0800
Subject: [PATCH 05/21] add supervisor loop with auto-remediation for infra
 failures

After each batch, classifies failures as infra vs genuine. If infra
failures detected: checks Docker health, restarts if needed, reduces
parallelism, resets circuit breaker, and retries. Max 2 retry rounds.

Entire-Checkpoint: 806ec810be18
---
 eval-harness/lib/cli.py | 249 ++++++++++++++++++++++++++++------------
 1 file changed, 174 insertions(+), 75 deletions(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 9bf1d68..00580b9 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -643,83 +643,182 @@ def run_single(item):
         "task_files": [str(Path(t).resolve()) for t in tasks],
     }
 
-    with ThreadPoolExecutor(max_workers=parallel) as executor:
-        futures = {executor.submit(run_single, item): item for item in work_queue}
-
-        for future in as_completed(futures):
-            item = futures[future]
-            _repo, _task, _cond, rep = item
-            try:
-                result = future.result()
-            except Exception as e:
-                # Worker crashed (e.g., cache race, OOM) — record as infra error
-                click.echo(f"  {_task.id} ({_cond.value}): CRASH - {e}", err=True)
-                result = TaskResult(
-                    task_id=_task.id,
-                    condition=_cond,
-                    success=False,
-                    test_output="",
-                    wall_clock_seconds=0,
-                    input_tokens=0,
-                    output_tokens=0,
-                    tool_calls=0,
-                    lines_changed=0,
-                    files_touched=[],
-                    rep=rep,
-                    error=f"[worker-crash] {e}"
-                )
-            results.append(result)
-            status = "PASS" if result.success else "FAIL"
-            # Build the status line with error info if failed
-            rep_tag = f" [rep {rep+1}/{repetitions}]" if repetitions > 1 else ""
-            line = f"  {result.task_id} ({result.condition.value}){rep_tag}: {status}"
-            if not result.success:
-                if result.error:
-                    # Exception during execution - show first line
-                    error_line = result.error.split('\n')[0][:80]
-                    line += f" - {error_line}"
-                elif result.test_output:
-                    # Tests failed - extract last meaningful line from output
-                    output_lines = [l.strip() for l in result.test_output.strip().split('\n') if l.strip()]
-                    if output_lines:
-                        last_line = output_lines[-1][:80]
-                        line += f" - {last_line}"
-            click.echo(line)
-            # In verbose mode, show more error context for failures
-            if verbose and not result.success:
-                if result.error:
-                    click.echo(f"    Error: {result.error}", err=True)
-                elif result.test_output:
-                    # Show last 10 lines of test output
-                    output_lines = result.test_output.strip().split('\n')
-                    tail = output_lines[-10:] if len(output_lines) > 10 else output_lines
-                    click.echo("    Test output (last 10 lines):", err=True)
-                    for l in tail:
-                        click.echo(f"      {l}", err=True)
-
-            # Per-trial result file for ls-level observability
+    def _run_batch(batch: list, workers: int) -> list[TaskResult]:
+        """Run a batch of work items and return results."""
+        nonlocal budget_warned
+        batch_results: list[TaskResult] = []
+        with ThreadPoolExecutor(max_workers=workers) as executor:
+            futures = {executor.submit(run_single, item): item for item in batch}
+
+            for future in as_completed(futures):
+                item = futures[future]
+                _repo, _task, _cond, rep = item
+                try:
+                    result = future.result()
+                except Exception as e:
+                    # Worker crashed (e.g., cache race, OOM) — record as infra error
+                    click.echo(f"  {_task.id} ({_cond.value}): CRASH - {e}", err=True)
+                    result = TaskResult(
+                        task_id=_task.id,
+                        condition=_cond,
+                        success=False,
+                        test_output="",
+                        wall_clock_seconds=0,
+                        input_tokens=0,
+                        output_tokens=0,
+                        tool_calls=0,
+                        lines_changed=0,
+                        files_touched=[],
+                        rep=rep,
+                        error=f"[worker-crash] {e}"
+                    )
+                batch_results.append(result)
+                results.append(result)  # also accumulate globally for checkpoint
+                status = "PASS" if result.success else "FAIL"
+                # Build the status line with error info if failed
+                rep_tag = f" [rep {rep+1}/{repetitions}]" if repetitions > 1 else ""
+                line = f"  {result.task_id} ({result.condition.value}){rep_tag}: {status}"
+                if not result.success:
+                    if result.error:
+                        error_line = result.error.split('\n')[0][:80]
+                        line += f" - {error_line}"
+                    elif result.test_output:
+                        output_lines = [l.strip() for l in result.test_output.strip().split('\n') if l.strip()]
+                        if output_lines:
+                            last_line = output_lines[-1][:80]
+                            line += f" - {last_line}"
+                click.echo(line)
+                if verbose and not result.success:
+                    if result.error:
+                        click.echo(f"    Error: {result.error}", err=True)
+                    elif result.test_output:
+                        output_lines = result.test_output.strip().split('\n')
+                        tail = output_lines[-10:] if len(output_lines) > 10 else output_lines
+                        click.echo("    Test output (last 10 lines):", err=True)
+                        for tl in tail:
+                            click.echo(f"      {tl}", err=True)
+
+                try:
+                    reporter.write_trial(result)
+                except Exception as e:
+                    click.echo(f"  Warning: trial write failed: {e}", err=True)
+
+                try:
+                    checkpoint_path = reporter.write_checkpoint(results, eval_id, run_config=run_config)
+                    if len(results) == 1:
+                        click.echo(f"  Checkpoint: {checkpoint_path} (updated after each result)")
+                except Exception as e:
+                    click.echo(f"  Warning: checkpoint write failed: {e}", err=True)
+
+                if budget_threshold and not budget_warned and preflight_budget:
+                    cumulative_tokens = sum(r.input_tokens + r.output_tokens for r in results)
+                    if cumulative_tokens > budget_threshold:
+                        rem_fmt = fmt_tokens(preflight_budget.get('remaining_tokens', 0))
+                        cum_fmt = fmt_tokens(cumulative_tokens)
+                        click.echo(f"\n\u26a0 Budget checkpoint: {cum_fmt} tokens consumed so far (est. remaining: {rem_fmt} at start)\n")
+                        refresh_budget_snapshot()
+                        budget_warned = True
+
+        return batch_results
+
+    def _check_docker() -> bool:
+        """Return True if Docker daemon is responsive."""
+        import subprocess
+        try:
+            r = subprocess.run(["docker", "ps"], capture_output=True, timeout=10)
+            return r.returncode == 0
+        except Exception:
+            return False
+
+    def _restart_docker() -> bool:
+        """Attempt to restart Docker/OrbStack. Returns True if successful."""
+        import subprocess, time
+        click.echo("  Attempting Docker restart...")
+        # Launch OrbStack, then the Docker app (both via macOS `open -a`); launch errors are ignored
+        for cmd in [["open", "-a", "OrbStack"], ["open", "-a", "Docker"]]:
             try:
-                reporter.write_trial(result)
-            except Exception as e:
-                click.echo(f"  Warning: trial write failed: {e}", err=True)
+                subprocess.run(cmd, capture_output=True, timeout=10)
+            except Exception:
+                continue
+        # Wait for Docker to come up
+        for attempt in range(12):  # up to 60s
+            time.sleep(5)
+            if _check_docker():
+                click.echo(f"  Docker restarted successfully (waited {(attempt+1)*5}s)")
+                return True
+        click.echo("  Docker restart failed after 60s", err=True)
+        return False
 
-            # Incremental checkpoint — resume-compatible JSON written after each result
-            try:
-                checkpoint_path = reporter.write_checkpoint(results, eval_id, run_config=run_config)
-                if len(results) == 1:
-                    click.echo(f"  Checkpoint: {checkpoint_path} (updated after each result)")
-            except Exception as e:
-                click.echo(f"  Warning: checkpoint write failed: {e}", err=True)
-
-            # Mid-run budget checkpoint (one-time warning)
-            if budget_threshold and not budget_warned and preflight_budget:
-                cumulative_tokens = sum(r.input_tokens + r.output_tokens for r in results)
-                if cumulative_tokens > budget_threshold:
-                    rem_fmt = fmt_tokens(preflight_budget.get('remaining_tokens', 0))
-                    cum_fmt = fmt_tokens(cumulative_tokens)
-                    click.echo(f"\n\u26a0 Budget checkpoint: {cum_fmt} tokens consumed so far (est. remaining: {rem_fmt} at start)\n")
-                    refresh_budget_snapshot()
-                    budget_warned = True
+    def _cb_reset():
+        """Reset circuit breaker state between supervisor rounds."""
+        with _cb_lock:
+            _cb_counts.clear()
+            _cb_tripped.clear()
+
+    # ── Supervisor loop ──────────────────────────────────────────────
+    MAX_RETRY_ROUNDS = 2
+    current_batch = work_queue
+    current_workers = parallel
+
+    for supervisor_round in range(1 + MAX_RETRY_ROUNDS):
+        if supervisor_round > 0:
+            click.echo(f"\n{'='*60}")
+            click.echo(f"Supervisor retry round {supervisor_round}/{MAX_RETRY_ROUNDS}")
+            click.echo(f"{'='*60}")
+
+        batch_results = _run_batch(current_batch, current_workers)
+
+        # Classify failures from this batch
+        infra_results = [
+            r for r in batch_results
+            if r.error and (
+                r.error.startswith(Reporter.INFRA_ERROR_PREFIXES)
+                or r.error.startswith("[circuit-breaker]")
+            )
+        ]
+
+        if not infra_results:
+            break  # all clean or only genuine failures
+
+        # Build retry queue: find work items whose results were infra errors
+        infra_keys = {(r.task_id, r.condition.value, r.rep) for r in infra_results}
+        retry_queue = [
+            item for item in current_batch
+            if (item[1].id, item[2].value, item[3]) in infra_keys
+        ]
+
+        if not retry_queue or supervisor_round >= MAX_RETRY_ROUNDS:
+            if retry_queue:
+                click.echo(f"\n  {len(retry_queue)} infra failures remain after {MAX_RETRY_ROUNDS} retry rounds")
+            break
+
+        click.echo(f"\n  {len(infra_results)} infra failures detected, diagnosing...")
+
+        # Remove infra results from global list — they'll be replaced by retries
+        for r in infra_results:
+            if r in results:
+                results.remove(r)
+
+        # Diagnose and remediate
+        has_docker_failures = any(
+            "[pre-validation]" in r.error or "Docker" in r.error
+            for r in infra_results
+        )
+        if has_docker_failures:
+            if not _check_docker():
+                click.echo("  Docker is down!")
+                if not _restart_docker():
+                    click.echo("  Cannot recover Docker — aborting retries", err=True)
+                    results.extend(infra_results)  # put them back
+                    break
+            # Reduce parallelism to ease Docker contention
+            current_workers = max(2, current_workers // 2)
+            click.echo(f"  Reducing parallelism to {current_workers} workers")
+
+        # Reset circuit breaker for retry round
+        _cb_reset()
+        current_batch = retry_queue
+        click.echo(f"  Retrying {len(retry_queue)} items...")
 
     # Generate reports — capture postflight budget in cli (not reporter)
     postflight_budget = get_budget_status()

From 825709f33524bbca3c877f2c95188cbe9418340a Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 10:49:27 -0800
Subject: [PATCH 06/21] add file-based control plane for external eval
 supervision

Status file (.eval-status.json) updated after each result with
machine-readable state: workers, pass/fail rates, paused flag.
Control dir (.eval-control/) accepts commands: pause, resume,
set-workers N, skip-task <id>. Commands consumed on read.
Enables Ralph Loop or any external agent to manage running evals.

Entire-Checkpoint: 5f55311d2e2a
---
 eval-harness/lib/cli.py | 92 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 6 deletions(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 00580b9..e10bfae 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -643,14 +643,89 @@ def run_single(item):
         "task_files": [str(Path(t).resolve()) for t in tasks],
     }
 
+    # ── Control plane ─────────────────────────────────────────────
+    control_dir = Path(output) / ".eval-control"
+    control_dir.mkdir(parents=True, exist_ok=True)
+    status_path = Path(output) / ".eval-status.json"
+    _current_workers = parallel  # mutable via control commands
+
+    def _write_status(batch_total: int, completed: int, infra_fails: int,
+                      genuine_fails: int, passes: int, paused: bool):
+        """Write a machine-readable status snapshot to status_path for external supervisors."""
+        import time as _time
+        status = {
+            "eval_id": eval_id,
+            "timestamp": datetime.now().isoformat(),
+            "uptime_seconds": _time.time() - _start_time,
+            "workers": _current_workers,
+            "paused": paused,
+            "batch_total": batch_total,
+            "completed": completed,
+            "passes": passes,
+            "genuine_failures": genuine_fails,
+            "infra_failures": infra_fails,
+            "remaining": batch_total - completed,
+            "pass_rate": round(passes / max(completed - infra_fails, 1), 3),  # infra failures excluded from denominator
+            "infra_rate": round(infra_fails / max(completed, 1), 3),
+        }
+        tmp = status_path.with_suffix(f".tmp.{threading.get_ident()}")  # per-thread temp name
+        with open(tmp, "w") as f:
+            json.dump(status, f, indent=2)
+        tmp.replace(status_path)  # replace() overwrites an existing target on every platform
+
+    def _check_control() -> dict[str, str]:
+        """Read and consume control commands (regular, non-dot files only). Returns {command: value}."""
+        commands: dict[str, str] = {}
+        if not control_dir.exists():
+            return commands
+        for p in sorted(control_dir.iterdir()):
+            if p.name.startswith(".") or not p.is_file():  # a stray subdirectory must not crash the run
+                continue
+            val = p.read_text().strip()  # an empty file yields "" (e.g. pause/resume carry no value)
+            commands[p.name] = val
+            p.unlink()  # consume the command
+        return commands
+
+    _start_time = __import__("time").time()
+    _paused = False
+
     def _run_batch(batch: list, workers: int) -> list[TaskResult]:
         """Run a batch of work items and return results."""
-        nonlocal budget_warned
+        nonlocal budget_warned, _current_workers, _paused
         batch_results: list[TaskResult] = []
         with ThreadPoolExecutor(max_workers=workers) as executor:
             futures = {executor.submit(run_single, item): item for item in batch}
 
             for future in as_completed(futures):
+                # Check control commands between results
+                for cmd, val in _check_control().items():
+                    if cmd == "pause":
+                        _paused = True
+                        click.echo("\n  \u23f8 Paused by external supervisor")
+                    elif cmd == "resume":
+                        _paused = False
+                        click.echo("\n  \u25b6 Resumed by external supervisor")
+                    elif cmd == "set-workers":
+                        try:
+                            _current_workers = max(1, int(val))
+                            click.echo(f"\n  \u2699 Workers set to {_current_workers} (takes effect next batch)")
+                        except ValueError:
+                            click.echo(f"\n  Warning: invalid set-workers value: {val}", err=True)
+                    elif cmd == "skip-task":
+                        # Trip circuit breaker for this task across all conditions
+                        with _cb_lock:
+                            _cb_tripped.add((val,))
+                        click.echo(f"\n  \u23ed Skipping task {val} by external command")
+
+                # Honor pause: spin-wait until resumed
+                while _paused:
+                    import time as _t
+                    _t.sleep(2)
+                    for cmd2, _ in _check_control().items():
+                        if cmd2 == "resume":
+                            _paused = False
+                            click.echo("\n  \u25b6 Resumed by external supervisor")
+
                 item = futures[future]
                 _repo, _task, _cond, rep = item
                 try:
@@ -719,6 +794,12 @@ def _run_batch(batch: list, workers: int) -> list[TaskResult]:
                         refresh_budget_snapshot()
                         budget_warned = True
 
+                # Update status file for external supervisors
+                n_infra = sum(1 for r in results if r.error and r.error.startswith(Reporter.INFRA_ERROR_PREFIXES))
+                n_pass = sum(1 for r in results if r.success)
+                n_genuine = len(results) - n_pass - n_infra
+                _write_status(len(work_queue), len(results), n_infra, n_genuine, n_pass, _paused)
+
         return batch_results
 
     def _check_docker() -> bool:
@@ -758,7 +839,6 @@ def _cb_reset():
     # ── Supervisor loop ──────────────────────────────────────────────
     MAX_RETRY_ROUNDS = 2
     current_batch = work_queue
-    current_workers = parallel
 
     for supervisor_round in range(1 + MAX_RETRY_ROUNDS):
         if supervisor_round > 0:
@@ -766,7 +846,7 @@ def _cb_reset():
             click.echo(f"Supervisor retry round {supervisor_round}/{MAX_RETRY_ROUNDS}")
             click.echo(f"{'='*60}")
 
-        batch_results = _run_batch(current_batch, current_workers)
+        batch_results = _run_batch(current_batch, _current_workers)
 
         # Classify failures from this batch
         infra_results = [
@@ -801,7 +881,7 @@ def _cb_reset():
 
         # Diagnose and remediate
         has_docker_failures = any(
-            "[pre-validation]" in r.error or "Docker" in r.error
+            r.error and ("[pre-validation]" in r.error or "Docker" in r.error)
             for r in infra_results
         )
         if has_docker_failures:
@@ -812,8 +892,8 @@ def _cb_reset():
                     results.extend(infra_results)  # put them back
                     break
             # Reduce parallelism to ease Docker contention
-            current_workers = max(2, current_workers // 2)
-            click.echo(f"  Reducing parallelism to {current_workers} workers")
+            _current_workers = max(2, _current_workers // 2)
+            click.echo(f"  Reducing parallelism to {_current_workers} workers")
 
         # Reset circuit breaker for retry round
         _cb_reset()

From e23bc3b76b121b000710a3b79e93785e39d6ba4e Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 11:48:52 -0800
Subject: [PATCH 07/21] add per-task Fisher analysis, recommendations, and eval
 monitor

- Wire fisher_exact_test into reporter for per-task significance testing
- Add _compute_recommendations() flagging ceiling/floor/infra-only tasks
- Add Per-Task Analysis table and Recommendations section to markdown output
- Add lib/monitor.py: polling-based eval supervisor with stall detection,
  Docker recovery, infra-task skipping, and worker scaling
- 79 new tests across stats, reporter, and monitor modules

Entire-Checkpoint: 722abb53d525
---
 eval-harness/lib/monitor.py         | 363 ++++++++++++++++++++++++++++
 eval-harness/lib/reporter.py        | 146 ++++++++++-
 eval-harness/lib/stats.py           |  54 +++++
 eval-harness/tests/test_monitor.py  | 295 ++++++++++++++++++++++
 eval-harness/tests/test_reporter.py | 131 ++++++++++
 eval-harness/tests/test_stats.py    |  45 +++-
 6 files changed, 1032 insertions(+), 2 deletions(-)
 create mode 100644 eval-harness/lib/monitor.py
 create mode 100644 eval-harness/tests/test_monitor.py

diff --git a/eval-harness/lib/monitor.py b/eval-harness/lib/monitor.py
new file mode 100644
index 0000000..f30738a
--- /dev/null
+++ b/eval-harness/lib/monitor.py
@@ -0,0 +1,363 @@
+# lib/monitor.py
+"""Eval monitor: automated eval supervisor.
+
+Polls .eval-status.json and issues commands via .eval-control/ to manage
+a running eval. All decisions are rule-based — no LLM reasoning needed.
+
+Usage:
+    python lib/monitor.py --results-dir results/
+    python lib/monitor.py --results-dir results/ --poll-interval 30
+"""
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+log = logging.getLogger("monitor")
+
+
+@dataclass
+class MonitorConfig:
+    results_dir: Path  # directory holding .eval-status.json, .eval-control/ and in-progress-*.json
+    poll_interval: int = 15  # seconds between status checks
+    stall_timeout: int = 300  # 5 min without progress → stalled
+    infra_skip_threshold: int = 3  # skip task after N infra-only failures
+    budget_pause_pct: float = 0.10  # intended: pause when < 10% budget remaining. NOTE(review): no policy in this module reads it
+    max_workers_recovery: int = 4  # worker count requested via set-workers once infra_rate < 10%
+
+
+@dataclass
+class MonitorState:
+    """Tracks the monitor's view of the eval run across poll cycles."""
+    last_completed: int = 0  # highest "completed" count seen in a status snapshot
+    last_progress_time: float = field(default_factory=time.time)  # when progress was last seen (also reset after a Docker restart)
+    task_infra_counts: dict[str, int] = field(default_factory=dict)  # NOTE(review): never read or written in this module
+    skipped_tasks: set[str] = field(default_factory=set)  # task ids for which a skip-task command was issued
+    actions_taken: list[dict] = field(default_factory=list)  # executed actions; dumped to .monitor-log.json by run_loop
+    docker_restarts: int = 0  # number of successful Docker restarts
+    paused_by_monitor: bool = False  # True while a pause sent by this monitor is outstanding
+
+
+def read_status(results_dir: Path) -> dict | None:
+    """Load the eval's .eval-status.json snapshot.
+
+    Returns the parsed JSON, or None when the file is absent, unreadable,
+    or not valid JSON.
+    """
+    try:
+        return json.loads((results_dir / ".eval-status.json").read_text())
+    except (OSError, json.JSONDecodeError):
+        return None
+
+
+def send_command(results_dir: Path, command: str, value: str = "") -> bool:
+    """Atomically publish a command file in .eval-control/; return False if the write fails."""
+    staged = results_dir / ".eval-control" / f".{command}.tmp"  # dot-prefixed names are skipped by the eval's reader
+    staged.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        staged.write_text(value)
+        staged.replace(staged.with_name(command))  # rename into place so a half-written command is never consumed
+        return True
+    except IOError as e:
+        log.error("Failed to write command %s: %s", command, e)
+        return False
+
+
+def read_checkpoint(results_dir: Path) -> dict | None:
+    """Read the in-progress-*.json checkpoint whose filename sorts last; None if absent or corrupt."""
+    checkpoints = sorted(results_dir.glob("in-progress-*.json"), reverse=True)  # "latest" = filename order, not mtime
+    if not checkpoints:
+        return None
+    try:
+        with open(checkpoints[0]) as f:
+            return json.load(f)
+    except (json.JSONDecodeError, IOError):
+        return None
+
+
+def count_task_infra_failures(checkpoint: dict) -> dict[str, int]:
+    """Count infra failures for tasks whose every recorded run failed on infra.
+
+    A run is an infra failure when its "error" starts with "[infrastructure]"
+    or "[pre-validation]"; a missing or null "error" counts as a normal run.
+    A condition without a non-empty "runs" list is treated as one single run.
+
+    Returns {task_id: n_infra_runs} for infra-only tasks; other tasks are omitted.
+    """
+    infra_prefixes = ("[infrastructure]", "[pre-validation]")
+    counts: dict[str, int] = {}
+    for task_result in checkpoint.get("results", []):
+        task_id = task_result["task_id"]
+        total_infra = 0
+        total_runs = 0
+        for cond in ("none", "flat_llm", "intent_layer"):
+            cond_data = task_result.get(cond)
+            if not cond_data:
+                continue
+            # Multi-run format nests attempts under "runs"; otherwise the
+            # condition dict itself is the single run.
+            for run in cond_data.get("runs") or [cond_data]:
+                total_runs += 1
+                # `or ""` guards against an explicit null error in the JSON.
+                if (run.get("error") or "").startswith(infra_prefixes):
+                    total_infra += 1
+        # Only count if ALL runs are infra failures
+        if total_runs > 0 and total_infra == total_runs:
+            counts[task_id] = total_infra
+    return counts
+
+
+def check_docker() -> bool:
+    """Return True if `docker info` exits 0 within 10 s; False on non-zero exit, timeout, or missing binary."""
+    import subprocess
+    try:
+        result = subprocess.run(
+            ["docker", "info"],
+            capture_output=True, timeout=10,
+        )
+        return result.returncode == 0
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        return False
+
+
+def restart_docker() -> bool:
+    """Launch OrbStack via macOS `open -a`, then poll check_docker(); True once Docker responds."""
+    import subprocess
+    try:
+        subprocess.run(["open", "-a", "OrbStack"], timeout=10)  # exit status of `open` is not inspected
+        # Poll until Docker answers or the attempts run out
+        for _ in range(12):  # 12 polls with a 5 s sleep each (check_docker may add up to 10 s per poll)
+            time.sleep(5)
+            if check_docker():
+                return True
+        return False
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        return False
+
+
+def evaluate_policies(
+    status: dict,
+    state: MonitorState,
+    config: MonitorConfig,
+) -> list[dict]:
+    """Apply policy rules (updating state's progress markers) and return the actions to take.
+
+    Every action has "type" and "reason"; skip_task and set_workers actions also carry "command" and "value".
+    """
+    actions: list[dict] = []
+    now = time.time()
+
+    completed = status.get("completed", 0)
+    remaining = status.get("remaining", 0)
+    infra_rate = status.get("infra_rate", 0)
+    paused = status.get("paused", False)
+
+    # ── Stall detection ──────────────────────────────────────────
+    if completed > state.last_completed:
+        state.last_completed = completed
+        state.last_progress_time = now
+    elif remaining > 0 and not paused:
+        stall_seconds = now - state.last_progress_time
+        if stall_seconds > config.stall_timeout:
+            # Check if Docker is the problem (runs `docker info` via check_docker)
+            if not check_docker():
+                actions.append({
+                    "type": "docker_restart",
+                    "reason": f"No progress for {stall_seconds:.0f}s, Docker unresponsive",
+                })
+            else:
+                actions.append({
+                    "type": "log_warning",
+                    "reason": f"No progress for {stall_seconds:.0f}s but Docker is healthy. Eval may be stuck.",
+                })
+
+    # ── Infra-heavy task skipping ────────────────────────────────
+    checkpoint = read_checkpoint(config.results_dir)
+    if checkpoint:
+        task_infra = count_task_infra_failures(checkpoint)
+        for task_id, count in task_infra.items():  # only tasks whose every run was an infra failure appear here
+            if (count >= config.infra_skip_threshold
+                    and task_id not in state.skipped_tasks):
+                actions.append({
+                    "type": "skip_task",
+                    "command": "skip-task",
+                    "value": task_id,
+                    "reason": f"Task {task_id}: {count} consecutive infra failures",
+                })
+
+    # ── Worker recovery ──────────────────────────────────────────
+    workers = status.get("workers", 0)
+    if workers < config.max_workers_recovery and infra_rate < 0.1 and remaining > 5:
+        actions.append({
+            "type": "set_workers",
+            "command": "set-workers",
+            "value": str(config.max_workers_recovery),
+            "reason": f"Infra rate low ({infra_rate:.0%}), recovering workers {workers} → {config.max_workers_recovery}",
+        })
+
+    # ── Eval complete ────────────────────────────────────────────
+    if remaining == 0 and completed > 0:
+        actions.append({
+            "type": "eval_complete",
+            "reason": f"Eval finished: {completed} runs, pass rate {status.get('pass_rate', 0):.0%}",
+        })
+
+    return actions
+
+
+def execute_actions(
+    actions: list[dict],
+    state: MonitorState,
+    config: MonitorConfig,
+) -> None:
+    """Execute actions from evaluate_policies: send control commands, update state, log each to state.actions_taken."""
+    for action in actions:
+        action_type = action["type"]
+        reason = action["reason"]
+        timestamp = time.strftime("%H:%M:%S")
+
+        if action_type == "docker_restart":
+            log.warning("[%s] %s — attempting Docker restart", timestamp, reason)
+            if restart_docker():
+                state.docker_restarts += 1
+                log.info("[%s] Docker restarted successfully (#%d)", timestamp, state.docker_restarts)
+                # Resume if we paused
+                if state.paused_by_monitor:
+                    send_command(config.results_dir, "resume")
+                    state.paused_by_monitor = False
+                state.last_progress_time = time.time()  # restart the stall clock after recovery
+            else:
+                log.error("[%s] Docker restart failed", timestamp)
+                # Pause the eval to prevent wasting budget
+                send_command(config.results_dir, "pause")
+                state.paused_by_monitor = True
+
+        elif action_type == "skip_task":
+            task_id = action["value"]
+            log.warning("[%s] SKIP %s — %s", timestamp, task_id, reason)
+            send_command(config.results_dir, "skip-task", task_id)
+            state.skipped_tasks.add(task_id)  # recorded even if send_command returned False
+
+        elif action_type == "set_workers":
+            new_workers = action["value"]
+            log.info("[%s] SET-WORKERS %s — %s", timestamp, new_workers, reason)
+            send_command(config.results_dir, "set-workers", new_workers)
+
+        elif action_type == "budget_pause":  # NOTE(review): evaluate_policies in this module never emits this type
+            log.warning("[%s] PAUSE — %s", timestamp, reason)
+            send_command(config.results_dir, "pause")
+            state.paused_by_monitor = True
+
+        elif action_type == "eval_complete":
+            log.info("[%s] COMPLETE — %s", timestamp, reason)
+
+        elif action_type == "log_warning":
+            log.warning("[%s] %s", timestamp, reason)
+
+        state.actions_taken.append({
+            "time": timestamp,
+            **action,
+        })
+
+
+def run_loop(config: MonitorConfig) -> MonitorState:
+    """Poll status until the eval completes or Ctrl-C, then write .monitor-log.json and return the state."""
+    state = MonitorState()
+    log.info(
+        "Eval monitor started — watching %s (poll every %ds)",
+        config.results_dir, config.poll_interval,
+    )
+
+    try:
+        while True:
+            status = read_status(config.results_dir)
+            if status is None:
+                log.debug("No status file yet, waiting...")
+                time.sleep(config.poll_interval)
+                continue
+
+            remaining = status.get("remaining", 0)
+            completed = status.get("completed", 0)
+            log.info(
+                "Status: %d/%d completed, %d remaining, workers=%d, infra_rate=%.0f%%",
+                completed, completed + remaining, remaining,
+                status.get("workers", 0),
+                status.get("infra_rate", 0) * 100,
+            )
+
+            actions = evaluate_policies(status, state, config)
+            if actions:
+                execute_actions(actions, state, config)
+
+            # Stop if eval is done (same condition that produces the eval_complete action)
+            if remaining == 0 and completed > 0:
+                log.info("Eval complete. Monitor shutting down.")
+                break
+
+            time.sleep(config.poll_interval)
+
+    except KeyboardInterrupt:
+        log.info("Monitor interrupted by user.")
+
+    # Write action log (reached on both normal completion and Ctrl-C)
+    log_path = config.results_dir / ".monitor-log.json"
+    with open(log_path, "w") as f:
+        json.dump(state.actions_taken, f, indent=2)
+    log.info("Action log written to %s (%d actions)", log_path, len(state.actions_taken))
+
+    return state
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Eval monitor: automated eval supervisor",
+    )
+    parser.add_argument(
+        "--results-dir", required=True, type=Path,
+        help="Path to eval results directory (contains .eval-status.json)",
+    )
+    parser.add_argument(
+        "--poll-interval", type=int, default=15,
+        help="Seconds between status checks (default: 15)",
+    )
+    parser.add_argument(
+        "--stall-timeout", type=int, default=300,
+        help="Seconds without progress before stall alert (default: 300)",
+    )
+    parser.add_argument(
+        "--infra-skip-threshold", type=int, default=3,
+        help="Skip task after N infra-only failures (default: 3)",
+    )
+    parser.add_argument(
+        "--max-workers", type=int, default=4,
+        help="Worker count to recover to after infra stabilizes (default: 4)",
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true",
+        help="Enable debug logging",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s [monitor] %(levelname)s %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    config = MonitorConfig(
+        results_dir=args.results_dir,
+        poll_interval=args.poll_interval,
+        stall_timeout=args.stall_timeout,
+        infra_skip_threshold=args.infra_skip_threshold,
+        max_workers_recovery=args.max_workers,
+    )
+    run_loop(config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval-harness/lib/reporter.py b/eval-harness/lib/reporter.py
index 9b307c0..8d452e1 100644
--- a/eval-harness/lib/reporter.py
+++ b/eval-harness/lib/reporter.py
@@ -8,7 +8,7 @@
 from typing import Any
 
 from lib.task_runner import TaskResult, Condition
-from lib.stats import wilson_score_interval, ci_overlap, mcnemar_test
+from lib.stats import wilson_score_interval, ci_overlap, mcnemar_test, fisher_exact_test
 from lib.budget import fmt_tokens
 
 
@@ -372,6 +372,12 @@ def median_tokens(task_results: list[TaskResult]) -> int:
                 if mcnemar_entry and mcnemar_entry["n_discordant"] > 0:
                     summary[f"{treatment}_vs_none_significant"] = mcnemar_entry["p_value"] < 0.05
 
+        # Per-task Fisher's exact tests + recommendations
+        if has_multi_run:
+            per_task_fisher = self._compute_per_task_fisher(results)
+            summary["per_task_fisher"] = per_task_fisher
+            summary["recommendations"] = self._compute_recommendations(per_task_fisher, results)
+
         return summary
 
     def _compute_mcnemar(self, results: list[TaskResult]) -> dict:
@@ -422,6 +428,99 @@ def _compute_mcnemar(self, results: list[TaskResult]) -> dict:
 
         return mcnemar_results
 
+    def _compute_per_task_fisher(self, results: list[TaskResult]) -> list[dict]:
+        """Run Fisher's exact test per task for each condition pair.
+
+        Unlike McNemar (which pools all tasks for paired analysis), Fisher
+        tests each task independently — useful for identifying which specific
+        tasks drive the aggregate signal and for flagging task quality issues.
+        """
+        # Group by task_id → condition → list of valid results (infra errors are dropped)
+        grouped: dict[str, dict[str, list[TaskResult]]] = {}
+        for r in results:
+            if self._is_infra_error(r):
+                continue
+            if r.task_id not in grouped:
+                grouped[r.task_id] = {}
+            cond = r.condition.value
+            if cond not in grouped[r.task_id]:
+                grouped[r.task_id][cond] = []
+            grouped[r.task_id][cond].append(r)
+
+        comparisons = [
+            ("none", "flat_llm"),
+            ("none", "intent_layer"),
+            ("flat_llm", "intent_layer"),
+        ]
+
+        per_task: list[dict] = []
+        for task_id, conditions in grouped.items():
+            task_entry: dict[str, Any] = {"task_id": task_id, "comparisons": {}}
+
+            for cond_a, cond_b in comparisons:
+                a_runs = conditions.get(cond_a, [])
+                b_runs = conditions.get(cond_b, [])
+                if not a_runs or not b_runs:
+                    continue
+
+                a_pass = sum(1 for r in a_runs if r.success)
+                b_pass = sum(1 for r in b_runs if r.success)
+                result = fisher_exact_test(a_pass, len(a_runs), b_pass, len(b_runs))
+                task_entry["comparisons"][f"{cond_a}_vs_{cond_b}"] = result
+
+            # Task quality flags (pooled over every condition's valid runs)
+            all_runs = [r for runs in conditions.values() for r in runs]
+            total_pass = sum(1 for r in all_runs if r.success)
+            task_entry["total_runs"] = len(all_runs)
+            task_entry["total_pass"] = total_pass
+            task_entry["pass_rate"] = round(total_pass / len(all_runs), 2) if all_runs else 0
+
+            # Ceiling: every valid run passed (needs >= 3 runs) → no discriminative power
+            task_entry["ceiling_effected"] = total_pass == len(all_runs) and len(all_runs) >= 3
+
+            # Floor: no valid run passed (needs >= 3 runs) → task may be broken or too hard
+            task_entry["floor_effected"] = total_pass == 0 and len(all_runs) >= 3
+
+            per_task.append(task_entry)
+
+        return per_task
+
+    def _compute_recommendations(self, per_task_fisher: list[dict], results: list[TaskResult]) -> list[str]:
+        """Turn the per-task Fisher entries into a list of markdown recommendation strings."""
+        recs: list[str] = []
+
+        # Task ids present in results but absent from the per-task entries had no valid runs
+        analysed_ids = {entry["task_id"] for entry in per_task_fisher}
+        for tid in sorted({r.task_id for r in results} - analysed_ids):
+            recs.append(f"**{tid}**: all runs were infrastructure errors. Check Docker setup and pre-validation.")
+
+        for entry in per_task_fisher:
+            tid = entry["task_id"]
+            # Quality flags first, then any notable pairwise comparison
+            if entry["ceiling_effected"]:
+                recs.append(
+                    f"**{tid}**: ceiling-effected ({entry['total_pass']}/{entry['total_runs']} pass). "
+                    f"No discriminative power — consider replacing with a harder task."
+                )
+            if entry["floor_effected"]:
+                recs.append(
+                    f"**{tid}**: floor-effected (0/{entry['total_runs']} pass). "
+                    f"All conditions fail — task may be too hard or misconfigured."
+                )
+
+            # Report a comparison only when p < 0.10 and the rates differ by >= 30 points;
+            # the "+" format flag writes the explicit sign of the difference.
+            for label, stats in entry["comparisons"].items():
+                if not (stats["p_value"] < 0.10 and abs(stats["rate_diff"]) >= 0.3):
+                    continue
+                recs.append(
+                    f"**{tid}** ({label.replace('_vs_', ' vs ')}): "
+                    f"{stats['rate_diff']:+.0%} rate difference "
+                    f"(p={stats['p_value']:.3f}). Worth deeper investigation."
+                )
+
+        return recs
+
     def write_checkpoint(
         self,
         results: list['TaskResult'],
@@ -729,6 +828,51 @@ def write_markdown(self, results: EvalResults) -> str:
                     f"{data['p_value']:.3f} | {sig} |"
                 )
 
+        # Per-Task Fisher's Exact Test section
+        per_task_fisher = summary.get("per_task_fisher", [])
+        tasks_with_comparisons = [t for t in per_task_fisher if t["comparisons"]]
+        if tasks_with_comparisons:
+            lines += [
+                "",
+                "",
+                "## Per-Task Analysis (Fisher's Exact Test)",
+                "",
+                "| Task | Comparison | A rate | B rate | Diff | p-value | Sig. |",
+                "|------|------------|--------|--------|------|---------|------|",
+            ]
+            for t in tasks_with_comparisons:
+                tid = t["task_id"]
+                for comp_key, comp in t["comparisons"].items():
+                    label = comp_key.replace("_vs_", " vs ")
+                    sig = "*" if comp["p_value"] < 0.05 else ("~" if comp["p_value"] < 0.10 else "")
+                    lines.append(
+                        f"| {tid} | {label} | {comp['a_rate']:.0%} | "
+                        f"{comp['b_rate']:.0%} | {comp['rate_diff']:+.0%} | "
+                        f"{comp['p_value']:.3f} | {sig} |"
+                    )
+
+            # Quality flags
+            ceiling = [t for t in per_task_fisher if t["ceiling_effected"]]
+            floor = [t for t in per_task_fisher if t["floor_effected"]]
+            if ceiling or floor:
+                lines.append("")
+                for t in ceiling:
+                    lines.append(f"- **{t['task_id']}**: ceiling-effected (100% all conditions)")
+                for t in floor:
+                    lines.append(f"- **{t['task_id']}**: floor-effected (0% all conditions)")
+
+        # Recommendations section
+        recs = summary.get("recommendations", [])
+        if recs:
+            lines += [
+                "",
+                "",
+                "## Recommendations",
+                "",
+            ]
+            for rec in recs:
+                lines.append(f"- {rec}")
+
         # Budget Impact section (when budget data is available)
         if results.budget:
             try:
diff --git a/eval-harness/lib/stats.py b/eval-harness/lib/stats.py
index 750a70c..cefb84e 100644
--- a/eval-harness/lib/stats.py
+++ b/eval-harness/lib/stats.py
@@ -132,3 +132,57 @@ def mcnemar_test(b: int, c: int) -> dict:
     p_value = min(p_value * 2, 1.0)  # two-sided
 
     return {"p_value": p_value, "n_discordant": n, "a_wins": b, "b_wins": c}
+
+
+def fisher_exact_test(a_pass: int, a_total: int, b_pass: int, b_total: int) -> dict:
+    """Fisher's exact test for 2x2 contingency table (two-sided).
+
+    Compares pass rates between two independent groups using the
+    hypergeometric distribution to get an exact p-value without scipy.
+
+    Args:
+        a_pass, a_total, b_pass, b_total: successes and total trials for groups A and B
+
+    Returns:
+        dict with p_value, a_rate, b_rate, rate_diff (b_rate - a_rate)
+
+    Raises:
+        ValueError: if a pass count is negative or exceeds its total
+    """
+    if min(a_pass, b_pass, a_total - a_pass, b_total - b_pass) < 0:
+        raise ValueError("pass counts must satisfy 0 <= pass <= total")
+    n = a_total + b_total
+    row1 = a_pass + b_pass  # total passes
+
+    if n == 0:
+        return {"p_value": 1.0, "a_rate": 0.0, "b_rate": 0.0, "rate_diff": 0.0}
+
+    # Probability of a specific table under H0 (hypergeometric)
+    def table_prob(a_p: int) -> float:
+        b_p = row1 - a_p
+        a_f = a_total - a_p
+        b_f = b_total - b_p
+        if any(x < 0 for x in (a_p, b_p, a_f, b_f)):
+            return 0.0
+        return (
+            math.comb(a_total, a_p)
+            * math.comb(b_total, b_p)
+            / math.comb(n, row1)
+        )
+
+    # Two-sided: sum probabilities of tables as extreme or more extreme
+    observed_prob = table_prob(a_pass)
+    p_value = 0.0
+    for i in range(max(0, row1 - b_total), min(row1, a_total) + 1):
+        prob = table_prob(i)
+        if prob <= observed_prob + 1e-12:  # tolerance for float comparison
+            p_value += prob
+
+    a_rate = a_pass / a_total if a_total else 0.0
+    b_rate = b_pass / b_total if b_total else 0.0
+    return {
+        "p_value": round(min(p_value, 1.0), 4),
+        "a_rate": round(a_rate, 4),
+        "b_rate": round(b_rate, 4),
+        "rate_diff": round(b_rate - a_rate, 4),
+    }
diff --git a/eval-harness/tests/test_monitor.py b/eval-harness/tests/test_monitor.py
new file mode 100644
index 0000000..8fcd5d0
--- /dev/null
+++ b/eval-harness/tests/test_monitor.py
@@ -0,0 +1,295 @@
+# tests/test_monitor.py
+import json
+import tempfile
+import time
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+from lib.monitor import (
+    MonitorConfig,
+    MonitorState,
+    read_status,
+    send_command,
+    count_task_infra_failures,
+    evaluate_policies,
+    execute_actions,
+)
+
+
+@pytest.fixture
+def results_dir():
+    with tempfile.TemporaryDirectory() as d:
+        yield Path(d)
+
+
+def _write_status(results_dir: Path, **overrides):
+    """Write a .eval-status.json with sensible defaults."""
+    status = {
+        "eval_id": "test-eval",
+        "timestamp": "2026-02-24T12:00:00",
+        "uptime_seconds": 600,
+        "workers": 4,
+        "paused": False,
+        "batch_total": 60,
+        "completed": 30,
+        "passes": 20,
+        "genuine_failures": 5,
+        "infra_failures": 5,
+        "remaining": 30,
+        "pass_rate": 0.8,
+        "infra_rate": 0.167,
+        **overrides,
+    }
+    path = results_dir / ".eval-status.json"
+    path.write_text(json.dumps(status))
+    return status
+
+
+def _write_checkpoint(results_dir: Path, results: list[dict]):
+    """Write a minimal checkpoint file."""
+    checkpoint = {
+        "eval_id": "test-eval",
+        "checkpoint": True,
+        "completed_runs": len(results),
+        "results": results,
+    }
+    path = results_dir / "in-progress-test-eval.json"
+    path.write_text(json.dumps(checkpoint))
+
+
+class TestReadStatus:
+    def test_reads_valid_status(self, results_dir):
+        _write_status(results_dir, completed=42)
+        status = read_status(results_dir)
+        assert status["completed"] == 42
+
+    def test_returns_none_when_missing(self, results_dir):
+        assert read_status(results_dir) is None
+
+    def test_returns_none_on_corrupt_json(self, results_dir):
+        (results_dir / ".eval-status.json").write_text("not json{{{")
+        assert read_status(results_dir) is None
+
+
+class TestSendCommand:
+    def test_creates_command_file(self, results_dir):
+        send_command(results_dir, "pause")
+        cmd_path = results_dir / ".eval-control" / "pause"
+        assert cmd_path.exists()
+        assert cmd_path.read_text() == ""
+
+    def test_creates_command_with_value(self, results_dir):
+        send_command(results_dir, "skip-task", "fix-broken-task")
+        cmd_path = results_dir / ".eval-control" / "skip-task"
+        assert cmd_path.read_text() == "fix-broken-task"
+
+    def test_creates_control_dir(self, results_dir):
+        send_command(results_dir, "resume")
+        assert (results_dir / ".eval-control").is_dir()
+
+
+class TestCountTaskInfraFailures:
+    def test_counts_all_infra_task(self):
+        checkpoint = {"results": [{
+            "task_id": "task-broken",
+            "none": {"runs": [
+                {"error": "[infrastructure] Docker setup timed out"},
+                {"error": "[pre-validation] test failed"},
+            ]},
+            "flat_llm": {"runs": [
+                {"error": "[infrastructure] Docker setup timed out"},
+            ]},
+        }]}
+        counts = count_task_infra_failures(checkpoint)
+        assert counts["task-broken"] == 3
+
+    def test_ignores_mixed_task(self):
+        """Task with some genuine failures is NOT counted as all-infra."""
+        checkpoint = {"results": [{
+            "task_id": "task-mixed",
+            "none": {"runs": [
+                {"error": "[infrastructure] Docker setup timed out"},
+                {"success": True},  # genuine result
+            ]},
+        }]}
+        counts = count_task_infra_failures(checkpoint)
+        assert "task-mixed" not in counts
+
+    def test_empty_results(self):
+        assert count_task_infra_failures({"results": []}) == {}
+
+
+class TestEvaluatePolicies:
+    def _config(self, results_dir):
+        return MonitorConfig(results_dir=results_dir)
+
+    def test_detects_progress(self, results_dir):
+        """Completed count advancing resets stall timer."""
+        config = self._config(results_dir)
+        state = MonitorState(last_completed=10)
+        status = {"completed": 15, "remaining": 45, "workers": 4,
+                  "infra_rate": 0.1, "paused": False}
+        actions = evaluate_policies(status, state, config)
+
+        assert state.last_completed == 15
+        # No stall warning since progress was made
+        assert not any(a["type"] == "log_warning" for a in actions)
+
+    @patch("lib.monitor.check_docker", return_value=False)
+    def test_stall_triggers_docker_check(self, mock_docker, results_dir):
+        """No progress beyond stall_timeout triggers Docker check."""
+        config = self._config(results_dir)
+        config.stall_timeout = 10
+        state = MonitorState(
+            last_completed=10,
+            last_progress_time=time.time() - 60,  # 60s ago
+        )
+        status = {"completed": 10, "remaining": 50, "workers": 4,
+                  "infra_rate": 0.1, "paused": False}
+        actions = evaluate_policies(status, state, config)
+
+        assert any(a["type"] == "docker_restart" for a in actions)
+
+    @patch("lib.monitor.check_docker", return_value=True)
+    def test_stall_with_healthy_docker_warns(self, mock_docker, results_dir):
+        """Stall with healthy Docker logs warning instead of restart."""
+        config = self._config(results_dir)
+        config.stall_timeout = 10
+        state = MonitorState(
+            last_completed=10,
+            last_progress_time=time.time() - 60,
+        )
+        status = {"completed": 10, "remaining": 50, "workers": 4,
+                  "infra_rate": 0.1, "paused": False}
+        actions = evaluate_policies(status, state, config)
+
+        assert any(a["type"] == "log_warning" for a in actions)
+        assert not any(a["type"] == "docker_restart" for a in actions)
+
+    def test_skip_task_after_infra_threshold(self, results_dir):
+        """Task with all infra failures gets skipped."""
+        config = self._config(results_dir)
+        config.infra_skip_threshold = 2
+        # Write checkpoint with all-infra task
+        _write_checkpoint(results_dir, [{
+            "task_id": "task-broken",
+            "none": {"runs": [
+                {"error": "[infrastructure] fail"},
+                {"error": "[infrastructure] fail"},
+            ]},
+            "flat_llm": {"runs": [
+                {"error": "[infrastructure] fail"},
+            ]},
+        }])
+        state = MonitorState(last_completed=5)
+        status = {"completed": 5, "remaining": 55, "workers": 4,
+                  "infra_rate": 0.3, "paused": False}
+        actions = evaluate_policies(status, state, config)
+
+        skip_actions = [a for a in actions if a["type"] == "skip_task"]
+        assert len(skip_actions) == 1
+        assert skip_actions[0]["value"] == "task-broken"
+
+    def test_skip_task_idempotent(self, results_dir):
+        """Already-skipped task is not skipped again."""
+        config = self._config(results_dir)
+        config.infra_skip_threshold = 1
+        _write_checkpoint(results_dir, [{
+            "task_id": "task-broken",
+            "none": {"runs": [{"error": "[infrastructure] fail"}]},
+        }])
+        state = MonitorState(
+            last_completed=5,
+            skipped_tasks={"task-broken"},
+        )
+        status = {"completed": 5, "remaining": 55, "workers": 4,
+                  "infra_rate": 0.3, "paused": False}
+        actions = evaluate_policies(status, state, config)
+        assert not any(a["type"] == "skip_task" for a in actions)
+
+    def test_worker_recovery(self, results_dir):
+        """Low infra rate triggers worker recovery."""
+        config = self._config(results_dir)
+        config.max_workers_recovery = 8
+        state = MonitorState(last_completed=10)
+        status = {"completed": 10, "remaining": 50, "workers": 2,
+                  "infra_rate": 0.05, "paused": False}
+        actions = evaluate_policies(status, state, config)
+
+        worker_actions = [a for a in actions if a["type"] == "set_workers"]
+        assert len(worker_actions) == 1
+        assert worker_actions[0]["value"] == "8"
+
+    def test_no_worker_recovery_when_high_infra(self, results_dir):
+        """Don't bump workers when infra rate is high."""
+        config = self._config(results_dir)
+        state = MonitorState(last_completed=10)
+        status = {"completed": 10, "remaining": 50, "workers": 2,
+                  "infra_rate": 0.4, "paused": False}
+        actions = evaluate_policies(status, state, config)
+        assert not any(a["type"] == "set_workers" for a in actions)
+
+    def test_eval_complete(self, results_dir):
+        """Remaining=0 triggers eval_complete."""
+        config = self._config(results_dir)
+        state = MonitorState(last_completed=60)
+        status = {"completed": 60, "remaining": 0, "workers": 4,
+                  "infra_rate": 0.1, "pass_rate": 0.75, "paused": False}
+        actions = evaluate_policies(status, state, config)
+
+        assert any(a["type"] == "eval_complete" for a in actions)
+
+    def test_no_stall_when_paused(self, results_dir):
+        """Paused eval doesn't trigger stall detection."""
+        config = self._config(results_dir)
+        config.stall_timeout = 10
+        state = MonitorState(
+            last_completed=10,
+            last_progress_time=time.time() - 600,
+        )
+        status = {"completed": 10, "remaining": 50, "workers": 4,
+                  "infra_rate": 0.1, "paused": True}
+        actions = evaluate_policies(status, state, config)
+        assert not any(a["type"] in ("docker_restart", "log_warning") for a in actions)
+
+
+class TestExecuteActions:
+    def test_skip_task_sends_command(self, results_dir):
+        config = MonitorConfig(results_dir=results_dir)
+        state = MonitorState()
+        actions = [{
+            "type": "skip_task",
+            "command": "skip-task",
+            "value": "task-broken",
+            "reason": "test",
+        }]
+        execute_actions(actions, state, config)
+
+        assert "task-broken" in state.skipped_tasks
+        cmd_path = results_dir / ".eval-control" / "skip-task"
+        assert cmd_path.exists()
+        assert cmd_path.read_text() == "task-broken"
+
+    def test_set_workers_sends_command(self, results_dir):
+        config = MonitorConfig(results_dir=results_dir)
+        state = MonitorState()
+        actions = [{
+            "type": "set_workers",
+            "command": "set-workers",
+            "value": "8",
+            "reason": "test",
+        }]
+        execute_actions(actions, state, config)
+
+        cmd_path = results_dir / ".eval-control" / "set-workers"
+        assert cmd_path.exists()
+        assert cmd_path.read_text() == "8"
+
+    def test_actions_logged(self, results_dir):
+        config = MonitorConfig(results_dir=results_dir)
+        state = MonitorState()
+        actions = [{"type": "log_warning", "reason": "test warning"}]
+        execute_actions(actions, state, config)
+        assert len(state.actions_taken) == 1
+        assert state.actions_taken[0]["reason"] == "test warning"
diff --git a/eval-harness/tests/test_reporter.py b/eval-harness/tests/test_reporter.py
index fe819f7..7fd91e7 100644
--- a/eval-harness/tests/test_reporter.py
+++ b/eval-harness/tests/test_reporter.py
@@ -886,3 +886,134 @@ def test_mcnemar_markdown_output(tmp_path):
     assert "flat_llm vs none" in content
     assert "intent_layer vs none" in content
     assert "p-value" in content
+
+
+def _make_fisher_results():
+    """Create multi-task multi-rep results to test per-task Fisher analysis.
+
+    Task A: none 0/3 pass, flat 1/3, intent 3/3 (star result pattern)
+    Task B: none 3/3 pass, flat 3/3, intent 3/3 (ceiling-effected)
+    """
+    results = []
+    # Task A: clear signal
+    for rep in range(3):
+        results.append(TaskResult(
+            task_id="task-signal", condition=Condition.NONE, success=False,
+            test_output="FAIL", wall_clock_seconds=50.0, rep=rep,
+            input_tokens=2000, output_tokens=1000, tool_calls=10,
+            lines_changed=20, files_touched=["a.py"],
+        ))
+    for rep in range(3):
+        results.append(TaskResult(
+            task_id="task-signal", condition=Condition.FLAT_LLM,
+            success=(rep == 0),  # 1/3 pass
+            test_output="PASS" if rep == 0 else "FAIL",
+            wall_clock_seconds=50.0, rep=rep,
+            input_tokens=2000, output_tokens=1000, tool_calls=10,
+            lines_changed=20, files_touched=["a.py"],
+        ))
+    for rep in range(3):
+        results.append(TaskResult(
+            task_id="task-signal", condition=Condition.INTENT_LAYER,
+            success=True, test_output="PASS", wall_clock_seconds=50.0, rep=rep,
+            input_tokens=2000, output_tokens=1000, tool_calls=10,
+            lines_changed=20, files_touched=["a.py"],
+        ))
+
+    # Task B: ceiling-effected
+    for cond in (Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER):
+        for rep in range(3):
+            results.append(TaskResult(
+                task_id="task-ceiling", condition=cond, success=True,
+                test_output="PASS", wall_clock_seconds=50.0, rep=rep,
+                input_tokens=2000, output_tokens=1000, tool_calls=10,
+                lines_changed=20, files_touched=["a.py"],
+            ))
+    return results
+
+
+def test_per_task_fisher_in_summary():
+    """Per-task Fisher tests appear in summary for multi-run data."""
+    reporter = Reporter(output_dir="/tmp")
+    results = _make_fisher_results()
+    eval_results = reporter.compile_results(results)
+    summary = eval_results.summary
+
+    assert "per_task_fisher" in summary
+    fisher = summary["per_task_fisher"]
+    assert len(fisher) == 2
+
+    # Find task-signal entry
+    signal_task = next(t for t in fisher if t["task_id"] == "task-signal")
+    assert "none_vs_intent_layer" in signal_task["comparisons"]
+    comp = signal_task["comparisons"]["none_vs_intent_layer"]
+    assert comp["a_rate"] == 0.0
+    assert comp["b_rate"] == 1.0
+    assert comp["p_value"] <= 0.10  # borderline significant
+
+    # Ceiling-effected task
+    ceiling_task = next(t for t in fisher if t["task_id"] == "task-ceiling")
+    assert ceiling_task["ceiling_effected"] is True
+    assert ceiling_task["pass_rate"] == 1.0
+
+
+def test_recommendations_generated():
+    """Recommendations section flags ceiling-effected tasks and significant results."""
+    reporter = Reporter(output_dir="/tmp")
+    results = _make_fisher_results()
+    eval_results = reporter.compile_results(results)
+    recs = eval_results.summary.get("recommendations", [])
+
+    assert len(recs) >= 1
+    # Should flag ceiling-effected task
+    ceiling_recs = [r for r in recs if "ceiling-effected" in r]
+    assert len(ceiling_recs) == 1
+    assert "task-ceiling" in ceiling_recs[0]
+
+    # task-signal has p=0.10 exactly for none vs intent (3 reps per group),
+    # which is at the boundary of our p < 0.10 threshold — not flagged.
+    # With 5 reps (0/5 vs 5/5), it would be p=0.008 and clearly flagged.
+
+
+def test_recommendations_flags_infra_only_tasks():
+    """Tasks with only infra errors get flagged in recommendations."""
+    results = [
+        TaskResult(
+            task_id="task-broken", condition=Condition.NONE, success=False,
+            test_output="", wall_clock_seconds=0, rep=0,
+            input_tokens=0, output_tokens=0, tool_calls=0,
+            lines_changed=0, files_touched=[],
+            error="[infrastructure] Docker setup timed out",
+        ),
+        TaskResult(
+            task_id="task-broken", condition=Condition.NONE, success=False,
+            test_output="", wall_clock_seconds=0, rep=1,
+            input_tokens=0, output_tokens=0, tool_calls=0,
+            lines_changed=0, files_touched=[],
+            error="[infrastructure] Docker setup timed out",
+        ),
+    ]
+    reporter = Reporter(output_dir="/tmp")
+    eval_results = reporter.compile_results(results)
+    recs = eval_results.summary.get("recommendations", [])
+
+    infra_recs = [r for r in recs if "task-broken" in r and "infrastructure" in r]
+    assert len(infra_recs) == 1
+
+
+def test_fisher_markdown_output(tmp_path):
+    """Per-Task Analysis and Recommendations sections appear in markdown."""
+    reporter = Reporter(output_dir=str(tmp_path))
+    results = _make_fisher_results()
+    eval_results = reporter.compile_results(results)
+    md_path = reporter.write_markdown(eval_results)
+
+    with open(md_path) as f:
+        content = f.read()
+
+    assert "## Per-Task Analysis (Fisher's Exact Test)" in content
+    assert "task-signal" in content
+    assert "none vs intent_layer" in content
+
+    assert "## Recommendations" in content
+    assert "ceiling-effected" in content
diff --git a/eval-harness/tests/test_stats.py b/eval-harness/tests/test_stats.py
index f33259f..24e544d 100644
--- a/eval-harness/tests/test_stats.py
+++ b/eval-harness/tests/test_stats.py
@@ -1,6 +1,6 @@
 # tests/test_stats.py
 import pytest
-from lib.stats import _inverse_normal_cdf, wilson_score_interval, ci_overlap, mcnemar_test
+from lib.stats import _inverse_normal_cdf, wilson_score_interval, ci_overlap, mcnemar_test, fisher_exact_test
 
 
 class TestInverseNormalCDF:
@@ -169,3 +169,46 @@ def test_mcnemar_single_pair(self):
         assert result["n_discordant"] == 1
         assert result["a_wins"] == 0
         assert result["b_wins"] == 1
+
+
+class TestFisherExactTest:
+    def test_identical_rates(self):
+        """Same pass rates — not significant."""
+        result = fisher_exact_test(3, 5, 3, 5)
+        assert result["p_value"] == 1.0
+        assert result["rate_diff"] == 0.0
+
+    def test_extreme_difference(self):
+        """0/5 vs 5/5 — highly significant."""
+        result = fisher_exact_test(0, 5, 5, 5)
+        assert result["p_value"] < 0.01
+        assert result["a_rate"] == 0.0
+        assert result["b_rate"] == 1.0
+        assert result["rate_diff"] == 1.0
+
+    def test_our_star_result(self):
+        """0/3 vs 3/3 — the ansible star result (two-sided Fisher p=0.10)."""
+        result = fisher_exact_test(0, 3, 3, 3)
+        assert result["p_value"] <= 0.10  # borderline significant
+        assert result["a_rate"] == 0.0
+        assert result["b_rate"] == 1.0
+
+    def test_moderate_difference(self):
+        """3/5 vs 1/5 — not enough power to detect."""
+        result = fisher_exact_test(3, 5, 1, 5)
+        assert result["p_value"] > 0.10
+        assert result["rate_diff"] < 0
+
+    def test_empty_groups(self):
+        """Both empty — p=1.0."""
+        result = fisher_exact_test(0, 0, 0, 0)
+        assert result["p_value"] == 1.0
+
+    def test_returns_all_fields(self):
+        """Verify all expected fields are present."""
+        result = fisher_exact_test(2, 5, 4, 5)
+        assert "p_value" in result
+        assert "a_rate" in result
+        assert "b_rate" in result
+        assert "rate_diff" in result
+        assert 0 <= result["p_value"] <= 1.0

From 8ca48a5cd0fcbfbba180b9c27166d5ab771d4e5d Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 21:07:28 -0800
Subject: [PATCH 08/21] add AGENTbench native adapter with security and perf
 fixes

- AGENTbench loader (HuggingFace dataset) and runner (Docker-based eval)
- run-agentbench CLI command with 4 conditions (none/flat/human/intent_layer)
- Dynamic condition discovery in reporter (no more hardcoded condition list)
- Path traversal protection in write_test_infrastructure
- Docker --network none for test isolation
- Thread-safe temp dirs (PID + thread ID)
- Checkpoint batching (every 10 results vs every 1)
- Monitor uses Reporter.INFRA_ERROR_PREFIXES (no drift)
- Set-based infra result filtering (replaces fragile list.remove)
- Empty dict all() guard in pre-validation

Entire-Checkpoint: 815f2403cb52
---
 ...026-02-24-agentbench-adapter-brainstorm.md | 172 +++++++
 ...-24-feat-agentbench-native-adapter-plan.md | 418 ++++++++++++++++
 eval-harness/lib/agentbench_loader.py         |  91 ++++
 eval-harness/lib/agentbench_runner.py         | 472 ++++++++++++++++++
 eval-harness/lib/cli.py                       | 456 ++++++++++++++++-
 eval-harness/lib/docker_runner.py             |   2 +-
 eval-harness/lib/monitor.py                   |  18 +-
 eval-harness/lib/reporter.py                  | 260 +++++-----
 eval-harness/lib/task_runner.py               |   1 +
 eval-harness/tests/test_agentbench.py         | 390 +++++++++++++++
 eval-harness/tests/test_reporter.py           |  26 +-
 eval-harness/tests/test_resume.py             |   7 +-
 eval-harness/tests/test_task_runner.py        |   3 +-
 13 files changed, 2134 insertions(+), 182 deletions(-)
 create mode 100644 eval-harness/docs/brainstorms/2026-02-24-agentbench-adapter-brainstorm.md
 create mode 100644 eval-harness/docs/plans/2026-02-24-feat-agentbench-native-adapter-plan.md
 create mode 100644 eval-harness/lib/agentbench_loader.py
 create mode 100644 eval-harness/lib/agentbench_runner.py
 create mode 100644 eval-harness/tests/test_agentbench.py

diff --git a/eval-harness/docs/brainstorms/2026-02-24-agentbench-adapter-brainstorm.md b/eval-harness/docs/brainstorms/2026-02-24-agentbench-adapter-brainstorm.md
new file mode 100644
index 0000000..529ae06
--- /dev/null
+++ b/eval-harness/docs/brainstorms/2026-02-24-agentbench-adapter-brainstorm.md
@@ -0,0 +1,172 @@
+# AGENTbench Native Adapter
+
+**Date**: 2026-02-24
+**Status**: Brainstorm complete, ready for planning
+
+## What we're building
+
+A native adapter in our eval harness that loads the AGENTbench paper's 138
+instances directly from HuggingFace and runs them with four conditions
+(none, flat_llm, human, intent_layer) using their test infrastructure but our
+resilience and statistics pipeline.
+
+## Why this approach
+
+The paper (arxiv 2602.11988v1) claims context files hurt agent performance.
+Our prior runs suggest flat context hurts but hierarchical context is neutral
+to helpful. To make that case credibly:
+
+- We need to run the **same tasks** they did, not cherry-picked ones
+- We need to use their **test evaluation** (both instance tests AND repo
+  regression checks), not just exit-code pass/fail
+- We need the **statistical rigor** they lack: multiple reps, paired tests,
+  per-task significance testing
+
+Running 138 instances x 4 conditions at their fidelity, with our stats, is
+the strongest possible response to the paper.
+
+## Key decisions
+
+### 1. Use their test infrastructure verbatim
+
+Their instances include pre-built Python test runner scripts
+(`repo_test_runner`, `test_file_runner`) that emit structured JSON results.
+We'll execute these inside Docker exactly as they do, rather than converting
+to our shell-command-and-exit-code approach. This eliminates the "different
+test criteria" objection entirely.
+
+### 2. Use their problem descriptions
+
+Their task text is curated markdown extracted from PRs. We'll pass it to
+Claude verbatim rather than regenerating from commit messages or issues.
+Same task statement = same experiment, just with an added condition.
+
+### 3. Use their Docker images on x86 cloud
+
+Their pre-built images (`tgloaguen/planbenchx86_*`) are x86-only. We'll run
+on EC2 (or similar x86 cloud) rather than fighting Rosetta/QEMU emulation
+on ARM Macs. This eliminates timing artifacts and architecture mismatches.
+
+Fallback: replicate their setup_commands on base images if their images go
+stale.
+
+### 4. Four conditions
+
+| Condition | Context source | Notes |
+|-----------|---------------|-------|
+| `none` | All .md + .github + docs stripped | Matches their baseline exactly |
+| `flat_llm` | Their cached generated files from `agentbench_traces.tar.zst` | Exact same files they tested — no regeneration variance |
+| `human` | Developer-written files from git history | Their `human_planner.py` approach: fetch from future commits after base_sha |
+| `intent_layer` | Hierarchical AGENTS.md generated at HEAD | See "Data leakage" section below |
+
+Adding `human` as a 4th condition is low effort (the instances include the
+developer's original files) and gives us a direct calibration point against
+their published numbers.
+
+### 5. Flat LLM uses their cached files
+
+Rather than regenerating with our prompt (which introduces variance), we use
+their pre-generated files from `agentbench_traces.tar.zst` → `extracted_plans.json`.
+This is the exact content they tested. Zero regeneration variance.
+
+### 6. Phased execution
+
+**Phase 1**: All 138 instances x 1 rep x 4 conditions = 552 runs.
+Replicates their experiment design plus our new conditions.
+Validates our infrastructure against their published numbers.
+
+**Phase 2**: Deep-dive on instances where intent_layer shows signal.
+3-5 reps on interesting tasks for Fisher per-task analysis.
+
+### 7. Doc stripping matches theirs
+
+They strip ALL markdown (`find . -name "*.md" -delete`) plus .github/ and
+docs/. We currently only strip context files. For this experiment, we match
+their stripping exactly — our `none` baseline must equal their `none`.
+
+### 8. Temperature and repetitions
+
+Phase 1 runs at temperature=0 for direct comparison to their n=1 numbers.
+Phase 2 runs at default temperature with 3-5 reps for variance estimation.
+Both are reported.
+
+### 9. Data leakage strategy for Intent Layer
+
+**Problem**: Intent Layer generated at HEAD contains knowledge derived from
+the entire git history, including the very fixes being tested. AGENTS.md
+might effectively teach Claude the answer.
+
+**The paper has the same issue**: Their `human_planner.py` fetches
+developer-written AGENTS.md/CLAUDE.md from future commits after `base_sha`.
+The developer who wrote those files had full knowledge of the codebase
+including the fixes. The paper doesn't acknowledge or control for this.
+
+**Our approach**:
+- **Phase 1**: Generate Intent Layer at HEAD (same temporal stance as their
+  human condition). Accept the leakage — if we beat their human condition
+  with the same leakage profile, the comparison is valid.
+- **Phase 2 robustness check**: Generate once per repo at a pre-dataset
+  cutoff commit (before any instance's base_sha). 12 generations instead
+  of 138. Tests whether results hold without fix-specific knowledge.
+
+This is defensible: we're comparing our approach to theirs under identical
+temporal conditions. If someone objects to leakage, the objection applies
+equally to their human condition.
+
+## What changes in our harness
+
+The adapter sits alongside the existing YAML-based task runner, not replacing
+it. Key new components:
+
+- **AgentbenchLoader**: Loads instances from HuggingFace dataset, converts to
+  internal representation
+- **AgentbenchEvaluator**: Runs their test scripts inside Docker, parses JSON
+  results, maps to our pass/fail + regression-check format
+- **Doc-strip mode**: Match their aggressive stripping (all .md, .github, docs/)
+- **Condition injector**: Four conditions — none strips everything, flat_llm
+  uses their cached files, human uses developer files from git, intent_layer
+  adds hierarchy
+
+Our existing infrastructure stays: circuit breaker, supervisor loop, monitor,
+checkpoint/resume, Fisher/McNemar stats pipeline.
+
+## Methodological alignment summary
+
+| Dimension | Theirs | Ours (with adapter) | Gap |
+|-----------|--------|---------------------|-----|
+| Tasks | 138 from 12 repos | Same 138 | None |
+| Docker env | Pre-built x86 images | Same images on x86 cloud | None |
+| Test eval | Instance + repo regression | Same scripts | None |
+| Problem text | Curated markdown | Same text | None |
+| Doc stripping | All .md + .github + docs | Same | None |
+| Flat LLM files | Generated + cached | Their cached files | None |
+| Human files | Developer files from git | Same approach | None |
+| Conditions | none, LLM, human | none, flat_llm, human, intent_layer | Superset |
+| Agent | Claude, Codex, Qwen, Gemini | Claude only | Narrower |
+| Temp/reps | temp=0, n=1 | Phase 1: temp=0 n=1; Phase 2: default, n=3-5 | Ours is superset |
+| Stats | Aggregate % only | Aggregate + Wilson CI + McNemar + Fisher per-task | Ours is superset |
+| Resilience | None (move on if it fails) | Circuit breaker + supervisor + monitor | Ours is superset |
+| IL temporal | N/A | Phase 1: HEAD; Phase 2: pre-cutoff robustness | Novel |
+
+## Resolved questions
+
+1. **x86 Docker images** → Run on EC2 or similar x86 cloud.
+2. **Flat LLM generation** → Use their cached files from traces archive.
+3. **Human condition** → Yes, include as 4th condition for calibration.
+4. **Data leakage** → Phase 1 at HEAD (matches their temporal stance for
+   human); Phase 2 robustness check at pre-dataset cutoff.
+
+## Open questions
+
+1. **Budget.** Phase 1: ~552 calls x ~$1.50 = ~$830. Phase 2: depends on
+   how many tasks show signal. Total likely $1000-1800.
+
+2. **Wall clock time.** At 1800s timeout, 4 workers, 552 runs: worst case
+   ~69 hours for phase 1 (552 x 1800s / 4). Realistic (not all timeout): ~16-24 hours.
+
+3. **EC2 instance type.** Need Docker + Claude API access. Probably
+   c5.xlarge or similar. Cost ~$0.17/hr x 24hr = ~$4.
+
+4. **Intent Layer quality per repo.** Some of the 12 repos may not benefit
+   from hierarchical context (small repos, flat structure). Worth checking
+   which repos are good IL candidates before running.
diff --git a/eval-harness/docs/plans/2026-02-24-feat-agentbench-native-adapter-plan.md b/eval-harness/docs/plans/2026-02-24-feat-agentbench-native-adapter-plan.md
new file mode 100644
index 0000000..0e0b2ad
--- /dev/null
+++ b/eval-harness/docs/plans/2026-02-24-feat-agentbench-native-adapter-plan.md
@@ -0,0 +1,418 @@
+---
+title: "feat: AGENTbench native adapter for 138-instance replication"
+type: feat
+date: 2026-02-24
+revised: 2026-02-24
+brainstorm: ../brainstorms/2026-02-24-agentbench-adapter-brainstorm.md
+---
+
+# feat: AGENTbench native adapter for 138-instance replication
+
+## Overview
+
+Build a native adapter that loads the AGENTbench paper's 138 instances from
+HuggingFace (`eth-sri/agentbench`) and runs them through our eval harness
+with 4 conditions (none, flat_llm, human, intent_layer). Uses their exact
+test infrastructure but our resilience and statistics pipeline.
+
+Goal: produce a publication-quality response to the paper's claim that
+context files hurt coding agent performance. Our thesis: flat context can
+hurt, but hierarchical context is neutral to helpful.
+
+## Problem statement
+
+Our current eval runs use hand-curated YAML task files for 3-4 repos. The
+paper tested 138 instances across 12 repos. To make a credible counter-claim
+we must run the *same tasks* with the *same evaluation criteria* and *better
+statistics*.
+
+The paper ran n=1 at temp=0 with no confidence intervals. A 2-4% aggregate
+difference could easily be noise. Our replication adds:
+- A 4th condition (hierarchical context) they didn't test
+- Multiple reps with Wilson CIs and paired tests (Phase 2)
+- Per-task Fisher analysis to find where context actually matters (Phase 2)
+
+## Architectural constraint: CC-in-CC
+
+We run Claude Code CLI inside Claude Code, piggybacking on the subscription.
+No direct API calls. Claude gets a prompt and works autonomously in a
+Docker-mounted workspace. This means:
+
+- We can't replicate their exact agent loop (they use direct API + custom tool calling)
+- We give Claude their `problem_description` as a prompt and let it work
+- This is more realistic (tests what a developer would actually do) but
+  our `none` baseline won't be byte-identical to theirs
+- Their published numbers are a reference point, not a target to match exactly
+
+## Dataset schema (from HuggingFace inspection)
+
+Each of the 138 instances has:
+
+```
+instance_id:          "ansible_ansible-83217"
+repo:                 "ansible_ansible"
+base_repo:            "ansible/ansible"
+base_sha:             "fb7fd51b..."
+docker_image:         "tgloaguen/planbenchx86_ansible_ansible:latest"
+problem_description:  markdown description of the bug
+setup_commands:       ["python3 -m venv .venv", "pip install -e .", ...]
+test_file_names:      ["test/units/modules/test_debconf_empty_password.py"]
+test_file_contents:   ["# test source code..."]
+test_commands:        ["python run_pr_tests.py"]
+test_file_runner:     "#!/usr/bin/env python3\n..."  # writes pr_test_results.json
+repo_test_runner:     "#!/usr/bin/env python3\n..."  # writes test_results.json
+repo_test_commands:   ["source .venv/bin/activate", "python run_tests.py test/units"]
+repo_test_after_pr_patch: '{"test/foo.py::test_bar": true, ...}'  # JSON string
+clean_pr_patch:       "diff --git a/..."  # the actual fix (for reference only)
+```
+
+Test runner output format (both runners):
+```json
+{"test/path/test_foo.py::test_bar": true, "test/path/test_baz.py::test_qux": false}
+```
+Dict of `node_id -> passed`. Both runners write JSON to workspace root
+(`pr_test_results.json` and `test_results.json`).
+
+12 unique Docker images (one per repo), all x86: `tgloaguen/planbenchx86_*`.
+
+## Proposed solution
+
+Three new files + modifications to three existing files. All AGENTbench-specific
+code stays self-contained. The existing harness is touched minimally.
+
+### Design principles (from review)
+
+1. **Keep AGENTbench code contained** -- don't scatter across models.py,
+   prompt_builder.py, task_runner.py. New code goes in new files.
+2. **Reuse existing `TaskResult`** -- encode two-tier results in `success` +
+   `test_output`, not new dataclass fields. Use `error` prefix for partial
+   failures.
+3. **Make reporter condition-agnostic** -- iterate over whatever conditions
+   appear in results, not hardcoded lists. Adding HUMAN should be a 1-line
+   enum change, not a 17-location edit.
+4. **Extract supervisor loop** -- both `run` and `run-agentbench` call the
+   same `_run_supervisor()`, avoiding 400 lines of duplication.
+5. **No stats machinery for Phase 1** -- McNemar and Fisher are meaningless
+   at n=1. Defer to Phase 2 when we have reps.
+
+## Implementation
+
+### Phase A: Get one instance running end-to-end
+
+#### A1. Create `lib/agentbench_loader.py` (~50 lines)
+
+```python
+@dataclass(frozen=True)
+class AgentbenchInstance:
+    instance_id: str
+    repo: str                    # "ansible_ansible"
+    base_repo: str               # "ansible/ansible"
+    base_sha: str
+    docker_image: str
+    problem_description: str
+    setup_commands: list[str]
+    test_files: list[tuple[str, str]]  # (path, content) pairs — zipped on load
+    test_commands: list[str]
+    test_file_runner: str
+    repo_test_runner: str
+    repo_test_commands: list[str]
+    repo_test_after_pr_patch: dict[str, bool]
+    clean_pr_patch: str | None = None
+
+
+def load_instances(
+    filter_repo: str | None = None,
+    filter_ids: list[str] | None = None,
+) -> list[AgentbenchInstance]:
+    """Load from HuggingFace, zip test_file_names+contents, parse JSON fields."""
+```
+
+Key details:
+- `test_file_names` and `test_file_contents` are zipped into `test_files`
+  on load, making the coupling structural (not parallel arrays)
+- `repo_test_after_pr_patch` is parsed from JSON string to dict on load
+- Validates `len(test_file_names) == len(test_file_contents)` on load
+- Frozen because instances are immutable; multiple workers share them
+
+#### A2. Create `lib/agentbench_runner.py` (~150 lines)
+
+Single file containing all AGENTbench-specific execution logic:
+
+```python
+def strip_agentbench_docs(workspace: Path) -> None:
+    """Delete ALL .md files, .github/, docs/, .claude/, .cursor/, .codex/."""
+
+def inject_human_context(workspace: Path, repo_url: str, default_branch: str) -> list[str]:
+    """Fetch developer context files from repo HEAD. Returns list of files injected."""
+
+def write_test_infrastructure(workspace: Path, instance: AgentbenchInstance) -> None:
+    """Write test_files, test_file_runner, repo_test_runner to workspace."""
+
+def evaluate_instance(
+    workspace: Path,
+    instance: AgentbenchInstance,
+    timeout: int = 300,
+) -> tuple[bool, str]:
+    """Two-tier evaluation. Returns (success, test_output_summary).
+
+    Runs instance tests (test_file_runner -> pr_test_results.json)
+    then regression tests (repo_test_runner -> test_results.json).
+    Both must pass for success=True.
+
+    test_output_summary is a human-readable string like:
+    "INSTANCE: 3/3 passed | REGRESSION: 45/47 passed (2 flipped: test_foo, test_bar)"
+    """
+
+def build_prompt(
+    problem_description: str,
+    condition: Condition,
+    test_output: str | None = None,
+) -> str:
+    """Preamble + problem_description + optional failing test output.
+
+    Preamble: none=nothing, flat_llm=FLAT_PREAMBLE,
+    human/intent_layer=INTENT_LAYER_PREAMBLE (from existing prompt_builder.py).
+    """
+
+def run_single(
+    instance: AgentbenchInstance,
+    condition: Condition,
+    rep: int,
+    workspaces_dir: Path,
+    reference_clones: dict[str, Path],
+    index_cache: IndexCache,
+    claude_timeout: int = 1800,
+    model: str = "sonnet",
+) -> TaskResult:
+    """Full per-instance execution. Returns a standard TaskResult.
+
+    Steps:
+    1. Clone repo at base_sha (from reference clone, hardlink)
+    2. strip_agentbench_docs()
+    3. Inject condition context:
+       - none: nothing
+       - flat_llm: generate via existing _generate_flat_context() + paper's prompt
+       - human: inject_human_context() from repo HEAD
+       - intent_layer: restore from IndexCache + inject hooks
+    4. write_test_infrastructure()
+    5. Pre-validate: instance tests should fail, regression tests should pass
+    6. Create baseline commit
+    7. build_prompt()
+    8. run_claude()
+    9. evaluate_instance()
+    10. Return TaskResult(success=..., test_output=..., ...)
+    """
+```
+
+Key details:
+- `evaluate_instance()` returns `(bool, str)` — drops into existing
+  `TaskResult.success` and `TaskResult.test_output`. No new dataclass.
+- Uses `_parse_test_results()` helper that handles missing/corrupt JSON
+  defensively (returns None on failure, treated as test failure).
+- Timeout is split as a deadline: `end_time = time.time() + timeout`,
+  remaining time passed to each Docker step.
+- Reuses `run_in_docker`, `run_claude`, `clone_repo` from existing harness.
+- Human preamble reuses existing `INTENT_LAYER_PREAMBLE` (tells Claude to
+  look for CLAUDE.md and AGENTS.md — harmless if some don't exist).
+- Flat_llm always generates fresh via existing `_generate_flat_context()`
+  using the paper's prompt. No `extracted_plans.json` loader (we're already
+  diverging from their exact methodology with CC-in-CC).
+
+#### A3. Add HUMAN to Condition enum
+
+One-line change in `lib/task_runner.py`:
+
+```python
+class Condition(Enum):
+    NONE = "none"
+    FLAT_LLM = "flat_llm"
+    INTENT_LAYER = "intent_layer"
+    HUMAN = "human"
+```
+
+The existing YAML-based `run` command never produces `HUMAN` work items,
+so the existing if/elif chains in `task_runner.py` don't need HUMAN branches.
+HUMAN is only used by `agentbench_runner.run_single()`.
+
+#### A4. Refactor `lib/reporter.py` to be condition-agnostic
+
+Replace all hardcoded condition iteration with dynamic discovery:
+
+```python
+# Before (17 locations like this):
+none_runs = conditions.get("none", [])
+flat_runs = conditions.get("flat_llm", [])
+il_runs = conditions.get("intent_layer", [])
+
+# After:
+conditions_present = sorted(set(r.condition for r in results), key=lambda c: c.value)
+baseline = Condition.NONE
+treatments = [c for c in conditions_present if c != baseline]
+```
+
+McNemar and Fisher comparison pairs: generate from all unique condition
+pairs dynamically, not hardcoded. Same for markdown output sections.
+
+This is a preparatory refactor that makes adding any future condition
+zero-cost. Do it first, then HUMAN "just works."
+
+Also fix the same hardcoded lists in `cli.py`:
+- `_merge_results()` (line 149): iterate conditions dynamically
+- `_recompute_summary()` (line 192): same
+- Choice lists in CLI flags
+
+#### A5. Extract `_run_supervisor()` from `cli.py`
+
+Extract the supervisor loop (work queue, ThreadPoolExecutor, circuit breaker,
+control plane, retry, checkpoint, budget tracking) into a reusable function:
+
+```python
+def _run_supervisor(
+    work_queue: list[WorkItem],
+    run_single_fn: Callable[[WorkItem], TaskResult],
+    reporter: Reporter,
+    eval_id: str,
+    run_config: dict,
+    parallel: int = 4,
+    max_retry_rounds: int = 2,
+) -> list[TaskResult]:
+    """Shared supervisor loop for both YAML and AGENTbench runs."""
+```
+
+Both `run` and `run-agentbench` call this. Avoids duplicating ~400 lines.
+
+#### A6. Add `run-agentbench` subcommand to `cli.py` (~100 lines)
+
+```
+eval-harness run-agentbench \
+  --conditions none flat_llm human intent_layer \
+  --repetitions 1 \
+  --parallel 4 \
+  --timeout 1800 \
+  --filter-repo ansible_ansible \
+  --filter-ids "ansible_ansible-83217,getzep_graphiti-761" \
+  --resume results/in-progress-*.json \
+  --temperature 0
+```
+
+The subcommand:
+1. Loads instances from HuggingFace (with optional filters)
+2. Pre-pulls Docker images (12 unique, with retries)
+3. Creates reference clones (one per repo)
+4. Warms caches (intent_layer per repo, human context per repo)
+5. Builds work queue: instances x conditions x reps
+6. Calls `_run_supervisor()` with `agentbench_runner.run_single` as the
+   run function
+7. Results flow through existing reporter
+
+#### A7. Smoke test
+
+Pick `ansible_ansible-83217` (the first instance). Run 1 rep x 1 condition
+(none). Validate:
+- Docker image pulls on x86
+- Clone at base_sha works
+- Doc stripping runs
+- Test runners write parseable JSON
+- Pre-validation works (instance tests fail, regression passes)
+- `run_claude()` invocation works
+- Result flows through reporter
+
+Then expand: 1 instance x 4 conditions, then 5 instances x 4 conditions.
+
+### Phase B: Run the full 552
+
+#### B1. Full execution
+
+138 instances x 4 conditions x 1 rep = 552 runs at temp=0.
+Run on x86 EC2 with monitor supervising.
+Expected: ~16-24 hours, ~$830 Claude costs, ~$4 EC2 costs.
+
+#### B2. Analysis
+
+After Phase B1:
+- Compare our `none` baseline to their published Claude numbers (calibration)
+- Identify instances where conditions diverge
+- Produce pass-rate table (the Phase 1 deliverable)
+
+#### B3. Phase 2 decision
+
+Based on Phase B1 results:
+- Which instances warrant 3-5 rep deep-dive?
+- Run Phase 2 on signal instances with full stats (McNemar, Fisher)
+- Generate the publication-quality report
+
+## File change summary
+
+| File | Action | ~Lines | What |
+|------|--------|--------|------|
+| `lib/agentbench_loader.py` | **Create** | 50 | Dataclass + HuggingFace loader |
+| `lib/agentbench_runner.py` | **Create** | 150 | Strip, inject, evaluate, prompt, run_single |
+| `lib/cli.py` | Modify | +100, ~50 refactored | Extract `_run_supervisor()`, add `run-agentbench` |
+| `lib/task_runner.py` | Modify | +1 | Add `HUMAN = "human"` to Condition enum |
+| `lib/reporter.py` | Modify | ~0 net (refactor) | Make condition iteration dynamic |
+| `tests/test_agentbench.py` | **Create** | 100 | Loader parsing, evaluator with mock Docker |
+| **Total** | | ~400 new + ~50 refactored | |
+
+## What was cut (and why)
+
+| From original plan | Why cut |
+|---|---|
+| `AgentbenchTestResult` dataclass | Encode in existing `TaskResult.success` + `test_output` |
+| `regression_delta` tracking | YAGNI — investigate manually if needed |
+| New fields on `TaskResult` | Don't pollute shared type with adapter-specific fields |
+| `HUMAN_PREAMBLE` constant | Reuse existing `INTENT_LAYER_PREAMBLE` |
+| `build_agentbench_prompt()` in prompt_builder.py | Keep in agentbench_runner.py |
+| `_strip_agentbench_docs()` on TaskRunner | Standalone function in agentbench_runner.py |
+| `extracted_plans.json` cache loader | Just use existing `_generate_flat_context()` |
+| 6-pair McNemar/Fisher for Phase 1 | Meaningless at n=1. Defer to Phase 2 |
+| Regression Analysis markdown section | YAGNI — post-hoc script if needed |
+| Separate test files per module | One test file is enough |
+| 6-phase decomposition | Collapsed to 2 phases: one instance, then all 552 |
+
+## Error handling
+
+| Failure mode | Handling |
+|---|---|
+| Test runner JSON missing/corrupt | `_parse_test_results()` returns None, treated as failure |
+| Docker image pull failure | Pre-pull with retries in warmup phase |
+| Instance tests pass at base_sha | Pre-validation catches, recorded as `[pre-validation]` |
+| Regression tests fail at base_sha | Pre-validation catches, recorded as `[pre-validation]` |
+| Claude times out | Existing timeout handling, recorded as `[timeout]` |
+| Instance tests pass but regression fails | `success=False`, details in `test_output` |
+| Timeout cascading (two Docker steps) | Use deadline: `end_time = time.time() + timeout` |
+| Worker crash | Existing try/except in supervisor, `[worker-crash]` error |
+
+## Risks and mitigations
+
+| Risk | Impact | Mitigation |
+|------|--------|-----------|
+| Docker images stale/broken | Blocks all runs for that repo | Fallback: rebuild from setup_commands on base image |
+| HuggingFace dataset schema changes | Loader breaks | Pin dataset version, validate schema on load |
+| x86 emulation on ARM dev machines | Can't test locally | EC2 for execution; unit tests mock Docker |
+| CC-in-CC baseline doesn't match paper | Results not directly comparable | Report as "CC-in-CC replication" not "exact replication" |
+| Budget overrun | >$1500 | Circuit breaker, monitor, --filter-repo for partial runs |
+| Some repos have no developer context files | `human` = `none` for those repos | Record and report; still valid data point |
+
+## Success criteria
+
+- [ ] Smoke test: 1 instance x 4 conditions passes end-to-end on x86
+- [ ] Phase B1: >=120/138 instances complete (<=13% infra failure rate)
+- [ ] Our `none` baseline within 10pp of their published Claude numbers
+- [ ] Pass-rate table clearly shows per-condition results across 12 repos
+- [ ] Results reproducible: same dataset, same conditions, same output format
+
+## Dependencies
+
+- `datasets` Python package (for HuggingFace loading)
+- x86 Docker host (EC2 c5.xlarge or similar)
+- AGENTbench Docker images accessible (`tgloaguen/planbenchx86_*`)
+- Claude Code CLI installed and authenticated on EC2
+
+## References
+
+- Brainstorm: `docs/brainstorms/2026-02-24-agentbench-adapter-brainstorm.md`
+- Prior plan: `docs/plans/2026-02-16-feat-agentbench-replication-three-condition-eval-plan.md`
+- Paper: arxiv 2602.11988v1
+- Dataset: `eth-sri/agentbench` on HuggingFace (138 instances, 12 repos)
+- Paper repo: `github.com/eth-sri/agentbench`
+- Existing harness: `lib/task_runner.py`, `lib/cli.py`, `lib/reporter.py`
diff --git a/eval-harness/lib/agentbench_loader.py b/eval-harness/lib/agentbench_loader.py
new file mode 100644
index 0000000..80f70da
--- /dev/null
+++ b/eval-harness/lib/agentbench_loader.py
@@ -0,0 +1,91 @@
+# lib/agentbench_loader.py
+"""Load AGENTbench instances from HuggingFace (eth-sri/agentbench)."""
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+DATASET_NAME = "eth-sri/agentbench"  # HuggingFace dataset id passed to load_dataset()
+
+
+@dataclass(frozen=True)  # blocks attribute rebinding only; list/dict fields stay mutable
+class AgentbenchInstance:
+    """Single AGENTbench task instance (immutable, shared across workers)."""
+    instance_id: str                             # load_instances() filters on this via filter_ids
+    repo: str                                    # "ansible_ansible"
+    base_repo: str                               # "ansible/ansible"
+    base_sha: str                                # commit the workspace is checked out at
+    docker_image: str                            # image name handed to the Docker test steps
+    problem_description: str                     # bug description text (see build_prompt)
+    setup_commands: list[str]                    # prepended to every test command line
+    test_files: tuple[tuple[str, str], ...]      # ((path, content), ...) — frozen-compatible
+    test_commands: list[str]                     # commands for the instance (tier 1) tests
+    test_file_runner: str                        # script source written to run_pr_tests.py
+    repo_test_runner: str                        # script source written to run_tests.py
+    repo_test_commands: list[str]                # commands for the regression (tier 2) tests
+    repo_test_after_pr_patch: dict[str, bool]    # expected test_id -> passed map for tier 2
+    clean_pr_patch: str | None = None            # presumably the reference fix — confirm usage
+
+
+def load_instances(
+    filter_repo: str | None = None,
+    filter_ids: list[str] | None = None,
+) -> list[AgentbenchInstance]:
+    """Load instances from HuggingFace, zip test arrays, parse JSON fields.
+
+    A filter left as None/empty is not applied; rows with mismatched test arrays are skipped.
+    Requires the `datasets` package: pip install datasets
+    """
+    from datasets import load_dataset
+
+    ds = load_dataset(DATASET_NAME, split="test")
+    logger.info("Loaded %d instances from %s", len(ds), DATASET_NAME)
+    filter_ids_set = set(filter_ids) if filter_ids else None
+    instances: list[AgentbenchInstance] = []
+
+    for row in ds:
+        if filter_repo and row["repo"] != filter_repo:
+            continue
+        if filter_ids_set and row["instance_id"] not in filter_ids_set:
+            continue
+
+        names, contents = row["test_file_names"], row["test_file_contents"]
+        if len(names) != len(contents):
+            logger.warning(
+                "Skipping %s: test_file_names (%d) != test_file_contents (%d)",
+                row["instance_id"], len(names), len(contents),
+            )
+            continue
+
+        # Parse repo_test_after_pr_patch; null, bad JSON, or a non-object all become {}
+        rtapp = row.get("repo_test_after_pr_patch") or {}
+        try:
+            rtapp = json.loads(rtapp) if isinstance(rtapp, str) else rtapp
+        except json.JSONDecodeError:
+            rtapp = None
+        if not isinstance(rtapp, dict):
+            logger.warning("Bad JSON in repo_test_after_pr_patch for %s", row["instance_id"])
+            rtapp = {}
+
+        instances.append(AgentbenchInstance(
+            instance_id=row["instance_id"],
+            repo=row["repo"],
+            base_repo=row["base_repo"],
+            base_sha=row["base_sha"],
+            docker_image=row["docker_image"],
+            problem_description=row["problem_description"],
+            setup_commands=row["setup_commands"],
+            test_files=tuple(zip(names, contents)),
+            test_commands=row["test_commands"],
+            test_file_runner=row["test_file_runner"],
+            repo_test_runner=row["repo_test_runner"],
+            repo_test_commands=row["repo_test_commands"],
+            repo_test_after_pr_patch=rtapp,
+            clean_pr_patch=row.get("clean_pr_patch"),
+        ))
+
+    logger.info("Loaded %d instances (after filters)", len(instances))
+    return instances
diff --git a/eval-harness/lib/agentbench_runner.py b/eval-harness/lib/agentbench_runner.py
new file mode 100644
index 0000000..679f3a3
--- /dev/null
+++ b/eval-harness/lib/agentbench_runner.py
@@ -0,0 +1,472 @@
+# lib/agentbench_runner.py
+"""AGENTbench-specific execution logic.
+
+Loads instances from HuggingFace (via agentbench_loader), runs them through
+Docker + Claude, and returns standard TaskResult objects that flow through
+the existing reporter/stats pipeline.
+
+All AGENTbench-specific code is contained here — the existing TaskRunner
+class doesn't know about AGENTbench.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import os
+import shutil
+import subprocess
+import threading
+import time
+from pathlib import Path
+from typing import Callable
+
+from lib.agentbench_loader import AgentbenchInstance
+from lib.claude_runner import run_claude
+from lib.docker_runner import run_in_docker
+from lib.git_ops import clone_repo, checkout_commit, create_baseline_commit, get_diff_stats
+from lib.prompt_builder import FLAT_PREAMBLE, INTENT_LAYER_PREAMBLE
+from lib.task_runner import Condition, TaskResult, SkillGenerationMetrics
+
+logger = logging.getLogger(__name__)
+
+# Timeouts for Docker steps (setup, test runs) — separate from the Claude timeout
+DOCKER_STEP_TIMEOUT = 300  # default `timeout` of evaluate_instance()
+PRE_VALIDATION_TIMEOUT = 300  # presumably for run_single's pre-validation step — confirm
+
+
+def _build_docker_cmd(setup_commands: list[str], commands: list[str]) -> str:
+    """Chain the setup commands and then `commands` into one `&&`-joined shell line."""
+    separator = " && "  # && stops the chain at the first failing command
+    return separator.join(setup_commands + commands)
+
+
+def strip_docs(workspace: Path) -> int:
+    """Delete all .md files, .github/, docs/, .claude/, .cursor/, .codex/.
+
+    Git metadata under .git/ and directories named "*.md" are left in place.
+    Returns count of files/dirs removed.
+    """
+    count = 0
+    for dirname in (".github", "docs", ".claude", ".cursor", ".codex"):
+        if (workspace / dirname).is_dir():
+            shutil.rmtree(workspace / dirname)
+            count += 1
+
+    # Snapshot the matches first: unlinking while rglob is still walking is fragile
+    for md_file in list(workspace.rglob("*.md")):
+        in_git = ".git" in md_file.relative_to(workspace).parts
+        if not in_git and (md_file.is_symlink() or not md_file.is_dir()):
+            md_file.unlink()
+            count += 1
+    return count
+
+
+def inject_human_context(workspace: Path, base_repo: str) -> list[str]:
+    """Copy developer context files from the repo's default branch into workspace.
+
+    Clones base_repo (shallow=True) next to the workspace and copies root-level *.md
+    plus .claude/, .cursor/ and .github/. A failed clone is logged and ignored.
+    Returns list of relative paths injected.
+    """
+    copied: list[str] = []
+    # Scratch clone name carries pid + thread id, presumably to keep parallel workers apart
+    scratch = workspace.parent / f".human-ctx-{os.getpid()}-{threading.get_ident()}"
+    try:
+        clone_repo(f"https://github.com/{base_repo}.git", str(scratch), shallow=True)
+        for source_md in scratch.glob("*.md"):
+            shutil.copy2(source_md, workspace / source_md.name)
+            copied.append(source_md.name)
+
+        # Context directories are merged over whatever the workspace already holds
+        for dirname in (".claude", ".cursor", ".github"):
+            source_dir = scratch / dirname
+            if not (source_dir.exists() and source_dir.is_dir()):
+                continue
+            target_dir = workspace / dirname
+            shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
+            copied.extend(
+                str(entry.relative_to(workspace))
+                for entry in target_dir.rglob("*") if entry.is_file()
+            )
+    except subprocess.CalledProcessError:
+        logger.warning("Could not fetch HEAD context for %s", base_repo)
+    finally:
+        if scratch.exists():
+            shutil.rmtree(scratch)
+
+    return copied
+
+
+def write_test_infrastructure(workspace: Path, instance: AgentbenchInstance) -> None:
+    """Write test files plus both runner scripts; ValueError if a path escapes workspace."""
+    # Resolve once so the traversal check compares canonical paths
+    root = workspace.resolve()
+    for rel_path, content in instance.test_files:
+        target = (workspace / rel_path).resolve()
+        if not target.is_relative_to(root):
+            raise ValueError(f"Path traversal detected in test_files: {rel_path}")
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_text(content, encoding="utf-8")
+
+    # Runner scripts land at the workspace root and are made executable
+    for script_name, script_body in (
+        ("run_pr_tests.py", instance.test_file_runner),
+        ("run_tests.py", instance.repo_test_runner),
+    ):
+        script = workspace / script_name
+        script.write_text(script_body, encoding="utf-8")
+        script.chmod(0o755)
+
+
+def _parse_test_results(workspace: Path, filename: str) -> dict[str, bool] | None:
+    """Parse a test results JSON file. Returns None if missing, unreadable, or not a dict."""
+    results_path = workspace / filename
+    try:
+        data = json.loads(results_path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        logger.warning("Test results file missing: %s", results_path)
+        return None
+    except (ValueError, OSError) as e:  # ValueError: bad JSON or non-UTF-8 bytes
+        logger.warning("Failed to parse %s: %s", results_path, e)
+        return None
+    if not isinstance(data, dict):
+        logger.warning("Test results not a dict: %s", results_path)
+        return None
+    return data
+
+
+def evaluate_instance(
+    workspace: Path,
+    instance: AgentbenchInstance,
+    docker_image: str,
+    timeout: int = DOCKER_STEP_TIMEOUT,
+) -> tuple[bool, str]:
+    """Two-tier evaluation. Returns (success, test_output_summary).
+
+    Tier 1: Instance tests (test_file_runner → pr_test_results.json)
+    Tier 2: Repo regression tests (repo_test_runner → test_results.json)
+    Both must pass for success=True; `timeout` is applied to each tier separately.
+    Stale result files are removed before each tier so only fresh output is read.
+    """
+    parts = []
+
+    # Tier 1: Instance tests
+    (workspace / "pr_test_results.json").unlink(missing_ok=True)
+    test_cmd = _build_docker_cmd(instance.setup_commands, instance.test_commands)
+    result = run_in_docker(str(workspace), docker_image, test_cmd, timeout=timeout)
+    pr_results = _parse_test_results(workspace, "pr_test_results.json")
+    if pr_results is None:
+        return False, f"INSTANCE: pr_test_results.json missing or corrupt | docker exit={result.exit_code}"
+
+    pr_total = len(pr_results)
+    pr_passed = sum(1 for v in pr_results.values() if v)
+    parts.append(f"INSTANCE: {pr_passed}/{pr_total} passed")
+    if pr_total == 0:
+        # An empty report means no test ran; 0/0 must not be scored as a pass
+        return False, " | ".join(parts + ["no instance tests reported"])
+    if pr_passed != pr_total:
+        failed_tests = [k for k, v in pr_results.items() if not v]
+        parts.append(f"failed: {', '.join(failed_tests[:5])}")
+        return False, " | ".join(parts)
+
+    # Tier 2: Regression tests
+    (workspace / "test_results.json").unlink(missing_ok=True)
+    repo_test_cmd = _build_docker_cmd(instance.setup_commands, instance.repo_test_commands)
+    run_in_docker(str(workspace), docker_image, repo_test_cmd, timeout=timeout)
+    repo_results = _parse_test_results(workspace, "test_results.json")
+    if repo_results is None:
+        # Regression runner failed — instance tests passed though
+        parts.append("REGRESSION: test_results.json missing or corrupt")
+        return False, " | ".join(parts)
+
+    repo_total = len(repo_results)
+    repo_passed = sum(1 for v in repo_results.values() if v)
+    # Check which tests flipped compared to expected (repo_test_after_pr_patch)
+    expected = instance.repo_test_after_pr_patch
+    flipped = [
+        test_id for test_id, actual_pass in repo_results.items()
+        if expected.get(test_id) is not None and actual_pass != expected[test_id]
+    ]
+
+    repo_all_pass = repo_passed == repo_total and not flipped
+    parts.append(f"REGRESSION: {repo_passed}/{repo_total} passed")
+    if flipped:
+        parts.append(f"{len(flipped)} flipped: {', '.join(flipped[:5])}")
+
+    return repo_all_pass, " | ".join(parts)
+
+
+def build_prompt(
+    problem_description: str,
+    condition: Condition,
+) -> str:
+    """Return the agent prompt: condition preamble, then the bug description and test rules."""
+    preamble_by_condition = {
+        Condition.NONE: "",
+        Condition.FLAT_LLM: FLAT_PREAMBLE,
+        Condition.HUMAN: INTENT_LAYER_PREAMBLE,  # tells Claude to look for CLAUDE.md/AGENTS.md
+        Condition.INTENT_LAYER: INTENT_LAYER_PREAMBLE,
+    }
+    preamble = preamble_by_condition[condition]  # KeyError for a condition not mapped above
+    return (
+        f"{preamble}Fix the following bug:\n\n"
+        f"{problem_description}\n\n"
+        "The fix should make the existing tests pass. Do not modify the test files."
+    )
+
+
+# -- Context generation helpers (used by run_single Step 3) --
+
+# Generator signature: (workspace, instance, workspaces_dir, model) -> (input_tokens, output_tokens)
+_GenerateFn = Callable[[Path, AgentbenchInstance, Path, str], tuple[int, int]]
+
+
+def _generate_flat_context(
+    workspace: Path, instance: AgentbenchInstance, workspaces_dir: Path, model: str,
+) -> tuple[int, int]:
+    """Run Claude to generate a flat CLAUDE.md, mirror it to AGENTS.md, return token counts."""
+    from lib.prompt_builder import build_flat_generation_prompt
+    # stderr of the generation run is logged under <workspaces_dir>/../logs
+    logs = workspaces_dir.parent / "logs"
+    logs.mkdir(parents=True, exist_ok=True)
+    log_name = f"{instance.repo}-{instance.base_sha[:8]}-flat_gen.log"
+    generation = run_claude(
+        str(workspace), build_flat_generation_prompt(), timeout=600, model=model,
+        stderr_log=str(logs / log_name),
+    )
+    # Dual-write CLAUDE.md -> AGENTS.md (matching paper behavior); never overwrites AGENTS.md
+    source, mirror = workspace / "CLAUDE.md", workspace / "AGENTS.md"
+    if source.exists() and not mirror.exists():
+        shutil.copy2(source, mirror)
+    return generation.input_tokens, generation.output_tokens
+
+
+def _generate_il_context(
+    workspace: Path, plugin_root: str, model: str,
+) -> tuple[int, int]:
+    """Run the skill generation prompt in workspace; return (input, output) token counts."""
+    from lib.prompt_builder import build_skill_generation_prompt
+    # NOTE(review): takes plugin_root, so this does not match _GenerateFn as-is;
+    # presumably the caller adapts it — confirm
+    skill_prompt = build_skill_generation_prompt(plugin_root)
+    outcome = run_claude(str(workspace), skill_prompt, timeout=600, model=model)
+    return outcome.input_tokens, outcome.output_tokens
+
+
+def _inject_cached_context(
+    instance: AgentbenchInstance,
+    workspace: Path,
+    workspaces_dir: Path,
+    index_cache,
+    model: str,
+    cache_key: str,
+    generate_fn: _GenerateFn,
+) -> SkillGenerationMetrics:
+    """Restore context for cache_key from index_cache, or call generate_fn on a miss.
+
+    Shared by FLAT_LLM and INTENT_LAYER. Token counts stay 0 on a cache hit.
+    """
+    started = time.time()
+    entry = None
+    if index_cache:
+        repo_url = f"https://github.com/{instance.base_repo}.git"
+        entry = index_cache.lookup_repo(repo_url, cache_key)
+    hit = bool(entry)
+
+    if hit:
+        index_cache.restore(entry, str(workspace))
+        tokens_in, tokens_out = 0, 0
+    else:
+        # No cache (or a miss): generate now; nothing is written back to the cache here
+        tokens_in, tokens_out = generate_fn(workspace, instance, workspaces_dir, model)
+
+    return SkillGenerationMetrics(
+        wall_clock_seconds=time.time() - started,
+        input_tokens=tokens_in,
+        output_tokens=tokens_out,
+        cache_hit=hit,
+    )
+
+
+def run_single(
+    instance: AgentbenchInstance,
+    condition: Condition,
+    rep: int,
+    workspaces_dir: Path,
+    reference_clones: dict[str, Path],
+    index_cache=None,
+    claude_timeout: int = 1800,
+    model: str = "sonnet",
+    progress_callback=None,
+) -> TaskResult:
+    """Full per-instance execution. Returns a standard TaskResult.
+
+    Steps:
+    1. Clone repo at base_sha (from reference clone)
+    2. strip_docs()
+    3. Inject condition context
+    4. write_test_infrastructure()
+    5. Pre-validate: instance tests should fail
+    6. Create baseline commit
+    7. build_prompt() + run_claude()
+    8. evaluate_instance()
+    9. Collect diff stats and return TaskResult
+    """
+    import hashlib  # function-scope import; only the workspace name needs it
+    task_id = instance.instance_id
+    start = time.time()
+    docker_image = instance.docker_image
+
+    def _progress(step: str, msg: str = ""):
+        if progress_callback:
+            progress_callback(task_id, condition.value, step, msg)
+
+    # --- Step 1: Setup workspace ---
+    _progress("setup", "cloning workspace")
+    # Digest instead of hash(): str hashes are salted per process, so names would change every run
+    task_hash = hashlib.sha256(task_id.encode("utf-8")).hexdigest()[:4]
+    workspace_name = f"{instance.repo}-{instance.base_sha[:8]}-{task_hash}-{condition.value}-r{rep}"
+    workspace = workspaces_dir / workspace_name
+    if workspace.exists():
+        shutil.rmtree(workspace)
+
+    ref_clone = reference_clones.get(instance.base_repo)
+    clone_repo(
+        f"https://github.com/{instance.base_repo}.git",
+        str(workspace),
+        shallow=False,
+        reference=str(ref_clone) if ref_clone else None,
+    )
+    checkout_commit(str(workspace), instance.base_sha)
+
+    # --- Step 2: Strip docs ---
+    _progress("strip", "removing doc files")
+    strip_docs(workspace)
+
+    # --- Step 3: Inject condition context ---
+    skill_metrics = None
+    _progress("context", f"injecting {condition.value}")
+
+    if condition == Condition.FLAT_LLM:
+        skill_metrics = _inject_cached_context(
+            instance, workspace, workspaces_dir, index_cache, model,
+            cache_key="flat_llm",
+            generate_fn=_generate_flat_context,
+        )
+
+    elif condition == Condition.HUMAN:
+        human_start = time.time()
+        files = inject_human_context(workspace, instance.base_repo)
+        skill_metrics = SkillGenerationMetrics(
+            wall_clock_seconds=time.time() - human_start,
+            input_tokens=0,
+            output_tokens=0,
+            cache_hit=False,
+            files_created=files,
+        )
+
+    elif condition == Condition.INTENT_LAYER:
+        plugin_root = os.environ.get("INTENT_LAYER_PLUGIN_ROOT", "")
+        if not plugin_root:
+            return TaskResult(
+                task_id=task_id, condition=condition, success=False,
+                test_output="", wall_clock_seconds=time.time() - start,
+                input_tokens=0, output_tokens=0, tool_calls=0,
+                lines_changed=0, files_touched=[], rep=rep,
+                error="[infrastructure] INTENT_LAYER_PLUGIN_ROOT not set",
+            )
+        skill_metrics = _inject_cached_context(
+            instance, workspace, workspaces_dir, index_cache, model,
+            cache_key="intent_layer",
+            generate_fn=lambda ws, inst, ws_dir, mdl: _generate_il_context(ws, plugin_root, mdl),
+        )
+
+    # --- Step 4: Write test infrastructure ---
+    _progress("test-infra", "writing test files")
+    write_test_infrastructure(workspace, instance)
+
+    # --- Step 5: Pre-validation ---
+    _progress("pre-validate", "checking instance tests fail at base")
+    test_cmd = _build_docker_cmd(instance.setup_commands, instance.test_commands)
+    run_in_docker(  # side effect: writes pr_test_results.json
+        str(workspace), docker_image, test_cmd, timeout=PRE_VALIDATION_TIMEOUT
+    )
+    pre_pr_results = _parse_test_results(workspace, "pr_test_results.json")
+
+    if pre_pr_results and all(pre_pr_results.values()):
+        return TaskResult(
+            task_id=task_id, condition=condition, success=False,
+            test_output="", wall_clock_seconds=time.time() - start,
+            input_tokens=0, output_tokens=0, tool_calls=0,
+            lines_changed=0, files_touched=[], rep=rep,
+            error="[pre-validation] instance tests already pass at base_sha",
+            skill_generation=skill_metrics,
+        )
+
+    # --- Step 6: Baseline commit ---
+    _progress("baseline", "creating baseline commit")
+    create_baseline_commit(str(workspace))
+
+    # --- Step 7: Run Claude ---
+    _progress("claude", "running claude")
+    prompt = build_prompt(instance.problem_description, condition)
+    log_dir = workspaces_dir.parent / "logs"
+    log_dir.mkdir(parents=True, exist_ok=True)
+    stderr_log = log_dir / f"{task_id}-{condition.value}-r{rep}.log"
+
+    claude_result = run_claude(
+        str(workspace), prompt, timeout=claude_timeout, model=model,
+        stderr_log=str(stderr_log),
+    )
+
+    if claude_result.timed_out:
+        return TaskResult(
+            task_id=task_id, condition=condition, success=False,
+            test_output="", wall_clock_seconds=time.time() - start,
+            input_tokens=claude_result.input_tokens,
+            output_tokens=claude_result.output_tokens,
+            tool_calls=claude_result.tool_calls,
+            lines_changed=0, files_touched=[], rep=rep,
+            error="[timeout] claude timed out",
+            skill_generation=skill_metrics,
+            exit_code=claude_result.exit_code,
+            is_timeout=True,
+        )
+
+    # Check for empty run (Claude did nothing)
+    if claude_result.tool_calls == 0:
+        return TaskResult(
+            task_id=task_id, condition=condition, success=False,
+            test_output="", wall_clock_seconds=time.time() - start,
+            input_tokens=claude_result.input_tokens,
+            output_tokens=claude_result.output_tokens,
+            tool_calls=0,
+            lines_changed=0, files_touched=[], rep=rep,
+            error="[empty-run] claude made no tool calls",
+            skill_generation=skill_metrics,
+            exit_code=claude_result.exit_code,
+        )
+
+    # --- Step 8: Evaluate ---
+    _progress("evaluate", "running tests")
+    success, test_output = evaluate_instance(
+        workspace, instance, docker_image, timeout=DOCKER_STEP_TIMEOUT
+    )
+
+    # --- Step 9: Collect diff stats ---
+    diff = get_diff_stats(str(workspace))
+    return TaskResult(
+        task_id=task_id,
+        condition=condition,
+        success=success,
+        test_output=test_output,
+        wall_clock_seconds=time.time() - start,
+        input_tokens=claude_result.input_tokens,
+        output_tokens=claude_result.output_tokens,
+        tool_calls=claude_result.tool_calls,
+        lines_changed=diff.lines_changed,
+        files_touched=diff.files,
+        rep=rep,
+        skill_generation=skill_metrics,
+        exit_code=claude_result.exit_code,
+    )
diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index e10bfae..8e9a993 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -20,6 +20,8 @@
 from lib.git_ops import clone_repo, checkout_commit
 from lib.index_cache import IndexCache
 from lib.budget import check_budget, get_budget_status, refresh_budget_snapshot, fmt_tokens
+from lib.agentbench_loader import load_instances, AgentbenchInstance
+from lib.agentbench_runner import run_single as agentbench_run_single
 
 
 # Thread-safe print lock for progress output
@@ -65,7 +67,9 @@ def _load_prior_results(json_path: str) -> tuple[set[tuple[str, str]], set[tuple
                 f"Invalid results file: task at index {i} missing 'task_id' in {json_path}"
             )
         task_id = task["task_id"]
-        for cond_key in ("none", "flat_llm", "intent_layer"):
+        # Discover condition keys dynamically from prior results
+        cond_keys = Reporter._discover_conditions([task])
+        for cond_key in cond_keys:
             cond_data = task.get(cond_key)
             if cond_data is None:
                 continue
@@ -146,7 +150,11 @@ def _merge_results(new_results: 'EvalResults', prior_data: dict, passed_pairs: s
         merged_task = {"task_id": task_id}
         has_carried = False  # Any condition kept from prior via passed_pairs
         has_new = False  # Any condition replaced with new results
-        for cond_key in ("none", "flat_llm", "intent_layer"):
+        # Discover conditions from both prior and new task dicts
+        all_cond_keys = Reporter._discover_conditions(
+            [t for t in (prior_task, new_task) if t is not None]
+        )
+        for cond_key in all_cond_keys:
             if (task_id, cond_key) in passed_pairs:
                 # Carry forward from prior
                 merged_task[cond_key] = prior_task.get(cond_key)
@@ -189,16 +197,17 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
     for multi-run data. Significance flags (from McNemar) are NOT recomputed
     here — they are carried forward from the original compilation.
     """
+    # Discover conditions dynamically from the merged data
+    conditions_present = Reporter._discover_conditions(merged_results)
     cond_stats: dict[str, dict] = {
-        "none": {"successes": 0, "total": 0, "assigned": 0},
-        "flat_llm": {"successes": 0, "total": 0, "assigned": 0},
-        "intent_layer": {"successes": 0, "total": 0, "assigned": 0},
+        c: {"successes": 0, "total": 0, "assigned": 0}
+        for c in conditions_present
     }
     infra_errors = 0
     has_multi_run = False
 
     for task in merged_results:
-        for cond_key in ("none", "flat_llm", "intent_layer"):
+        for cond_key in conditions_present:
             cond_data = task.get(cond_key)
             if cond_data is None:
                 continue
@@ -234,18 +243,15 @@ def itt_rate(stats):
     summary: dict = {
         "total_tasks": len(merged_results),
         "infrastructure_errors": infra_errors,
-        "none_success_rate": rate(cond_stats["none"]),
-        "flat_llm_success_rate": rate(cond_stats["flat_llm"]),
-        "intent_layer_success_rate": rate(cond_stats["intent_layer"]),
-        "none_itt_rate": itt_rate(cond_stats["none"]),
-        "flat_llm_itt_rate": itt_rate(cond_stats["flat_llm"]),
-        "intent_layer_itt_rate": itt_rate(cond_stats["intent_layer"]),
         "resumed_from": None,  # Filled in by caller
     }
+    for label in conditions_present:
+        summary[f"{label}_success_rate"] = rate(cond_stats[label])
+        summary[f"{label}_itt_rate"] = itt_rate(cond_stats[label])
 
     # Add Wilson Score CIs when multi-run data is present
     if has_multi_run:
-        for label in ("none", "flat_llm", "intent_layer"):
+        for label in conditions_present:
             stats = cond_stats[label]
             if stats["total"] > 0:
                 ci_lower, ci_upper, _ = wilson_score_interval(
@@ -276,7 +282,7 @@ def _load_pre_validated_tasks(prior_data: dict) -> frozenset[str]:
         task_id = task.get("task_id")
         if not task_id:
             continue
-        for cond_key in ("none", "flat_llm", "intent_layer"):
+        for cond_key in Reporter._discover_conditions([task]):
             cond = task.get(cond_key)
             if cond is None:
                 continue
@@ -373,8 +379,8 @@ def scan(repo, output, since, limit, docker_image, setup, test_command, branch):
 @click.option("--no-cache", is_flag=True, help="Disable index caching entirely")
 @click.option("--cache-dir", default="workspaces/.index-cache", help="Index cache directory")
 @click.option("--condition", "-c", multiple=True,
-              type=click.Choice(["none", "flat_llm", "intent_layer"]),
-              help="Conditions to run (default: all three)")
+              type=click.Choice([c.value for c in Condition]),
+              help="Conditions to run (default: none, flat_llm, intent_layer)")
 @click.option("--model", default="sonnet",
               help="Claude model to use (default: sonnet)")
 @click.option("--repetitions", "-n", default=1,
@@ -407,11 +413,12 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve
         cache.clear()
         click.echo(f"Cleared index cache at {cache_dir}")
 
-    # Determine conditions to run
+    # Determine conditions to run (HUMAN is AGENTbench-only, not used here)
+    YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER]
     if condition:
         conditions = [Condition(c) for c in condition]
     else:
-        conditions = list(Condition)
+        conditions = YAML_CONDITIONS
 
     # Build work queue (with repetitions)
     work_queue = []
@@ -778,12 +785,15 @@ def _run_batch(batch: list, workers: int) -> list[TaskResult]:
                 except Exception as e:
                     click.echo(f"  Warning: trial write failed: {e}", err=True)
 
-                try:
-                    checkpoint_path = reporter.write_checkpoint(results, eval_id, run_config=run_config)
-                    if len(results) == 1:
-                        click.echo(f"  Checkpoint: {checkpoint_path} (updated after each result)")
-                except Exception as e:
-                    click.echo(f"  Warning: checkpoint write failed: {e}", err=True)
+                # Checkpoint every 10 results (+ first and last) to avoid O(n^2) recompilation
+                is_last = len(results) >= len(work_queue)
+                if len(results) == 1 or len(results) % 10 == 0 or is_last:
+                    try:
+                        checkpoint_path = reporter.write_checkpoint(results, eval_id, run_config=run_config)
+                        if len(results) == 1:
+                            click.echo(f"  Checkpoint: {checkpoint_path} (updated every 10 results)")
+                    except Exception as e:
+                        click.echo(f"  Warning: checkpoint write failed: {e}", err=True)
 
                 if budget_threshold and not budget_warned and preflight_budget:
                     cumulative_tokens = sum(r.input_tokens + r.output_tokens for r in results)
@@ -875,9 +885,8 @@ def _cb_reset():
         click.echo(f"\n  {len(infra_results)} infra failures detected, diagnosing...")
 
         # Remove infra results from global list — they'll be replaced by retries
-        for r in infra_results:
-            if r in results:
-                results.remove(r)
+        infra_remove = {(r.task_id, r.condition.value, r.rep) for r in infra_results}
+        results[:] = [r for r in results if (r.task_id, r.condition.value, r.rep) not in infra_remove]
 
         # Diagnose and remediate
         has_docker_failures = any(
@@ -948,6 +957,399 @@ def _cb_reset():
         click.echo("Cleaned up workspaces")
 
 
+@main.command("run-agentbench")
+@click.option("--parallel", "-p", default=4, help="Number of parallel workers")
+@click.option("--output", "-o", default="results", help="Output directory")
+@click.option("--timeout", default=1800, help="Per-task Claude timeout in seconds")
+@click.option("--verbose", "-v", is_flag=True, help="Show detailed progress for each step")
+@click.option("--condition", "-c", multiple=True,
+              type=click.Choice([c.value for c in Condition]),
+              help="Conditions to run (default: none, flat_llm, human, intent_layer)")
+@click.option("--model", default="sonnet", help="Claude model to use (default: sonnet)")
+@click.option("--repetitions", "-n", default=1,
+              help="Number of times to repeat each instance/condition pair (default: 1)")
+@click.option("--filter-repo", default=None, help="Only run instances from this repo (e.g. ansible_ansible)")
+@click.option("--filter-ids", default=None, help="Comma-separated instance IDs to run")
+@click.option("--resume", default=None, type=click.Path(exists=True),
+              help="Prior results JSON — skip passed pairs, re-run infra errors")
+@click.option("--retry-all", is_flag=True,
+              help="With --resume: also retry genuine failures, not just infra errors")
+@click.option("--keep-workspaces", is_flag=True, help="Don't cleanup workspaces")
+@click.option("--dry-run", is_flag=True, help="Show what would run")
+@click.option("--no-cache", is_flag=True, help="Disable index caching entirely")
+@click.option("--cache-dir", default="workspaces/.index-cache", help="Index cache directory")
+def run_agentbench(parallel, output, timeout, verbose, condition, model, repetitions,
+                   filter_repo, filter_ids, resume, retry_all, keep_workspaces,
+                   dry_run, no_cache, cache_dir):
+    """Run AGENTbench paper instances (138 tasks from HuggingFace)."""
+    import subprocess
+    import time as _time
+
+    # --- Load instances from HuggingFace ---
+    filter_id_list = [s.strip() for s in filter_ids.split(",")] if filter_ids else None
+    click.echo("Loading AGENTbench instances from HuggingFace...")
+    instances = load_instances(filter_repo=filter_repo, filter_ids=filter_id_list)
+    if not instances:
+        raise click.ClickException("No instances matched filters")
+    click.echo(f"Loaded {len(instances)} instances across "
+               f"{len(set(i.repo for i in instances))} repos")
+
+    # Default conditions: all four for AGENTbench
+    ALL_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.HUMAN, Condition.INTENT_LAYER]
+    if condition:
+        conditions = [Condition(c) for c in condition]
+    else:
+        conditions = ALL_CONDITIONS
+
+    # --- Build work queue ---
+    work_queue: list[tuple[AgentbenchInstance, Condition, int]] = []
+    for inst in instances:
+        for cond in conditions:
+            for rep in range(repetitions):
+                work_queue.append((inst, cond, rep))
+
+    # --- Resume filtering ---
+    passed_pairs: set[tuple[str, str]] = set()
+    genuine_fail_pairs: set[tuple[str, str]] = set()
+    prior_data = None
+    if resume:
+        passed_pairs, genuine_fail_pairs, prior_data = _load_prior_results(resume)
+        skip_pairs = passed_pairs.copy()
+        if not retry_all:
+            skip_pairs |= genuine_fail_pairs
+        work_queue = [item for item in work_queue if (item[0].instance_id, item[1].value) not in skip_pairs]
+        click.echo(f"Resume: {len(passed_pairs)} passed (carried forward), "
+                   f"{len(genuine_fail_pairs)} genuine failures ({'retrying' if retry_all else 'skipped'}), "
+                   f"{len(work_queue)} to re-run")
+
+    if dry_run:
+        click.echo("\nDry run - would execute:")
+        for inst, cond, rep in work_queue:
+            rep_tag = f" [rep {rep+1}]" if repetitions > 1 else ""
+            click.echo(f"  - {inst.instance_id} ({cond.value}){rep_tag}")
+        if not work_queue:
+            click.echo("  (nothing to re-run)")
+        return
+
+    total_unique = len(set((item[0].instance_id, item[1].value) for item in work_queue))
+    rep_note = f" x{repetitions} reps" if repetitions > 1 else ""
+    click.echo(f"Running {total_unique} instance/condition pairs{rep_note} "
+               f"({len(work_queue)} total) with {parallel} workers")
+
+    # --- Pre-pull Docker images ---
+    unique_images = sorted(set(i.docker_image for i in instances))
+    click.echo(f"Pre-pulling {len(unique_images)} Docker image(s)...")
+    for image in unique_images:
+        for attempt in range(3):
+            try:
+                subprocess.run(
+                    ["docker", "pull", image],
+                    capture_output=True, timeout=300, check=True,
+                )
+                click.echo(f"  {image}: ready")
+                break
+            except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
+                if attempt < 2:
+                    click.echo(f"  {image}: retry {attempt+1}/3 ({e})", err=True)
+                else:
+                    click.echo(f"  {image}: FAILED after 3 attempts", err=True)
+
+    # --- Create reference clones ---
+    workspaces_dir = Path("workspaces")
+    unique_repos = {f"https://github.com/{i.base_repo}.git" for i in instances}
+    reference_clones_str = _create_reference_clones(unique_repos, workspaces_dir)
+    # Convert to Path values for agentbench_runner
+    reference_clones: dict[str, Path] = {
+        # Key by repo slug (e.g. "ansible/ansible"); strip only the URL prefix and the ".git" suffix
+        repo_url.removeprefix("https://github.com/").removesuffix(".git"): Path(path)
+        for repo_url, path in reference_clones_str.items()
+    }
+
+    # --- Warm caches ---
+    index_cache = None if no_cache else IndexCache(cache_dir)
+
+    progress_callback = _make_progress_callback(verbose)
+
+    # --- Circuit breaker ---
+    _cb_counts: dict[tuple[str, ...], int] = {}
+    _cb_tripped: set[tuple[str, ...]] = set()
+    _cb_lock = threading.Lock()
+    CB_THRESHOLD = 2
+
+    def _cb_record(task_id: str, condition_val: str, error: str) -> bool:
+        with _cb_lock:
+            if "[pre-validation]" in error:
+                key = (task_id,)
+            else:
+                key = (task_id, condition_val)
+            _cb_counts[key] = _cb_counts.get(key, 0) + 1
+            if _cb_counts[key] >= CB_THRESHOLD:
+                newly = key not in _cb_tripped
+                _cb_tripped.add(key)
+                return newly
+        return False
+
+    def _cb_is_tripped(task_id: str, condition_val: str) -> bool:
+        with _cb_lock:
+            return (task_id,) in _cb_tripped or (task_id, condition_val) in _cb_tripped
+
+    def _cb_reset():
+        with _cb_lock:
+            _cb_counts.clear()
+            _cb_tripped.clear()
+
+    # --- Worker function ---
+    def _run_one(item: tuple[AgentbenchInstance, Condition, int]) -> TaskResult:
+        inst, cond, rep = item
+
+        if _cb_is_tripped(inst.instance_id, cond.value):
+            return TaskResult(
+                task_id=inst.instance_id, condition=cond, success=False,
+                test_output="", wall_clock_seconds=0,
+                input_tokens=0, output_tokens=0, tool_calls=0,
+                lines_changed=0, files_touched=[], rep=rep,
+                error="[circuit-breaker] skipped — repeated failures for this instance",
+            )
+
+        result = agentbench_run_single(
+            instance=inst,
+            condition=cond,
+            rep=rep,
+            workspaces_dir=workspaces_dir,
+            reference_clones=reference_clones,
+            index_cache=index_cache,
+            claude_timeout=timeout,
+            model=model,
+            progress_callback=progress_callback,
+        )
+
+        if result.error:
+            newly_tripped = _cb_record(inst.instance_id, cond.value, result.error)
+            if newly_tripped:
+                scope = inst.instance_id if "[pre-validation]" in result.error else f"{inst.instance_id}/{cond.value}"
+                click.echo(f"  \u26a1 Circuit breaker tripped for {scope}")
+
+        return result
+
+    # --- Reporter + eval ID ---
+    reporter = Reporter(output)
+    eval_id = datetime.now().strftime("%Y-%m-%d-%H%M%S")
+    run_config = {
+        "task_ids": sorted(set(i.instance_id for i in instances)),
+        "conditions": sorted(c.value for c in conditions),
+        "repetitions": repetitions,
+        "timeout": timeout,
+        "model": model,
+        "source": "agentbench",
+        "filter_repo": filter_repo,
+        "filter_ids": filter_ids,
+    }
+
+    # --- Control plane ---
+    control_dir = Path(output) / ".eval-control"
+    control_dir.mkdir(parents=True, exist_ok=True)
+    status_path = Path(output) / ".eval-status.json"
+    _current_workers = parallel
+    _start_time = _time.time()
+    _paused = False
+
+    def _write_status(batch_total: int, completed: int, infra_fails: int,
+                      genuine_fails: int, passes: int, paused: bool):
+        status = {
+            "eval_id": eval_id,
+            "timestamp": datetime.now().isoformat(),
+            "uptime_seconds": _time.time() - _start_time,
+            "workers": _current_workers,
+            "paused": paused,
+            "batch_total": batch_total,
+            "completed": completed,
+            "passes": passes,
+            "genuine_failures": genuine_fails,
+            "infra_failures": infra_fails,
+            "remaining": batch_total - completed,
+            "pass_rate": round(passes / max(completed - infra_fails, 1), 3),
+            "infra_rate": round(infra_fails / max(completed, 1), 3),
+        }
+        tmp = status_path.with_suffix(f".tmp.{threading.get_ident()}")
+        with open(tmp, "w") as f:
+            json.dump(status, f, indent=2)
+        # replace() overwrites an existing status file on every platform; rename() fails on Windows
+        tmp.replace(status_path)
+
+    def _check_control() -> dict[str, str]:
+        commands = {}
+        if not control_dir.exists():
+            return commands
+        for p in sorted(control_dir.iterdir()):
+            if p.name.startswith("."):
+                continue
+            val = p.read_text().strip() if p.stat().st_size > 0 else ""
+            commands[p.name] = val
+            p.unlink()
+        return commands
+
+    # --- Batch runner ---
+    results: list[TaskResult] = []
+
+    def _run_batch(batch: list, workers: int) -> list[TaskResult]:
+        nonlocal _current_workers, _paused
+        batch_results: list[TaskResult] = []
+        with ThreadPoolExecutor(max_workers=workers) as executor:
+            futures = {executor.submit(_run_one, item): item for item in batch}
+            for future in as_completed(futures):
+                # Check control commands
+                for cmd, val in _check_control().items():
+                    if cmd == "pause":
+                        _paused = True
+                        click.echo("\n  \u23f8 Paused by external supervisor")
+                    elif cmd == "resume":
+                        _paused = False
+                        click.echo("\n  \u25b6 Resumed")
+                    elif cmd == "set-workers":
+                        try:
+                            _current_workers = max(1, int(val))
+                            click.echo(f"\n  \u2699 Workers set to {_current_workers} (next batch)")
+                        except ValueError:
+                            click.echo(f"\n  Warning: invalid set-workers value: {val}", err=True)
+                    elif cmd == "skip-task":
+                        with _cb_lock:
+                            _cb_tripped.add((val,))
+                        click.echo(f"\n  \u23ed Skipping {val}")
+
+                while _paused:
+                    _time.sleep(2)
+                    for cmd2, _ in _check_control().items():
+                        if cmd2 == "resume":
+                            _paused = False
+                            click.echo("\n  \u25b6 Resumed")
+
+                item = futures[future]
+                _inst, _cond, rep = item
+                try:
+                    result = future.result()
+                except Exception as e:
+                    click.echo(f"  {_inst.instance_id} ({_cond.value}): CRASH - {e}", err=True)
+                    result = TaskResult(
+                        task_id=_inst.instance_id, condition=_cond, success=False,
+                        test_output="", wall_clock_seconds=0,
+                        input_tokens=0, output_tokens=0, tool_calls=0,
+                        lines_changed=0, files_touched=[], rep=rep,
+                        error=f"[worker-crash] {e}",
+                    )
+                batch_results.append(result)
+                results.append(result)
+
+                status_str = "PASS" if result.success else "FAIL"
+                rep_tag = f" [rep {rep+1}/{repetitions}]" if repetitions > 1 else ""
+                line = f"  {result.task_id} ({result.condition.value}){rep_tag}: {status_str}"
+                if not result.success:
+                    if result.error:
+                        first_line = result.error.split("\n")[0][:80]
+                        line += f" - {first_line}"
+                    elif result.test_output:
+                        non_empty = [s.strip() for s in result.test_output.strip().split("\n") if s.strip()]
+                        if non_empty:
+                            line += f" - {non_empty[-1][:80]}"
+                click.echo(line)
+
+                try:
+                    reporter.write_trial(result)
+                except Exception as e:
+                    click.echo(f"  Warning: trial write failed: {e}", err=True)
+                # Checkpoint every 10 results (+ first and last) to avoid O(n^2) recompilation
+                is_last = len(results) >= len(work_queue)
+                if len(results) == 1 or len(results) % 10 == 0 or is_last:
+                    try:
+                        checkpoint_path = reporter.write_checkpoint(results, eval_id, run_config=run_config)
+                        if len(results) == 1:
+                            click.echo(f"  Checkpoint: {checkpoint_path}")
+                    except Exception as e:
+                        click.echo(f"  Warning: checkpoint write failed: {e}", err=True)
+
+                n_infra = sum(1 for r in results if r.error and r.error.startswith(Reporter.INFRA_ERROR_PREFIXES))
+                n_pass = sum(1 for r in results if r.success)
+                n_genuine = len(results) - n_pass - n_infra
+                _write_status(len(work_queue), len(results), n_infra, n_genuine, n_pass, _paused)
+
+        return batch_results
+
+    # --- Supervisor loop ---
+    MAX_RETRY_ROUNDS = 2
+    current_batch = work_queue
+
+    for supervisor_round in range(1 + MAX_RETRY_ROUNDS):
+        if supervisor_round > 0:
+            click.echo(f"\n{'='*60}")
+            click.echo(f"Supervisor retry round {supervisor_round}/{MAX_RETRY_ROUNDS}")
+            click.echo(f"{'='*60}")
+
+        batch_results = _run_batch(current_batch, _current_workers)
+
+        infra_results = [
+            r for r in batch_results
+            if r.error and (
+                r.error.startswith(Reporter.INFRA_ERROR_PREFIXES)
+                or r.error.startswith("[circuit-breaker]")
+            )
+        ]
+
+        if not infra_results:
+            break
+
+        infra_keys = {(r.task_id, r.condition.value, r.rep) for r in infra_results}
+        retry_queue = [
+            item for item in current_batch
+            if (item[0].instance_id, item[1].value, item[2]) in infra_keys
+        ]
+
+        if not retry_queue or supervisor_round >= MAX_RETRY_ROUNDS:
+            if retry_queue:
+                click.echo(f"\n  {len(retry_queue)} infra failures remain after {MAX_RETRY_ROUNDS} retry rounds")
+            break
+
+        click.echo(f"\n  {len(infra_results)} infra failures detected, retrying...")
+        results[:] = [r for r in results if (r.task_id, r.condition.value, r.rep) not in infra_keys]
+
+        _cb_reset()
+        current_batch = retry_queue
+
+    # --- Generate reports ---
+    eval_results = reporter.compile_results(results)
+    eval_results = replace(
+        eval_results,
+        eval_id=eval_id,
+        timestamp=eval_results.timestamp,
+        run_config=run_config,
+    )
+
+    if prior_data is not None:
+        carry_forward = passed_pairs | (genuine_fail_pairs if not retry_all else set())
+        eval_results = _merge_results(eval_results, prior_data, carry_forward)
+        eval_results.summary["resumed_from"] = prior_data.get("eval_id")
+
+    json_path = reporter.write_json(eval_results)
+    md_path = reporter.write_markdown(eval_results)
+    reporter.remove_checkpoint(eval_id)
+
+    click.echo(f"\nResults written to:")
+    click.echo(f"  JSON: {json_path}")
+    click.echo(f"  Markdown: {md_path}")
+
+    if not keep_workspaces and workspaces_dir.exists():
+        cache_path = Path(cache_dir)
+        tmp_cache = None
+        if cache_path.exists() and cache_path.is_relative_to(workspaces_dir):
+            tmp_cache = workspaces_dir.parent / ".index-cache-preserve"
+            if tmp_cache.exists():
+                shutil.rmtree(tmp_cache)
+            shutil.move(str(cache_path), str(tmp_cache))
+        shutil.rmtree(workspaces_dir)
+        if tmp_cache and tmp_cache.exists():
+            cache_path.parent.mkdir(parents=True, exist_ok=True)
+            shutil.move(str(tmp_cache), str(cache_path))
+        click.echo("Cleaned up workspaces")
+
+
 @main.command()
 @click.option("--tasks", "-t", multiple=True, required=True, help="Task YAML files")
 @click.option("--parallel", "-p", default=8, help="Number of parallel workers")
diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index 98406d5..40f3bb8 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -46,7 +46,7 @@ def run_in_docker(
         cmd.extend(["-v", f"{cache_volume}:/root/.cache"])
     cmd.extend([
         "-w", "/work",
-        "--network", "host",
+        "--network", "none",
         "--memory", memory,
         "--cpus", cpus,
         image,
diff --git a/eval-harness/lib/monitor.py b/eval-harness/lib/monitor.py
index f30738a..5fa28be 100644
--- a/eval-harness/lib/monitor.py
+++ b/eval-harness/lib/monitor.py
@@ -16,6 +16,8 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
+from lib.reporter import Reporter
+
 log = logging.getLogger("monitor")
 
 
@@ -85,24 +87,26 @@ def count_task_infra_failures(checkpoint: dict) -> dict[str, int]:
         task_id = task_result["task_id"]
         total_infra = 0
         total_runs = 0
-        for cond in ("none", "flat_llm", "intent_layer"):
+        skip_keys = {"task_id", "deltas"}
+        cond_keys = [k for k in task_result if k not in skip_keys]
+        for cond in cond_keys:
             cond_data = task_result.get(cond)
             if not cond_data:
                 continue
             runs = cond_data.get("runs", [])
             if not runs:
                 # Single run format
-                if cond_data.get("error", "").startswith((
-                    "[infrastructure]", "[pre-validation]",
-                )):
+                if cond_data.get("error", "").startswith(
+                    Reporter.INFRA_ERROR_PREFIXES
+                ):
                     total_infra += 1
                 total_runs += 1
             else:
                 for run in runs:
                     total_runs += 1
-                    if run.get("error", "").startswith((
-                        "[infrastructure]", "[pre-validation]",
-                    )):
+                    if run.get("error", "").startswith(
+                        Reporter.INFRA_ERROR_PREFIXES
+                    ):
                         total_infra += 1
         # Only count if ALL runs are infra failures
         if total_runs > 0 and total_infra == total_runs:
diff --git a/eval-harness/lib/reporter.py b/eval-harness/lib/reporter.py
index 8d452e1..c7352e4 100644
--- a/eval-harness/lib/reporter.py
+++ b/eval-harness/lib/reporter.py
@@ -23,10 +23,42 @@ class EvalResults:
 
 
 class Reporter:
+    # Display names for conditions in markdown output
+    DISPLAY_NAMES = {
+        "none": "None",
+        "flat_llm": "Flat LLM",
+        "intent_layer": "Intent Layer",
+        "human": "Human",
+    }
+
     def __init__(self, output_dir: str):
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
+    @staticmethod
+    def _discover_conditions(compiled_results: list[dict]) -> list[str]:
+        """List the condition keys that carry data in compiled result dicts.
+
+        Keys "task_id" and "deltas" and keys whose value is None are ignored.
+        The result is sorted, except that "none" (if found) is moved to the front.
+        """
+        non_condition = ("task_id", "deltas")
+        found = {
+            name
+            for entry in compiled_results
+            for name, value in entry.items()
+            if name not in non_condition and value is not None
+        }
+        ordered = sorted(found - {"none"})
+        if "none" in found:
+            ordered.insert(0, "none")
+        return ordered
+
+    @classmethod
+    def _display_name(cls, cond_key: str) -> str:
+        """Return cond_key's DISPLAY_NAMES label, or the key title-cased with "_" as spaces."""
+        return cls.DISPLAY_NAMES.get(cond_key, cond_key.replace("_", " ").title())
+
     def compile_results(
         self,
         results: list[TaskResult],
@@ -52,22 +84,24 @@ def compile_results(
                 grouped[r.task_id][cond] = []
             grouped[r.task_id][cond].append(r)
 
+        # Discover conditions present in this run
+        conditions_present = sorted(set(r.condition.value for r in results))
+        baseline = Condition.NONE.value
+        treatments = [c for c in conditions_present if c != baseline]
+
         compiled = []
         for task_id, conditions in grouped.items():
-            none_runs = conditions.get("none", [])
-            flat_runs = conditions.get("flat_llm", [])
-            il_runs = conditions.get("intent_layer", [])
-
-            task_result = {
-                "task_id": task_id,
-                "none": self._serialize_condition(none_runs) if none_runs else None,
-                "flat_llm": self._serialize_condition(flat_runs) if flat_runs else None,
-                "intent_layer": self._serialize_condition(il_runs) if il_runs else None,
-                "deltas": {
-                    "flat_llm": self._compute_delta(none_runs, flat_runs),
-                    "intent_layer": self._compute_delta(none_runs, il_runs),
-                }
-            }
+            task_result: dict[str, Any] = {"task_id": task_id}
+            for cond_key in conditions_present:
+                runs = conditions.get(cond_key, [])
+                task_result[cond_key] = self._serialize_condition(runs) if runs else None
+
+            baseline_runs = conditions.get(baseline, [])
+            deltas = {}
+            for treatment in treatments:
+                deltas[treatment] = self._compute_delta(baseline_runs, conditions.get(treatment, []))
+            task_result["deltas"] = deltas
+
             compiled.append(task_result)
 
         summary = self._compute_summary(results)
@@ -294,6 +328,8 @@ def _compute_summary(self, results: list[TaskResult]) -> dict:
         Two scoring modes:
         - Per-protocol: infra errors excluded from denominator (existing behavior)
         - ITT (intent-to-treat): all assigned tasks count, timeout/infra = fail
+
+        Condition-agnostic: iterates over whatever conditions appear in results.
         """
         def success_rate(task_results: list[TaskResult]) -> float:
             """Per-protocol: exclude infra errors from denominator."""
@@ -308,9 +344,8 @@ def itt_rate(task_results: list[TaskResult]) -> float:
                 return 0
             return round(sum(1 for r in task_results if r.success) / len(task_results), 2)
 
-        none_results = [r for r in results if r.condition == Condition.NONE]
-        flat_results = [r for r in results if r.condition == Condition.FLAT_LLM]
-        il_results = [r for r in results if r.condition == Condition.INTENT_LAYER]
+        conditions_present = sorted(set(r.condition for r in results), key=lambda c: c.value)
+        per_cond = {c: [r for r in results if r.condition == c] for c in conditions_present}
 
         infra_errors = sum(1 for r in results if self._is_infra_error(r))
 
@@ -324,29 +359,24 @@ def median_tokens(task_results: list[TaskResult]) -> int:
         summary: dict[str, Any] = {
             "total_tasks": len(set(r.task_id for r in results)),
             "infrastructure_errors": infra_errors,
-            "none_success_rate": success_rate(none_results),
-            "flat_llm_success_rate": success_rate(flat_results),
-            "intent_layer_success_rate": success_rate(il_results),
-            "none_itt_rate": itt_rate(none_results),
-            "flat_llm_itt_rate": itt_rate(flat_results),
-            "intent_layer_itt_rate": itt_rate(il_results),
-            "none_median_tokens": median_tokens(none_results),
-            "flat_llm_median_tokens": median_tokens(flat_results),
-            "intent_layer_median_tokens": median_tokens(il_results),
         }
 
+        for cond in conditions_present:
+            label = cond.value
+            cond_results = per_cond[cond]
+            summary[f"{label}_success_rate"] = success_rate(cond_results)
+            summary[f"{label}_itt_rate"] = itt_rate(cond_results)
+            summary[f"{label}_median_tokens"] = median_tokens(cond_results)
+
         # Add CIs when we have multi-run data
         has_multi_run = any(
             sum(1 for r2 in results if r2.task_id == r.task_id and r2.condition == r.condition) > 1
             for r in results
         )
         if has_multi_run:
-            for label, cond_results in [
-                ("none", none_results),
-                ("flat_llm", flat_results),
-                ("intent_layer", il_results),
-            ]:
-                valid = [r for r in cond_results if not self._is_infra_error(r)]
+            for cond in conditions_present:
+                label = cond.value
+                valid = [r for r in per_cond[cond] if not self._is_infra_error(r)]
                 if valid:
                     successes = sum(1 for r in valid if r.success)
                     ci_lower, ci_upper, _ = wilson_score_interval(successes, len(valid), 0.90)
@@ -355,22 +385,19 @@ def median_tokens(task_results: list[TaskResult]) -> int:
                         "upper": round(ci_upper, 3),
                     }
 
-            # Significance: derived from McNemar (paired test), not CI overlap.
-            # CI overlap is a visual heuristic only — it's not a valid test
-            # for paired data and maps unreliably to p-values.
-            # McNemar p-values are populated below in _compute_mcnemar.
-
         # McNemar's paired analysis: compare conditions per (task, rep) pair
         summary["mcnemar"] = self._compute_mcnemar(results)
 
         # Derive significance flags from McNemar p-values (paired test).
         # Only for multi-run data — single-run has too few pairs to be meaningful.
+        baseline = Condition.NONE
+        treatments = [c for c in conditions_present if c != baseline]
         if has_multi_run:
-            for treatment in ("flat_llm", "intent_layer"):
-                key = f"{treatment}_vs_none"
+            for treatment in treatments:
+                key = f"{treatment.value}_vs_{baseline.value}"
                 mcnemar_entry = summary["mcnemar"].get(key)
                 if mcnemar_entry and mcnemar_entry["n_discordant"] > 0:
-                    summary[f"{treatment}_vs_none_significant"] = mcnemar_entry["p_value"] < 0.05
+                    summary[f"{treatment.value}_vs_{baseline.value}_significant"] = mcnemar_entry["p_value"] < 0.05
 
         # Per-task Fisher's exact tests + recommendations
         if has_multi_run:
@@ -400,11 +427,21 @@ def _compute_mcnemar(self, results: list[TaskResult]) -> dict:
             for runs in conditions.values():
                 runs.sort(key=lambda r: r.rep)
 
-        comparisons = [
-            ("flat_llm", "none"),
-            ("intent_layer", "none"),
-            ("intent_layer", "flat_llm"),
-        ]
+        # Generate all unique condition pairs: (treatment, baseline).
+        # Put the baseline second in each pair so keys read "treatment_vs_baseline".
+        baseline = Condition.NONE.value
+        cond_values = sorted(set(r.condition.value for r in results))
+        comparisons = []
+        for i, a in enumerate(cond_values):
+            for b in cond_values[i + 1:]:
+                # Ensure baseline is always the second element
+                if a == baseline:
+                    comparisons.append((b, a))
+                elif b == baseline:
+                    comparisons.append((a, b))
+                else:
+                    # Neither is baseline — alphabetical: later vs earlier
+                    comparisons.append((b, a))
 
         mcnemar_results = {}
         for cond_a, cond_b in comparisons:
@@ -447,11 +484,14 @@ def _compute_per_task_fisher(self, results: list[TaskResult]) -> list[dict]:
                 grouped[r.task_id][cond] = []
             grouped[r.task_id][cond].append(r)
 
-        comparisons = [
-            ("none", "flat_llm"),
-            ("none", "intent_layer"),
-            ("flat_llm", "intent_layer"),
-        ]
+        # Generate all unique condition pairs dynamically
+        cond_values = sorted(set(
+            r.condition.value for r in results if not self._is_infra_error(r)
+        ))
+        comparisons = []
+        for i, a in enumerate(cond_values):
+            for b in cond_values[i + 1:]:
+                comparisons.append((a, b))
 
         per_task: list[dict] = []
         for task_id, conditions in grouped.items():
@@ -622,7 +662,12 @@ def write_markdown(self, results: EvalResults) -> str:
         """
         path = self.output_dir / f"{results.eval_id}.md"
         summary = results.summary
-        has_cis = "none_ci_90" in summary or "intent_layer_ci_90" in summary
+
+        # Discover conditions from the compiled results
+        cond_keys = self._discover_conditions(results.results)
+        baseline = Condition.NONE.value
+        treatments = [c for c in cond_keys if c != baseline]
+        has_cis = any(f"{c}_ci_90" in summary for c in cond_keys)
 
         lines = [
             f"# Eval Results: {results.eval_id}",
@@ -636,60 +681,52 @@ def write_markdown(self, results: EvalResults) -> str:
         ]
 
         # Per-condition summary with optional CIs
-        for label, display_name in [
-            ("none", "None"),
-            ("flat_llm", "Flat LLM"),
-            ("intent_layer", "Intent Layer"),
-        ]:
-            rate = summary[f"{label}_success_rate"]
+        for label in cond_keys:
+            display = self._display_name(label)
+            rate = summary.get(f"{label}_success_rate", 0)
             ci = summary.get(f"{label}_ci_90")
             if ci:
                 lines.append(
-                    f"- **{display_name} success rate:** {rate:.0%} "
+                    f"- **{display} success rate:** {rate:.0%} "
                     f"90% CI {self._format_ci(ci)}"
                 )
             else:
-                lines.append(f"- **{display_name} success rate:** {rate:.0%}")
+                lines.append(f"- **{display} success rate:** {rate:.0%}")
 
         # Per-condition median token usage
-        none_tok = summary.get("none_median_tokens", 0)
-        if none_tok:
+        baseline_tok = summary.get(f"{baseline}_median_tokens", 0)
+        if baseline_tok:
             lines.append("")
             lines.append("**Median tokens (input+output, fix phase only):**")
-            for label, display_name in [
-                ("none", "None"),
-                ("flat_llm", "Flat LLM"),
-                ("intent_layer", "Intent Layer"),
-            ]:
+            for label in cond_keys:
+                display = self._display_name(label)
                 tok = summary.get(f"{label}_median_tokens", 0)
                 tok_fmt = f"{tok / 1000:.0f}k" if tok else "N/A"
-                if label == "none" or not none_tok:
-                    lines.append(f"- **{display_name}:** {tok_fmt}")
+                if label == baseline or not baseline_tok:
+                    lines.append(f"- **{display}:** {tok_fmt}")
                 else:
-                    pct_diff = (tok - none_tok) / none_tok * 100
-                    lines.append(f"- **{display_name}:** {tok_fmt} ({pct_diff:+.0f}% vs none)")
+                    pct_diff = (tok - baseline_tok) / baseline_tok * 100
+                    lines.append(f"- **{display}:** {tok_fmt} ({pct_diff:+.0f}% vs {baseline})")
 
         # Significance flags
         if has_cis:
             lines.append("")
             mcnemar_data = summary.get("mcnemar", {})
-            for treatment, display_name in [
-                ("flat_llm", "Flat LLM"),
-                ("intent_layer", "Intent Layer"),
-            ]:
-                sig_key = f"{treatment}_vs_none_significant"
-                mcnemar_entry = mcnemar_data.get(f"{treatment}_vs_none")
+            for treatment in treatments:
+                display = self._display_name(treatment)
+                sig_key = f"{treatment}_vs_{baseline}_significant"
+                mcnemar_entry = mcnemar_data.get(f"{treatment}_vs_{baseline}")
                 if sig_key in summary and mcnemar_entry:
                     p = mcnemar_entry["p_value"]
                     n_disc = mcnemar_entry["n_discordant"]
                     if summary[sig_key]:
-                        lines.append(f"- **{display_name} vs None:** significant (McNemar p={p:.3f}, {n_disc} discordant pairs)")
+                        lines.append(f"- **{display} vs {self._display_name(baseline)}:** significant (McNemar p={p:.3f}, {n_disc} discordant pairs)")
                     else:
-                        lines.append(f"- **{display_name} vs None:** not significant (McNemar p={p:.3f}, {n_disc} discordant pairs)")
+                        lines.append(f"- **{display} vs {self._display_name(baseline)}:** not significant (McNemar p={p:.3f}, {n_disc} discordant pairs)")
 
             # CI width as variance proxy
             widths = []
-            for label in ("none", "flat_llm", "intent_layer"):
+            for label in cond_keys:
                 ci = summary.get(f"{label}_ci_90")
                 if ci:
                     widths.append((label, ci["upper"] - ci["lower"]))
@@ -705,50 +742,21 @@ def write_markdown(self, results: EvalResults) -> str:
             "",
         ]
 
-        # Table header — add IL vs none column when multi-run CIs exist
-        if has_cis:
-            lines.append(
-                "| Task | Condition | Success | Time (s) | Tokens | Tool Calls | Lines "
-                "| \u0394 Time | \u0394 Tokens | IL vs none |"
-            )
-            lines.append(
-                "|------|-----------|---------|----------|--------|------------|-------"
-                "|--------|----------|------------|"
-            )
-        else:
-            lines.append(
-                "| Task | Condition | Success | Time (s) | Tokens | Tool Calls | Lines "
-                "| \u0394 Time | \u0394 Tokens |"
-            )
-            lines.append(
-                "|------|-----------|---------|----------|--------|------------|-------"
-                "|--------|----------|"
-            )
+        # Table header
+        lines.append(
+            "| Task | Condition | Success | Time (s) | Tokens | Tool Calls | Lines "
+            "| \u0394 Time | \u0394 Tokens |"
+        )
+        lines.append(
+            "|------|-----------|---------|----------|--------|------------|-------"
+            "|--------|----------|"
+        )
 
         for r in results.results:
             task_id = r["task_id"]
             deltas = r.get("deltas", {})
 
-            # Per-task CI comparison for IL vs none (visual heuristic only —
-            # aggregate significance comes from McNemar in the summary)
-            none_data = r.get("none")
-            il_data = r.get("intent_layer")
-            il_vs_none = ""
-            if has_cis and none_data and il_data:
-                none_ci = none_data.get("ci_90")
-                il_ci = il_data.get("ci_90")
-                if none_ci and il_ci:
-                    none_rate = none_data.get("success_rate", 0)
-                    il_rate = il_data.get("success_rate", 0)
-                    diff = il_rate - none_rate
-                    overlaps = ci_overlap(
-                        (none_ci["lower"], none_ci["upper"]),
-                        (il_ci["lower"], il_ci["upper"]),
-                    )
-                    ci_label = "CIs overlap" if overlaps else "CIs disjoint"
-                    il_vs_none = f"{diff:+.0%} ({ci_label})"
-
-            for cond_key in ("none", "flat_llm", "intent_layer"):
+            for cond_key in cond_keys:
                 cond_data = r.get(cond_key)
                 if cond_data is None:
                     continue
@@ -776,8 +784,8 @@ def write_markdown(self, results: EvalResults) -> str:
 
                 tokens_fmt = f"{tokens / 1000:.1f}k"
 
-                # Deltas: none is baseline, shows "—"
-                if cond_key == "none":
+                # Deltas: baseline shows "—"
+                if cond_key == baseline:
                     d_time = "\u2014"
                     d_tokens = "\u2014"
                 else:
@@ -788,21 +796,13 @@ def write_markdown(self, results: EvalResults) -> str:
                 row = (
                     f"| {task_id} | {cond_key} | {success} | {time_s:.1f} | "
                     f"{tokens_fmt} | {tool_calls} | {lines_changed} | "
-                    f"{d_time} | {d_tokens}"
+                    f"{d_time} | {d_tokens} |"
                 )
 
-                if has_cis:
-                    # Show IL vs none comparison on the intent_layer row
-                    comparison = il_vs_none if cond_key == "intent_layer" else ""
-                    row += f" | {comparison} |"
-                else:
-                    row += " |"
-
                 lines.append(row)
 
             # Blank row between tasks
-            blank = "|  |  |  |  |  |  |  |  |  |" + ("  |" if has_cis else "")
-            lines.append(blank)
+            lines.append("|  |  |  |  |  |  |  |  |  |")
 
         # Remove trailing blank row
         if lines and lines[-1].strip().replace("|", "").replace(" ", "") == "":
diff --git a/eval-harness/lib/task_runner.py b/eval-harness/lib/task_runner.py
index 10f8d7e..e386a60 100644
--- a/eval-harness/lib/task_runner.py
+++ b/eval-harness/lib/task_runner.py
@@ -111,6 +111,7 @@ class Condition(Enum):
     NONE = "none"
     FLAT_LLM = "flat_llm"
     INTENT_LAYER = "intent_layer"
+    HUMAN = "human"
 
 
 @dataclass
diff --git a/eval-harness/tests/test_agentbench.py b/eval-harness/tests/test_agentbench.py
new file mode 100644
index 0000000..dcf55d1
--- /dev/null
+++ b/eval-harness/tests/test_agentbench.py
@@ -0,0 +1,390 @@
+# tests/test_agentbench.py
+"""Tests for AGENTbench loader and runner."""
+import json
+import tempfile
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from lib.agentbench_loader import AgentbenchInstance, load_instances
+from lib.agentbench_runner import (
+    strip_docs,
+    write_test_infrastructure,
+    _parse_test_results,
+    evaluate_instance,
+    build_prompt,
+)
+from lib.task_runner import Condition
+
+
+# ── Fixtures ──────────────────────────────────────────────────────────
+
+
+def _make_instance(**overrides) -> AgentbenchInstance:
+    """Return an AgentbenchInstance; keyword overrides replace the default fields."""
+    fields = {
+        "instance_id": "ansible_ansible-83217",
+        "repo": "ansible_ansible",
+        "base_repo": "ansible/ansible",
+        "base_sha": "abc123def456",
+        "docker_image": "ghcr.io/eth-sri/agent-bench:ansible_ansible",
+        "problem_description": "Fix the bug in module_utils.",
+        "setup_commands": ["pip install -e ."],
+        "test_files": (("tests/test_foo.py", "def test_foo(): pass"),),
+        "test_commands": ["python run_pr_tests.py"],
+        "test_file_runner": "#!/usr/bin/env python\nprint('run pr tests')",
+        "repo_test_runner": "#!/usr/bin/env python\nprint('run repo tests')",
+        "repo_test_commands": ["python run_tests.py"],
+        "repo_test_after_pr_patch": {"test_foo::test_bar": True},
+        "clean_pr_patch": None,
+    }
+    fields.update(overrides)
+    return AgentbenchInstance(**fields)
+
+
+def _make_hf_row(**overrides) -> dict:
+    """Return a HuggingFace-style row dict for load_instances(); overrides win."""
+    row = {
+        "instance_id": "ansible_ansible-83217",
+        "repo": "ansible_ansible",
+        "base_repo": "ansible/ansible",
+        "base_sha": "abc123",
+        "docker_image": "ghcr.io/eth-sri/agent-bench:ansible_ansible",
+        "problem_description": "Fix it.",
+        "setup_commands": ["pip install -e ."],
+        "test_file_names": ["tests/test_foo.py"],
+        "test_file_contents": ["def test_foo(): pass"],
+        "test_commands": ["python run_pr_tests.py"],
+        "test_file_runner": "runner code",
+        "repo_test_runner": "repo runner code",
+        "repo_test_commands": ["python run_tests.py"],
+        "repo_test_after_pr_patch": '{"test_a": true}',
+        "clean_pr_patch": None,
+    }
+    row.update(overrides)
+    return row
+
+
+# ── Loader tests ──────────────────────────────────────────────────────
+
+
+class TestLoadInstances:
+    """load_instances() exercised against a mocked datasets.load_dataset."""
+
+    def _mock_dataset(self, rows: list[dict]):
+        """Return a patcher whose load_dataset stand-in yields the given rows."""
+        fake_ds = MagicMock()
+        fake_ds.__len__ = lambda _ds: len(rows)
+        fake_ds.__iter__ = lambda _ds: iter(rows)
+        return patch("datasets.load_dataset", return_value=fake_ds)
+
+    def test_basic_load(self):
+        """A default row becomes one instance carrying the row's field values."""
+        with self._mock_dataset([_make_hf_row()]):
+            loaded = load_instances()
+        assert len(loaded) == 1
+        only = loaded[0]
+        assert only.instance_id == "ansible_ansible-83217"
+        assert only.repo == "ansible_ansible"
+        assert only.base_repo == "ansible/ansible"
+        assert only.test_files == (("tests/test_foo.py", "def test_foo(): pass"),)
+        assert only.repo_test_after_pr_patch == {"test_a": True}
+
+    def test_filter_by_repo(self):
+        """filter_repo keeps only the rows of the named repo."""
+        rows = [
+            _make_hf_row(instance_id=iid, repo=repo)
+            for iid, repo in (("a-1", "repo_a"), ("b-1", "repo_b"), ("a-2", "repo_a"))
+        ]
+        with self._mock_dataset(rows):
+            kept = load_instances(filter_repo="repo_a")
+        assert len(kept) == 2
+        assert all(inst.repo == "repo_a" for inst in kept)
+
+    def test_filter_by_ids(self):
+        """filter_ids keeps only the listed instance ids."""
+        rows = [_make_hf_row(instance_id=iid) for iid in ("a-1", "a-2", "a-3")]
+        with self._mock_dataset(rows):
+            kept = load_instances(filter_ids=["a-1", "a-3"])
+        kept_ids = [inst.instance_id for inst in kept]
+        assert kept_ids == ["a-1", "a-3"]
+
+    def test_mismatched_test_files_skipped(self):
+        """A row with two file names but only one content yields no instance."""
+        names = ["a.py", "b.py"]
+        contents = ["content_a"]
+        bad_row = _make_hf_row(test_file_names=names, test_file_contents=contents)
+        with self._mock_dataset([bad_row]):
+            loaded = load_instances()
+        assert len(loaded) == 0
+
+    def test_bad_json_in_repo_test_after_pr_patch(self):
+        """Unparseable JSON in repo_test_after_pr_patch loads as an empty dict."""
+        bad_row = _make_hf_row(repo_test_after_pr_patch="not valid json{")
+        with self._mock_dataset([bad_row]):
+            loaded = load_instances()
+        assert len(loaded) == 1
+        assert loaded[0].repo_test_after_pr_patch == {}
+
+    def test_dict_repo_test_after_pr_patch(self):
+        """A repo_test_after_pr_patch given as a dict (not a JSON string) is kept."""
+        expected = {"test_x": False}
+        dict_row = _make_hf_row(repo_test_after_pr_patch={"test_x": False})
+        with self._mock_dataset([dict_row]):
+            loaded = load_instances()
+        assert loaded[0].repo_test_after_pr_patch == expected
+
+    def test_multiple_test_files_zipped(self):
+        """File names and contents are paired positionally into test_files."""
+        names = ["tests/a.py", "tests/b.py"]
+        contents = ["content_a", "content_b"]
+        two_file_row = _make_hf_row(test_file_names=names, test_file_contents=contents)
+        with self._mock_dataset([two_file_row]):
+            loaded = load_instances()
+        expected = (("tests/a.py", "content_a"), ("tests/b.py", "content_b"))
+        assert loaded[0].test_files == expected
+
+    def test_frozen_instance(self):
+        """Assigning to a field of a built instance raises AttributeError."""
+        inst = _make_instance()
+        with pytest.raises(AttributeError):
+            inst.instance_id = "new_id"  # type: ignore[misc]
+
+
+# ── Runner: strip_docs ────────────────────────────────────────────────
+
+
+class TestStripDocs:
+    """strip_docs() return counts and on-disk effects in a temp workspace."""
+
+    def test_strips_md_files(self):
+        """Markdown files at the root and in src/ are removed; main.py stays."""
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            (root / "src").mkdir()
+            (root / "src" / "main.py").write_text("code")
+            (root / "src" / "notes.md").write_text("notes")
+            (root / "README.md").write_text("readme")
+            (root / "CONTRIBUTING.md").write_text("contrib")
+            removed = strip_docs(root)
+            assert removed == 3  # README.md, CONTRIBUTING.md, src/notes.md
+            assert not (root / "README.md").exists()
+            assert not (root / "src" / "notes.md").exists()
+            assert (root / "src" / "main.py").exists()
+
+    def test_strips_special_dirs(self):
+        """Five special directories give a count of five and are all deleted."""
+        special = (".github", "docs", ".claude", ".cursor", ".codex")
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            for dirname in special:
+                (root / dirname).mkdir()
+                (root / dirname / "file.txt").write_text("content")
+            assert strip_docs(root) == 5
+            for dirname in special:
+                assert not (root / dirname).exists()
+
+    def test_no_docs_returns_zero(self):
+        """A workspace holding only src.py reports zero removals."""
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            (root / "src.py").write_text("code")
+            assert strip_docs(root) == 0
+
+
+# ── Runner: write_test_infrastructure ──────────────────────────────────
+
+
+class TestWriteTestInfrastructure:
+    """write_test_infrastructure() file layout and path validation."""
+
+    def test_writes_test_files_and_runners(self):
+        """Test files land at their relative paths and both runner scripts are written."""
+        import stat
+        files = (
+            ("tests/test_foo.py", "test content foo"),
+            ("tests/sub/test_bar.py", "test content bar"),
+        )
+        instance = _make_instance(
+            test_files=files,
+            test_file_runner="pr runner script",
+            repo_test_runner="repo runner script",
+        )
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            write_test_infrastructure(root, instance)
+            assert (root / "tests" / "test_foo.py").read_text() == "test content foo"
+            assert (root / "tests" / "sub" / "test_bar.py").read_text() == "test content bar"
+            assert (root / "run_pr_tests.py").read_text() == "pr runner script"
+            assert (root / "run_tests.py").read_text() == "repo runner script"
+            # The PR runner must carry the owner-execute bit.
+            assert (root / "run_pr_tests.py").stat().st_mode & stat.S_IXUSR
+
+    def test_path_traversal_blocked(self):
+        """A '../' path in test_files raises ValueError matching 'Path traversal'."""
+        escaping = _make_instance(
+            test_files=(("../../../etc/evil.py", "malicious content"),),
+        )
+        with tempfile.TemporaryDirectory() as tmp:
+            with pytest.raises(ValueError, match="Path traversal"):
+                write_test_infrastructure(Path(tmp), escaping)
+
+    def test_absolute_path_blocked(self):
+        """An absolute path in test_files raises ValueError matching 'Path traversal'."""
+        absolute = _make_instance(
+            test_files=(("/tmp/evil.py", "malicious content"),),
+        )
+        with tempfile.TemporaryDirectory() as tmp:
+            with pytest.raises(ValueError, match="Path traversal"):
+                write_test_infrastructure(Path(tmp), absolute)
+
+
+# ── Runner: _parse_test_results ────────────────────────────────────────
+
+
+class TestParseTestResults:
+
+    def test_valid_json(self):
+        """A JSON object on disk comes back as the equivalent dict."""
+        payload = {"test_a": True, "test_b": False}
+        with tempfile.TemporaryDirectory() as tmp:
+            (Path(tmp) / "results.json").write_text(json.dumps(payload))
+            assert _parse_test_results(Path(tmp), "results.json") == payload
+
+    def test_missing_file(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            assert _parse_test_results(Path(tmp), "results.json") is None
+
+    def test_corrupt_json(self):
+        """Text that is not valid JSON gives None."""
+        with tempfile.TemporaryDirectory() as tmp:
+            (Path(tmp) / "results.json").write_text("not json{")
+            assert _parse_test_results(Path(tmp), "results.json") is None
+
+    def test_non_dict_json(self):
+        """Valid JSON whose top level is a list gives None."""
+        with tempfile.TemporaryDirectory() as tmp:
+            (Path(tmp) / "results.json").write_text("[1, 2, 3]")
+            assert _parse_test_results(Path(tmp), "results.json") is None
+
+
+# ── Runner: evaluate_instance ──────────────────────────────────────────
+
+
+class TestEvaluateInstance:
+
+    def _setup_results(self, ws: Path, pr_results: dict | None, repo_results: dict | None):
+        """Dump each non-None results dict as JSON into its file under ws."""
+        outputs = (("pr_test_results.json", pr_results), ("test_results.json", repo_results))
+        for filename, payload in outputs:
+            if payload is not None:
+                (ws / filename).write_text(json.dumps(payload))
+
+    @patch("lib.agentbench_runner.run_in_docker")
+    def test_all_pass(self, mock_docker):
+        inst = _make_instance(
+            repo_test_after_pr_patch={"test_a": True, "test_b": True},
+        )
+        mock_docker.return_value = MagicMock(exit_code=0)
+
+        with tempfile.TemporaryDirectory() as d:
+            ws = Path(d)
+            # Pre-write both result files (evaluator reads after Docker returns)
+            self._setup_results(ws, {"t1": True, "t2": True}, {"test_a": True, "test_b": True})
+
+            success, output = evaluate_instance(ws, inst, "test-image")
+            assert success is True
+            assert "INSTANCE: 2/2 passed" in output
+            assert "REGRESSION: 2/2 passed" in output
+
+    @patch("lib.agentbench_runner.run_in_docker")
+    def test_instance_test_fails(self, mock_docker):
+        inst = _make_instance()
+        mock_docker.return_value = MagicMock(exit_code=1)
+
+        with tempfile.TemporaryDirectory() as d:
+            ws = Path(d)
+            self._setup_results(ws, {"t1": True, "t2": False}, None)
+
+            success, output = evaluate_instance(ws, inst, "test-image")
+            assert success is False
+            assert "INSTANCE: 1/2 passed" in output
+
+    @patch("lib.agentbench_runner.run_in_docker")
+    def test_missing_pr_results(self, mock_docker):
+        inst = _make_instance()
+        mock_docker.return_value = MagicMock(exit_code=1)
+
+        with tempfile.TemporaryDirectory() as d:
+            ws = Path(d)
+            # No result files written
+            success, output = evaluate_instance(ws, inst, "test-image")
+            assert success is False
+            assert "missing or corrupt" in output
+
+    @patch("lib.agentbench_runner.run_in_docker")
+    def test_empty_pr_results_not_treated_as_all_pass(self, mock_docker):
+        """Empty dict should not be treated as 'all tests passed'."""
+        inst = _make_instance()
+        mock_docker.return_value = MagicMock(exit_code=0)
+
+        with tempfile.TemporaryDirectory() as d:
+            ws = Path(d)
+            # Empty results dict — all({}.values()) is True, but should not count as pass
+            self._setup_results(ws, {}, None)
+
+            success, output = evaluate_instance(ws, inst, "test-image")
+            assert success is False
+            assert "0/0" in output
+
+    @patch("lib.agentbench_runner.run_in_docker")
+    def test_regression_flipped_tests(self, mock_docker):
+        inst = _make_instance(
+            repo_test_after_pr_patch={"test_a": True, "test_b": True},
+        )
+        mock_docker.return_value = MagicMock(exit_code=0)
+
+        with tempfile.TemporaryDirectory() as d:
+            ws = Path(d)
+            # Instance tests pass, but regression test_b flipped from True -> False
+            self._setup_results(
+                ws,
+                {"t1": True},
+                {"test_a": True, "test_b": False},
+            )
+
+            success, output = evaluate_instance(ws, inst, "test-image")
+            assert success is False
+            assert "flipped" in output
+
+
+# ── Runner: build_prompt ───────────────────────────────────────────────
+
+
+class TestBuildPrompt:
+
+    def test_none_condition_no_preamble(self):
+        prompt = build_prompt("Fix the bug.", Condition.NONE)
+        assert "Fix the bug." in prompt
+        assert "AGENTS.md" not in prompt
+        assert "CLAUDE.md" not in prompt
+
+    def test_flat_llm_has_preamble(self):
+        prompt = build_prompt("Fix the bug.", Condition.FLAT_LLM)
+        assert "Fix the bug." in prompt
+        assert "CLAUDE.md" in prompt
+
+    def test_intent_layer_has_preamble(self):
+        prompt = build_prompt("Fix the bug.", Condition.INTENT_LAYER)
+        assert "Fix the bug." in prompt
+        assert "AGENTS.md" in prompt
+
+    def test_human_condition_has_preamble(self):
+        prompt = build_prompt("Fix the bug.", Condition.HUMAN)
+        assert "Fix the bug." in prompt
+        assert "AGENTS.md" in prompt
+
+    def test_prompt_includes_test_instruction(self):
+        prompt = build_prompt("Fix X.", Condition.NONE)
+        assert "Do not modify the test files" in prompt
diff --git a/eval-harness/tests/test_reporter.py b/eval-harness/tests/test_reporter.py
index 7fd91e7..47fdae4 100644
--- a/eval-harness/tests/test_reporter.py
+++ b/eval-harness/tests/test_reporter.py
@@ -195,11 +195,11 @@ def test_missing_condition():
 
     task = eval_results.results[0]
     assert task["none"] is not None
-    assert task["flat_llm"] is None
+    assert "flat_llm" not in task  # missing condition is absent, not None
     assert task["intent_layer"] is not None
 
-    # flat_llm delta should be empty (missing condition)
-    assert task["deltas"]["flat_llm"] == {}
+    # flat_llm delta should be absent (missing condition)
+    assert "flat_llm" not in task["deltas"]
     # intent_layer delta should exist
     assert task["deltas"]["intent_layer"]["time_percent"] == "-44.4%"
 
@@ -715,18 +715,18 @@ def test_markdown_multi_run_has_ci_columns(tmp_path):
     # CI brackets appear in success column
     assert "[" in content and "]" in content
 
-    # IL vs none column header present
-    assert "IL vs none" in content
+    # Comparison rows present in per-task Fisher table
+    assert "intent_layer vs none" in content
 
     # Significance labels appear
-    assert "overlap" in content or "sig." in content
+    assert "overlap" in content or "sig." in content or "*" in content
 
     # Summary has CI notation
     assert "90% CI" in content
 
 
 def test_markdown_single_run_no_ci_column(tmp_path, three_condition_results):
-    """Single-run markdown has no IL vs none column (backward-compatible)."""
+    """Single-run markdown has no per-task Fisher section (backward-compatible)."""
     reporter = Reporter(output_dir=str(tmp_path))
     eval_results = reporter.compile_results(three_condition_results)
     md_path = reporter.write_markdown(eval_results)
@@ -734,7 +734,7 @@ def test_markdown_single_run_no_ci_column(tmp_path, three_condition_results):
     with open(md_path) as f:
         content = f.read()
 
-    assert "IL vs none" not in content
+    assert "Per-Task Analysis" not in content
     assert "90% CI" not in content
 
 
@@ -945,10 +945,10 @@ def test_per_task_fisher_in_summary():
 
     # Find task-signal entry
     signal_task = next(t for t in fisher if t["task_id"] == "task-signal")
-    assert "none_vs_intent_layer" in signal_task["comparisons"]
-    comp = signal_task["comparisons"]["none_vs_intent_layer"]
-    assert comp["a_rate"] == 0.0
-    assert comp["b_rate"] == 1.0
+    assert "intent_layer_vs_none" in signal_task["comparisons"]
+    comp = signal_task["comparisons"]["intent_layer_vs_none"]
+    assert comp["a_rate"] == 1.0
+    assert comp["b_rate"] == 0.0
     assert comp["p_value"] <= 0.10  # borderline significant
 
     # Ceiling-effected task
@@ -1013,7 +1013,7 @@ def test_fisher_markdown_output(tmp_path):
 
     assert "## Per-Task Analysis (Fisher's Exact Test)" in content
     assert "task-signal" in content
-    assert "none vs intent_layer" in content
+    assert "intent_layer vs none" in content
 
     assert "## Recommendations" in content
     assert "ceiling-effected" in content
diff --git a/eval-harness/tests/test_resume.py b/eval-harness/tests/test_resume.py
index 963bbf1..ed344fe 100644
--- a/eval-harness/tests/test_resume.py
+++ b/eval-harness/tests/test_resume.py
@@ -614,10 +614,11 @@ def test_empty_results(self):
         summary = _recompute_summary([])
 
         assert summary["total_tasks"] == 0
-        assert summary["none_success_rate"] == 0
-        assert summary["flat_llm_success_rate"] == 0
-        assert summary["intent_layer_success_rate"] == 0
         assert summary["infrastructure_errors"] == 0
+        # No conditions present → no success rate keys
+        assert "none_success_rate" not in summary
+        assert "flat_llm_success_rate" not in summary
+        assert "intent_layer_success_rate" not in summary
 
 
 # --- _load_prior_results with genuine failures ---
diff --git a/eval-harness/tests/test_task_runner.py b/eval-harness/tests/test_task_runner.py
index 3a1c3c9..ab7d592 100644
--- a/eval-harness/tests/test_task_runner.py
+++ b/eval-harness/tests/test_task_runner.py
@@ -127,7 +127,8 @@ def test_condition_enum():
     assert Condition.NONE.value == "none"
     assert Condition.FLAT_LLM.value == "flat_llm"
     assert Condition.INTENT_LAYER.value == "intent_layer"
-    assert len(Condition) == 3
+    assert Condition.HUMAN.value == "human"
+    assert len(Condition) == 4
 
 
 def test_find_agents_files(sample_repo):

From 785c257557ab6cbd58ecb7b7539be09b041c2403 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 21:10:04 -0800
Subject: [PATCH 09/21] fix stale test references to renamed setup_workspace
 method

Entire-Checkpoint: 94115d70169e
---
 eval-harness/tests/test_task_runner.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/eval-harness/tests/test_task_runner.py b/eval-harness/tests/test_task_runner.py
index ab7d592..8cf1c6d 100644
--- a/eval-harness/tests/test_task_runner.py
+++ b/eval-harness/tests/test_task_runner.py
@@ -557,9 +557,9 @@ def test_workspace_name_includes_rep(sample_repo):
             prompt_source="commit_message"
         )
 
-        ws0 = runner._setup_workspace(task, Condition.NONE, rep=0)
-        ws1 = runner._setup_workspace(task, Condition.NONE, rep=1)
-        ws5 = runner._setup_workspace(task, Condition.NONE, rep=5)
+        ws0 = runner.setup_workspace(task, Condition.NONE, rep=0)
+        ws1 = runner.setup_workspace(task, Condition.NONE, rep=1)
+        ws5 = runner.setup_workspace(task, Condition.NONE, rep=5)
 
         assert ws0 != ws1
         assert ws1 != ws5
@@ -580,7 +580,7 @@ def test_workspace_default_rep_is_zero(sample_repo):
             prompt_source="commit_message"
         )
 
-        ws = runner._setup_workspace(task, Condition.NONE)
+        ws = runner.setup_workspace(task, Condition.NONE)
         assert "-r0" in ws
 
 

From b53c6148bcec3bc66ad87ef287646b584d452b81 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 21:20:30 -0800
Subject: [PATCH 10/21] add network parameter to run_in_docker, default to
 bridge

--network none breaks setup commands that need pip install.
Default to bridge (Docker's default) and expose the parameter
so callers can opt into none for pure-test phases later.

Entire-Checkpoint: 8118f4f3017a
---
 eval-harness/lib/docker_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index 40f3bb8..709faf6 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -28,6 +28,7 @@ def run_in_docker(
     stream_log: str | Path | None = None,
     heartbeat_interval: int = 20,
     heartbeat_callback: Callable[[float, int, int], None] | None = None,
+    network: str = "bridge",
 ) -> DockerResult:
     """Run a command in a Docker container with workspace mounted.
 
@@ -46,7 +47,7 @@ def run_in_docker(
         cmd.extend(["-v", f"{cache_volume}:/root/.cache"])
     cmd.extend([
         "-w", "/work",
-        "--network", "none",
+        "--network", network,
         "--memory", memory,
         "--cpus", cpus,
         image,

From 2d42eab8af785569916b8a450bd37d48f242e2c2 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Tue, 24 Feb 2026 21:38:13 -0800
Subject: [PATCH 11/21] fix agentbench loader split and docker shell
 compatibility

- agentbench_loader: use split="train" (dataset has no "test" split)
- docker_runner: use bash instead of sh (AGENTbench setup uses `source`)

Entire-Checkpoint: 0afc9c1d9169
---
 eval-harness/lib/agentbench_loader.py | 2 +-
 eval-harness/lib/docker_runner.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval-harness/lib/agentbench_loader.py b/eval-harness/lib/agentbench_loader.py
index 80f70da..a00efef 100644
--- a/eval-harness/lib/agentbench_loader.py
+++ b/eval-harness/lib/agentbench_loader.py
@@ -40,7 +40,7 @@ def load_instances(
     """
     from datasets import load_dataset
 
-    ds = load_dataset(DATASET_NAME, split="test")
+    ds = load_dataset(DATASET_NAME, split="train")
     logger.info("Loaded %d instances from %s", len(ds), DATASET_NAME)
 
     filter_ids_set = set(filter_ids) if filter_ids else None
diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index 709faf6..fb4c00d 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -51,7 +51,7 @@ def run_in_docker(
         "--memory", memory,
         "--cpus", cpus,
         image,
-        "sh", "-c", command
+        "bash", "-c", command
     ])
 
     # Fast path: keep existing behavior when no streaming/heartbeat is needed.

From 9ab86325234b2a378f3df3df40fe9c045e2a04da Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Wed, 25 Feb 2026 22:25:29 -0800
Subject: [PATCH 12/21] use login shell in docker to pick up ~/.local/bin PATH

AGENTbench images install tools like uv to /root/.local/bin, which is
only added to PATH via /etc/profile in login shells. Running commands
with `bash -lc` instead of `bash -c` fixes the exit 127 (command not
found) failures for repos using uv.

Entire-Checkpoint: 5d8c2af91c5d
---
 eval-harness/lib/docker_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index fb4c00d..002895d 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -51,7 +51,7 @@ def run_in_docker(
         "--memory", memory,
         "--cpus", cpus,
         image,
-        "bash", "-c", command
+        "bash", "-lc", command
     ])
 
     # Fast path: keep existing behavior when no streaming/heartbeat is needed.

From 8d3b6efe8787cd19a38338d21a727ea1a302efb3 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Fri, 27 Feb 2026 12:29:25 -0800
Subject: [PATCH 13/21] fix regression evaluation to match paper's delta-based
 logic, add remote docker support

Three fixes from run 4 analysis:

- Regression eval: only fail when a golden-passing test now fails
  (previously required a 100% pass rate, which is impossible when repos
  have 14-83 pre-existing failures in the baseline)
- strip_docs: preserve README.md variants (setup.py reads them)
- docker_runner: add EVAL_DOCKER_HOST support for remote x86 execution
  via rsync, avoiding QEMU emulation on Apple Silicon

Also removes .index-cache-preserve/ (context files now generated
dynamically per run).

Entire-Checkpoint: 9f4d4a6248d7
---
 .gitignore                                    |   1 +
 .../ansible-flat_llm/AGENTS.md                | 111 ---------
 .../ansible-flat_llm/CLAUDE.md                |  86 -------
 .../ansible-intent_layer/CLAUDE.md            |  71 ------
 .../lib/ansible/_internal/AGENTS.md           |  32 ---
 .../lib/ansible/cli/AGENTS.md                 |  38 ---
 .../lib/ansible/config/AGENTS.md              |  32 ---
 .../lib/ansible/executor/AGENTS.md            |  26 --
 .../lib/ansible/galaxy/AGENTS.md              |  32 ---
 .../lib/ansible/module_utils/AGENTS.md        |  41 ---
 .../lib/ansible/modules/AGENTS.md             |  31 ---
 .../lib/ansible/parsing/AGENTS.md             |  22 --
 .../lib/ansible/playbook/AGENTS.md            |  23 --
 .../lib/ansible/plugins/AGENTS.md             |  31 ---
 .../lib/ansible/utils/AGENTS.md               |  30 ---
 .../ansible-intent_layer/test/AGENTS.md       |  35 ---
 .../.index-cache-preserve/cache-manifest.json | 112 ---------
 .../fastmcp-flat_llm/AGENTS.md                |  81 ------
 .../fastmcp-flat_llm/CLAUDE.md                |  81 ------
 .../fastmcp-intent_layer/AGENTS.md            | 133 ----------
 .../fastmcp-intent_layer/CLAUDE.md            | 133 ----------
 .../src/fastmcp/client/AGENTS.md              |  79 ------
 .../src/fastmcp/server/AGENTS.md              | 132 ----------
 .../src/fastmcp/tools/AGENTS.md               |  74 ------
 .../fastmcp-intent_layer/tests/AGENTS.md      |  71 ------
 .../graphiti-flat_llm/AGENTS.md               | 161 ------------
 .../graphiti-flat_llm/CLAUDE.md               | 120 ---------
 .../graphiti-intent_layer/CLAUDE.md           |  64 -----
 .../graphiti_core/AGENTS.md                   |  66 -----
 .../graphiti_core/driver/AGENTS.md            |  32 ---
 .../graphiti_core/llm_client/AGENTS.md        |  24 --
 .../graphiti_core/namespaces/AGENTS.md        |  18 --
 .../graphiti_core/search/AGENTS.md            |  31 ---
 .../graphiti_core/utils/AGENTS.md             |  48 ----
 .../mcp_server/AGENTS.md                      |  53 ----
 .../graphiti-intent_layer/server/AGENTS.md    |  23 --
 .../graphiti-intent_layer/tests/AGENTS.md     |  54 ----
 .../pdm-flat_llm/AGENTS.md                    | 124 ----------
 .../pdm-flat_llm/CLAUDE.md                    | 124 ----------
 .../pdm-intent_layer/CLAUDE.md                | 152 ------------
 .../pdm-intent_layer/src/pdm/cli/AGENTS.md    |  81 ------
 .../src/pdm/formats/AGENTS.md                 |  84 -------
 .../src/pdm/installers/AGENTS.md              |  81 ------
 .../pdm-intent_layer/src/pdm/models/AGENTS.md |  98 --------
 .../src/pdm/project/AGENTS.md                 |  87 -------
 .../src/pdm/resolver/AGENTS.md                |  78 ------
 eval-harness/lib/agentbench_runner.py         |  43 ++--
 eval-harness/lib/docker_runner.py             | 234 ++++++++++++++++--
 eval-harness/tests/test_agentbench.py         |   7 +-
 49 files changed, 241 insertions(+), 3184 deletions(-)
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-flat_llm/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-flat_llm/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/_internal/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/cli/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/config/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/executor/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/galaxy/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/module_utils/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/modules/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/parsing/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/playbook/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/plugins/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/utils/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/ansible-intent_layer/test/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/cache-manifest.json
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-flat_llm/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-flat_llm/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-intent_layer/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-intent_layer/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/client/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/server/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/tools/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/fastmcp-intent_layer/tests/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-flat_llm/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-flat_llm/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/driver/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/llm_client/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/namespaces/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/search/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/utils/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/mcp_server/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/server/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/graphiti-intent_layer/tests/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-flat_llm/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-flat_llm/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/CLAUDE.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/cli/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/formats/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/installers/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/models/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/project/AGENTS.md
 delete mode 100644 eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/resolver/AGENTS.md

diff --git a/.gitignore b/.gitignore
index 3d03bdd..18dcd9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ CODEX_REVIEW.md
 .serena/
 # Nightshift plan artifacts (keep out of version control)
 .nightshift-plan
+.dmux/
diff --git a/eval-harness/.index-cache-preserve/ansible-flat_llm/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-flat_llm/AGENTS.md
deleted file mode 100644
index fb588c1..0000000
--- a/eval-harness/.index-cache-preserve/ansible-flat_llm/AGENTS.md
+++ /dev/null
@@ -1,111 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project overview
-
-ansible-core (v2.21.0.dev0) — the engine behind Ansible. Python >= 3.12 on the controller; modules support down to Python 3.9 on targets. Licensed GPL-3.0-or-later, except `lib/ansible/module_utils/` which is BSD-2-Clause.
-
-## Development setup
-
-```bash
-pip install -e .                    # editable install
-# OR run from checkout without installing:
-source ./hacking/env-setup
-pip install -r requirements.txt
-```
-
-Requires a POSIX OS. On Windows, use WSL.
-
-## Testing
-
-All testing goes through `ansible-test`, not pytest directly.
-
-```bash
-# Sanity (lint, static analysis: pylint, mypy, pep8, validate-modules, etc.)
-ansible-test sanity -v --docker default
-ansible-test sanity -v --docker default --test pep8
-ansible-test sanity -v --docker default lib/ansible/modules/command.py  # single file
-ansible-test sanity --list-tests
-
-# Unit tests
-ansible-test units -v --docker default
-ansible-test units -v --docker default test/units/modules/test_command.py  # single test
-
-# Integration tests (must use distro container, NOT default/base)
-ansible-test integration -v --docker ubuntu2404
-ansible-test integration -v --docker ubuntu2404 setup_remote_tmp_dir  # single target
-
-# Without Docker/Podman
-ansible-test sanity -v --venv
-```
-
-Container selection matters: `--docker default` for sanity/units only. Integration tests need distro-specific containers (`ubuntu2204`, `ubuntu2404`, etc.). Available containers listed in `test/lib/ansible_test/_data/completion/docker.txt`.
-
-## Architecture
-
-### Module isolation boundary
-
-The most important architectural rule: **modules execute on remote targets**, so `lib/ansible/modules/` can only import from `lib/ansible/module_utils/`. And `module_utils` cannot import from outside itself. This boundary is enforced by the AnsiballZ packaging system that bundles modules for remote execution.
-
-### Code layout
-
-- `lib/ansible/cli/` — CLI entry points (ansible, ansible-playbook, ansible-galaxy, etc.)
-- `lib/ansible/executor/` — core execution engine: runs plays, tasks, manages workers
-- `lib/ansible/playbook/` — data structures for plays, blocks, tasks, roles
-- `lib/ansible/plugins/` — plugin framework (action, connection, callback, filter, lookup, strategy, become, cache, inventory)
-- `lib/ansible/modules/` — built-in modules (apt, copy, file, git, user, etc.)
-- `lib/ansible/module_utils/` — shared utilities shipped to targets with modules
-- `lib/ansible/inventory/` — host/group inventory management
-- `lib/ansible/parsing/` — data loading, YAML, vault encryption
-- `lib/ansible/config/` — configuration system; `base.yml` defines all settings
-- `lib/ansible/_internal/` — private API (templating engine, datatag system, JSON profiles, SSH agent, AnsiballZ builder). Not for external use.
-- `lib/ansible/galaxy/` — ansible-galaxy client and dependency resolution
-
-### Plugin system
-
-Plugins live in `lib/ansible/plugins/<type>/`. Each type has its own base class in `__init__.py`. Action plugins run on the controller and typically wrap a module (e.g., `action/copy.py` handles local file transfer then invokes the `copy` module on the target).
-
-### Test layout
-
-- `test/units/` — mirrors `lib/ansible/` structure. Pytest-style, prefer functional tests over heavy mocking.
-- `test/integration/targets/` — each target is a directory with tasks, runme.sh, or both. Named after the feature being tested.
-- `test/sanity/` — ignore files and code-smell scripts for sanity tests.
-
-## Code conventions
-
-- Line limit: **160 characters** (not 80)
-- E402 is ignored — in `lib/ansible/modules/`, imports come after the DOCUMENTATION/EXAMPLES/RETURN string blocks
-- Use `from __future__ import annotations` for native type hints
-- Modules require static DOCUMENTATION, EXAMPLES, and RETURN blocks as YAML strings (parsed via AST, cannot be dynamic)
-- Modules must have a `main()` function and `if __name__ == '__main__':` guard
-- Prefer stdlib over external dependencies
-
-### Deprecation cycle
-
-4 releases: deprecate in current, warn for 2 more, remove in the 4th. Use version from `lib/ansible/release.py` plus 3 (e.g., deprecating in 2.19 means removal in 2.22). Use `Display.deprecated` or `AnsibleModule.deprecate`.
-
-## Changelog fragments
-
-Every PR needs a fragment in `changelogs/fragments/`. Valid sections (from `changelogs/config.yaml`):
-
-`major_changes`, `minor_changes`, `breaking_changes`, `deprecated_features`, `removed_features`, `security_fixes`, `bugfixes`, `known_issues`
-
-Naming: `{issue_number}-{short-description}.yml` or `{component}-{description}.yml`. Never reuse existing fragment files. Format: YAML with section key mapping to a list of strings.
-
-## PR and branch policy
-
-- All PRs target `devel`
-- New plugins belong in collections, not ansible-core
-- Backwards compatibility is the top priority
-- Bug fixes backported to latest stable only; critical fixes to latest + previous stable
-- Security issues go to security@ansible.com, not GitHub
-
-## CI
-
-Azure Pipelines. Key jobs: Sanity (2 groups), Units (Python 3.9-3.14), Integration (various distros), Windows (2016/2019/2022/2025). Check CI failures with:
-
-```bash
-gh pr view <number> --comments     # ansibot posts failure details
-gh pr checks <number>              # Azure Pipelines URLs
-```
diff --git a/eval-harness/.index-cache-preserve/ansible-flat_llm/CLAUDE.md b/eval-harness/.index-cache-preserve/ansible-flat_llm/CLAUDE.md
deleted file mode 100644
index 15c7fd5..0000000
--- a/eval-harness/.index-cache-preserve/ansible-flat_llm/CLAUDE.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project overview
-
-ansible-core (v2.21.0.dev0) — the engine behind Ansible. Python >= 3.12 on the controller; modules support down to Python 3.9 on targets. Licensed GPL-3.0-or-later, except `lib/ansible/module_utils/` which is BSD-2-Clause.
-
-## Testing
-
-Run only the specific test file relevant to the bug, not the full test suite. For example:
-
-```bash
-pytest test/units/modules/test_specific.py -x --tb=short
-pytest test/units/modules/test_specific.py::TestClassName::test_method
-```
-
-Unit tests live in `test/units/`, mirroring the `lib/ansible/` structure.
-
-## Architecture
-
-### Module isolation boundary
-
-The most important architectural rule: **modules execute on remote targets**, so `lib/ansible/modules/` can only import from `lib/ansible/module_utils/`. And `module_utils` cannot import from outside itself. This boundary is enforced by the AnsiballZ packaging system that bundles modules for remote execution.
-
-### Code layout
-
-- `lib/ansible/cli/` — CLI entry points (ansible, ansible-playbook, ansible-galaxy, etc.)
-- `lib/ansible/executor/` — core execution engine: runs plays, tasks, manages workers
-- `lib/ansible/playbook/` — data structures for plays, blocks, tasks, roles
-- `lib/ansible/plugins/` — plugin framework (action, connection, callback, filter, lookup, strategy, become, cache, inventory)
-- `lib/ansible/modules/` — built-in modules (apt, copy, file, git, user, etc.)
-- `lib/ansible/module_utils/` — shared utilities shipped to targets with modules
-- `lib/ansible/inventory/` — host/group inventory management
-- `lib/ansible/parsing/` — data loading, YAML, vault encryption
-- `lib/ansible/config/` — configuration system; `base.yml` defines all settings
-- `lib/ansible/_internal/` — private API (templating engine, datatag system, JSON profiles, SSH agent, AnsiballZ builder). Not for external use.
-- `lib/ansible/galaxy/` — ansible-galaxy client and dependency resolution
-
-### Plugin system
-
-Plugins live in `lib/ansible/plugins/<type>/`. Each type has its own base class in `__init__.py`. Action plugins run on the controller and typically wrap a module (e.g., `action/copy.py` handles local file transfer then invokes the `copy` module on the target).
-
-### Test layout
-
-- `test/units/` — mirrors `lib/ansible/` structure. Pytest-style, prefer functional tests over heavy mocking.
-- `test/integration/targets/` — each target is a directory with tasks, runme.sh, or both. Named after the feature being tested.
-- `test/sanity/` — ignore files and code-smell scripts for sanity tests.
-
-## Code conventions
-
-- Line limit: **160 characters** (not 80)
-- E402 is ignored — in `lib/ansible/modules/`, imports come after the DOCUMENTATION/EXAMPLES/RETURN string blocks
-- Use `from __future__ import annotations` for native type hints
-- Modules require static DOCUMENTATION, EXAMPLES, and RETURN blocks as YAML strings (parsed via AST, cannot be dynamic)
-- Modules must have a `main()` function and `if __name__ == '__main__':` guard
-- Prefer stdlib over external dependencies
-
-### Deprecation cycle
-
-4 releases: deprecate in current, warn for 2 more, remove in the 4th. Use version from `lib/ansible/release.py` plus 3 (e.g., deprecating in 2.19 means removal in 2.22). Use `Display.deprecated` or `AnsibleModule.deprecate`.
-
-## Changelog fragments
-
-Every PR needs a fragment in `changelogs/fragments/`. Valid sections (from `changelogs/config.yaml`):
-
-`major_changes`, `minor_changes`, `breaking_changes`, `deprecated_features`, `removed_features`, `security_fixes`, `bugfixes`, `known_issues`
-
-Naming: `{issue_number}-{short-description}.yml` or `{component}-{description}.yml`. Never reuse existing fragment files. Format: YAML with section key mapping to a list of strings.
-
-## PR and branch policy
-
-- All PRs target `devel`
-- New plugins belong in collections, not ansible-core
-- Backwards compatibility is the top priority
-- Bug fixes backported to latest stable only; critical fixes to latest + previous stable
-- Security issues go to security@ansible.com, not GitHub
-
-## CI
-
-Azure Pipelines. Key jobs: Sanity (2 groups), Units (Python 3.9-3.14), Integration (various distros), Windows (2016/2019/2022/2025). Check CI failures with:
-
-```bash
-gh pr view <number> --comments     # ansibot posts failure details
-gh pr checks <number>              # Azure Pipelines URLs
-```
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/CLAUDE.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/CLAUDE.md
deleted file mode 100644
index 412cb9a..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/CLAUDE.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Overview
-
-ansible-core (v2.21.0.dev0) — the engine behind Ansible. Python >= 3.12 on the controller; modules support down to Python 3.9 on targets. GPL-3.0-or-later, except `lib/ansible/module_utils/` which is BSD-2-Clause.
-
-## Testing
-
-Run only the specific test file relevant to the bug, not the full test suite. For example:
-
-```bash
-pytest test/units/modules/test_specific.py -x --tb=short
-pytest test/units/modules/test_specific.py::TestClassName::test_method
-```
-
-Unit tests live in `test/units/`, mirroring the `lib/ansible/` structure.
-
-## Architecture
-
-### Module isolation boundary (CRITICAL)
-
-Modules execute on remote targets. `lib/ansible/modules/` can ONLY import from `lib/ansible/module_utils/`. And `module_utils` cannot import from outside itself. Enforced by AnsiballZ packaging.
-
-### Code layout
-
-- `lib/ansible/cli/` — CLI entry points
-- `lib/ansible/executor/` — execution engine, runs plays/tasks, manages workers
-- `lib/ansible/playbook/` — data structures for plays, blocks, tasks, roles
-- `lib/ansible/plugins/` — plugin framework (action, connection, callback, filter, lookup, strategy, become, cache, inventory)
-- `lib/ansible/modules/` — built-in modules (apt, copy, file, git, user, etc.)
-- `lib/ansible/module_utils/` — shared utilities shipped to targets
-- `lib/ansible/parsing/` — YAML loading, vault encryption
-- `lib/ansible/config/` — configuration system; `base.yml` defines all settings
-- `lib/ansible/_internal/` — private API (templating, datatag, AnsiballZ)
-
-## Contracts
-
-- 160-char line limit (not 80)
-- E402 ignored in modules — imports come after DOCUMENTATION/EXAMPLES/RETURN blocks
-- `from __future__ import annotations` for type hints
-- Modules require static DOCUMENTATION, EXAMPLES, RETURN blocks as YAML strings
-- Modules must have `main()` and `if __name__ == '__main__':` guard
-- Deprecation cycle: 4 releases (deprecate, warn ×2, remove)
-- Every PR needs a changelog fragment in `changelogs/fragments/`
-- All PRs target `devel` branch
-
-## Pitfalls
-
-- Container selection: `--docker default` for sanity/units only, distro containers for integration
-- New plugins belong in collections, not ansible-core
-- `base.yml` defines all configuration — don't add settings anywhere else
-- Security issues go to security@ansible.com, not GitHub
-
-## Downlinks
-
-| Area | Node |
-|------|------|
-| Modules | `lib/ansible/modules/AGENTS.md` |
-| Module Utils | `lib/ansible/module_utils/AGENTS.md` |
-| Plugins | `lib/ansible/plugins/AGENTS.md` |
-| Executor | `lib/ansible/executor/AGENTS.md` |
-| Playbook | `lib/ansible/playbook/AGENTS.md` |
-| Parsing | `lib/ansible/parsing/AGENTS.md` |
-| CLI | `lib/ansible/cli/AGENTS.md` |
-| Config | `lib/ansible/config/AGENTS.md` |
-| Galaxy | `lib/ansible/galaxy/AGENTS.md` |
-| Utils | `lib/ansible/utils/AGENTS.md` |
-| Internals | `lib/ansible/_internal/AGENTS.md` |
-| Tests | `test/AGENTS.md` |
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/_internal/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/_internal/AGENTS.md
deleted file mode 100644
index 18df3a1..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/_internal/AGENTS.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# _internal
-
-Private controller-side internals. Everything here is unstable API, subject to change without notice.
-
-## Subsystems
-
-- `_templating/` — Jinja2 engine, lazy containers, marker handling, template variable resolution. The largest subsystem (~40k tokens). `_jinja_bits.py` contains filter/test registration; `_engine.py` is the core template evaluator.
-- `_ansiballz/` — module packaging for remote execution. `_builder.py` manages extensions (debugpy, pydevd, coverage). `_wrapper.py` is the remote-side unpacker.
-- `_datatag/` — type tagging system for tracking data provenance (trusted-as-template, origin tracking, encrypted string handling)
-- `_json/` — JSON serialization profiles. `_profiles/` contains legacy, cache-persistence, and inventory-specific serializers.
-- `_ssh/` — SSH agent management. `_ssh_agent.py` is a Python SSH agent client; `_agent_launch.py` handles agent process lifecycle.
-- `_errors/` — error factory, alarm/task timeouts, captured exceptions, handler utilities
-- `_encryption/` — crypt facade for password hashing
-- `_plugins/` — internal plugin caching
-
-## Entry points
-
-- `__init__.py` — injects controller-side serialization map and import hook into `module_utils._internal`. The `setup()` function triggers side-effect imports.
-- `_wrapt.py` — vendored wrapt (1.17.2) for decorator/proxy support
-
-## Contracts
-
-- This package augments `module_utils._internal` at import time by replacing stub functions with real implementations (e.g., `get_controller_serialize_map`, `import_controller_module`)
-- `is_controller = True` flag distinguishes controller vs target context
-- `@experimental` decorator marks types outside `_internal` that expose internal types
-
-## Pitfalls
-
-- Import order matters: `_internal.__init__` monkey-patches `module_utils._internal` on import. Disordered imports can break the controller detection mechanism (see DTFIX-FUTURE comment in `__init__.py`).
-- Marker handling in Jinja templates is fragile. Multiple bug fixes for edge cases: macro invocations, filter results returning Marker, None values in template nodes, tuple slicing. Always test template changes against `test/units/template/`.
-- AnsiballZ `sitecustomize` escaping: special characters in the wrapper need careful escaping (commit 6bb7bd7).
-- `EncryptedString` redaction has multiple code paths. Changes to serialization must account for both tagged and untagged contexts.
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/cli/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/cli/AGENTS.md
deleted file mode 100644
index 77388b0..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/cli/AGENTS.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# CLI
-
-Command-line entry points for all `ansible-*` commands.
-
-## Commands
-
-| File | Command | Purpose |
-|------|---------|---------|
-| `adhoc.py` | `ansible` | Run single tasks against hosts |
-| `playbook.py` | `ansible-playbook` | Execute playbooks |
-| `galaxy.py` | `ansible-galaxy` | Install/manage collections and roles (largest CLI, ~23k tokens) |
-| `doc.py` | `ansible-doc` | Browse module/plugin documentation |
-| `vault.py` | `ansible-vault` | Encrypt/decrypt files |
-| `config.py` | `ansible-config` | View/dump configuration |
-| `console.py` | `ansible-console` | Interactive REPL |
-| `inventory.py` | `ansible-inventory` | Inspect inventory |
-| `pull.py` | `ansible-pull` | Pull playbooks from VCS and run locally |
-
-## Entry points
-
-- `__init__.py` — shared base class `CLI`. Also handles `SSH_ASKPASS` interception (when invoked with specific env var, delegates to `_ssh_askpass.py` before any other imports).
-- `arguments/option_helpers.py` — shared argparse option definitions used across all CLI commands.
-- `scripts/` — shell wrapper entry points.
-
-## Contracts
-
-- Python >= 3.12 on controller (enforced in `__init__.py` via `_PY_MIN`)
-- UTF-8 locale required (checked at import time by `initialize_locale()`)
-- Blocking I/O required on stdin/stdout/stderr (checked by `check_blocking_io()`)
-- All CLI classes inherit from the base `CLI` class in `__init__.py`
-
-## Pitfalls
-
-- `ansible-doc` crashes when scanning collections whose path contains `ansible_collections` twice (commit c6d8d20).
-- `ansible-galaxy` must strip internal paths when using `AnsibleCollectionConfig.collection_paths` (commit 945516c).
-- `ansible-pull` has output inconsistencies with `--check` on changed status (commit 4bc4030).
-- `ansible-config` must serialize galaxy server config to proper JSON format, not Python repr (commit 2a4b1c8).
-- Askpass prompts are limited to a single attempt. The `SSH_ASKPASS` shm-based mechanism in `__init__.py` bypasses normal CLI initialization.
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/config/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/config/AGENTS.md
deleted file mode 100644
index 5e42134..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/config/AGENTS.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Config
-
-Configuration system for ansible-core. All settings are defined declaratively in YAML.
-
-## Files
-
-- `base.yml` — the single source of truth for all ansible configuration settings (~2260 lines). Each entry defines name, description, type, default, env vars, ini options, and version_added.
-- `ansible_builtin_runtime.yml` — maps built-in plugin routing (redirects, deprecations, tombstones for removed plugins)
-- `manager.py` — `ConfigManager` class that loads `base.yml`, resolves values from env/ini/cli/vars with precedence, and performs type coercion via `ensure_type()`
-- `__init__.py` — re-exports
-
-## How configuration resolution works
-
-1. Plugin or CLI code requests a config value by name
-2. `ConfigManager` checks sources in precedence order: variable → CLI arg → env var → ini file → default
-3. `ensure_type()` coerces the raw value to the declared type (str, bool, int, float, list, dict, path, pathlist, pathspec, tmppath)
-4. Vaulted values are decrypted transparently during coercion
-
-## Contracts
-
-- All new settings go in `base.yml`, nowhere else. The declarative schema is the API.
-- Galaxy server definitions use a separate schema (`GALAXY_SERVER_DEF` in `manager.py`) with fields: url, username, password, token, auth_url, validate_certs, client_id, client_secret, timeout.
-- Type coercion supports these types: `str`, `bool`, `boolean`, `int`, `integer`, `float`, `list`, `none`, `path`, `tmppath`, `pathspec`, `pathlist`, `dict`
-- `INTERNAL_DEFS` defines settings only available to internal callers (e.g., lookup `_terms`)
-
-## Pitfalls
-
-- `ensure_type` must handle vaulted (encrypted) values. Values matching `_EncryptedStringProtocol` get decrypted before coercion (commit 9a426fe).
-- `auto_silent*` interpreter discovery options were removed (commit 790b66f). Don't reference them.
-- Config lookup with `show_origin` had bugs around variable resolution (commit 1cb2932).
-- Galaxy server config dump must produce valid JSON, not Python dict repr (commit 2a4b1c8).
-- The `NativeEnvironment` from Jinja2 is used for config value interpolation, so Jinja expressions in config values are evaluated.
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/executor/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/executor/AGENTS.md
deleted file mode 100644
index 558d229..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/executor/AGENTS.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Executor
-
-Core execution engine that runs plays, tasks, and manages workers.
-
-## Key components
-
-- `task_executor.py` — runs individual tasks: loads action plugin, handles loops, retries, async
-- `play_iterator.py` — walks through play structure (pre_tasks, roles, tasks, post_tasks, handlers)
-- `task_queue_manager.py` — manages worker processes via multiprocessing
-- `module_common.py` — AnsiballZ: packages modules + module_utils for remote execution
-- `playbook_executor.py` — top-level: iterates over plays in a playbook
-
-## Execution flow
-
-1. PlaybookExecutor iterates plays
-2. StrategyPlugin (e.g., linear) uses PlayIterator to walk tasks
-3. TaskQueueManager dispatches to worker processes
-4. TaskExecutor loads the action plugin and runs it
-5. Action plugin packages module via AnsiballZ and sends to target
-
-## Contracts
-
-- Workers communicate via multiprocessing queues
-- Task results flow back through the queue to the strategy plugin
-- Handler execution is deferred until all tasks in a block complete
-- `rescue` and `always` blocks in block/rescue/always are handled by PlayIterator
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/galaxy/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/galaxy/AGENTS.md
deleted file mode 100644
index 59f1825..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/galaxy/AGENTS.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Galaxy
-
-Client-side logic for Ansible Galaxy: installing, building, and managing collections and roles.
-
-## Structure
-
-- `api.py` — Galaxy REST API client. Handles v2/v3 API versions, authentication (token, client credentials), pagination, and server fallback.
-- `collection/` — collection management (install, download, verify, build)
-  - `__init__.py` — main collection operations (~19k tokens, largest file). Install, download, verify signatures, build tarballs.
-  - `concrete_artifact_manager.py` — resolves collection artifacts from Galaxy servers, URLs, local paths, or git repos
-  - `galaxy_api_proxy.py` — wraps multiple Galaxy servers with fallback
-  - `gpg.py` — GPG signature verification for collections
-- `dependency_resolution/` — resolves collection version constraints using `resolvelib`
-  - `dataclasses.py` — `Candidate`, `Requirement` types for the resolver
-- `role.py` — legacy role management (install from Galaxy or git)
-- `token.py` — Galaxy API token handling (keyring, file, config)
-- `user_agent.py` — user-agent string construction
-
-## Contracts
-
-- Collections use `resolvelib` for dependency resolution. The resolver operates on `Candidate` and `Requirement` objects from `dependency_resolution/dataclasses.py`.
-- Galaxy API responses may omit the `results` key in cached responses. Always check for it (commit 192948434c).
-- Collection metadata lives in `galaxy.yml` / `MANIFEST.json`. Schema validation uses data from `data/collections_galaxy_meta.yml`.
-- `download_url` values may lack a scheme:host prefix (commit 390e112). URL handling must account for relative paths.
-
-## Pitfalls
-
-- Collection install can have metadata/filesystem location mismatch (commit 1e31c7c). The installed path may not match the namespace.name in metadata.
-- `client_secret` and `access_token` config fields produce errant warnings if both are present (commit 183c695).
-- When `ansible_collections` appears twice in a path, `ansible-doc` crashes during collection scanning (commit c6d8d20 in cli).
-- Internal collection paths must be stripped from `AnsibleCollectionConfig.collection_paths` in user-facing output (commit 945516c).
-- The v1 source info schema validation expects specific argument spec structure (commit 612d54f).
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/module_utils/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/module_utils/AGENTS.md
deleted file mode 100644
index ecf7e4c..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/module_utils/AGENTS.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# module_utils
-
-Shared utilities shipped to remote targets alongside modules. BSD-2-Clause licensed (more permissive than the rest of ansible-core).
-
-## Isolation boundary (CRITICAL)
-
-`module_utils` cannot import from anything outside itself. Modules can only import from `module_utils`. This is enforced by AnsiballZ packaging: only `module_utils` code is bundled and sent to the target machine.
-
-## Structure
-
-- `basic.py` — `AnsibleModule` base class (~22k tokens). Argument parsing, check mode, atomic file operations, `run_command()`, `exit_json()`/`fail_json()`. Target-side Python minimum is 3.9 (`_PY_MIN`).
-- `common/` — shared helpers: argument spec validation (`arg_spec.py`, `parameters.py`), text converters, file operations, YAML loading, process management, sentinel value
-- `facts/` — system fact gathering (hardware, network, virtual, system, OS distribution detection). `hardware/linux.py` is the largest (~9.5k tokens).
-- `distro/` — vendored `distro` library for OS detection (`_distro.py` ~12k tokens)
-- `urls.py` — HTTP client (~14k tokens). `fetch_url()`, `open_url()`, cookie and CA cert handling
-- `_internal/` — private internals shared between controller and target: datatag, JSON serialization, deprecation, AnsiballZ extensions
-- `csharp/` — C# module utilities for Windows (`Ansible.Basic.cs` ~20k tokens)
-- `powershell/` — PowerShell module utilities for Windows
-- `six/` — vendored `six` library (deprecated as of v2.21, commit 686c365)
-- `parsing/` — boolean conversion, URL splitting
-- `compat/` — compatibility shims (selinux)
-
-## Contracts
-
-- `AnsibleModule` arguments are declared via `argument_spec` dict. Validation is automatic.
-- Module results must be returned via `module.exit_json(**result)` (success) or `module.fail_json(msg=..., **result)` (failure). Never use `sys.exit()` or `print()`.
-- `run_command()` is the safe way to execute external commands. Handles encoding, PATH, and returns (rc, stdout, stderr).
-- `human_to_bytes()` converts size strings ("10M", "1G") to integer bytes. Accepts both SI and IEC units.
-- `get_bin_path()` locates executables. The `required` parameter was removed in v2.21 (commit 9f1177a); it now always raises on missing binaries.
-
-## Pitfalls
-
-- `basic.py` deprecated imports were removed in v2.21 (commit 2e8a859). Don't import `get_exception`, `BOOLEANS`, `BOOLEANS_TRUE`, `BOOLEANS_FALSE` from `basic`.
-- `ansible.module_utils.six` is deprecated (commit 686c365). Use stdlib equivalents.
-- `compat.datetime` APIs were removed (commit 367de44).
-- `human_to_bytes` had a parsing bug with certain unit formats (commit 13a7393). Test edge cases with mixed-case units.
-- Sensitive information remembered by `AnsibleModule` for later use was reverted due to issues (commits 19e9f3d, then revert fd76cc2). Don't cache user-provided secrets on the module object.
-- Windows async wrapper code was refactored (commit 101e2eb). PowerShell async modules have different serialization requirements.
-- ClearLinux distribution detection was broken by Gentoo-style parsing (commit 869088b). Distribution fact code must handle overlapping `/etc/*-release` formats.
-- `fetch_file()` gained `ca_path` and `cookies` parameters (commit 1cd4369). Older code passing positional args may break.
-- Module respawn: `PYTHONPATH` must be explicitly set in the `ENV` dict copy, not inherited from the LIB env var (commit 82e4b46).
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/modules/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/modules/AGENTS.md
deleted file mode 100644
index cfaf41a..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/modules/AGENTS.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Modules
-
-Built-in modules that execute on remote targets via AnsiballZ.
-
-## Isolation boundary
-
-Modules can ONLY import from `ansible.module_utils`. No other ansible imports allowed. This is the most important architectural rule — modules are packaged and shipped to remote machines.
-
-## Module structure
-
-Every module requires:
-1. `DOCUMENTATION` — YAML string (parsed via AST, must be static)
-2. `EXAMPLES` — YAML string with usage examples
-3. `RETURN` — YAML string describing return values
-4. `main()` function with module logic
-5. `if __name__ == '__main__': main()` guard
-
-## Contracts
-
-- E402 is ignored — imports come after the doc blocks
-- Use `AnsibleModule` from `ansible.module_utils.basic` for argument parsing
-- Return results via `module.exit_json()` or `module.fail_json()`
-- Check mode: implement `supports_check_mode=True` and test `module.check_mode`
-
-## Key modules
-
-- `command.py`, `shell.py` — run commands (command avoids shell, shell uses it)
-- `copy.py` — file transfer (action plugin handles local→remote, module handles permissions)
-- `file.py` — file/directory state management
-- `apt.py`, `yum.py`, `dnf.py` — package managers
-- `user.py`, `group.py` — user management
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/parsing/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/parsing/AGENTS.md
deleted file mode 100644
index 509101e..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/parsing/AGENTS.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Parsing
-
-Data loading, YAML processing, and vault encryption.
-
-## Key components
-
-- `dataloader.py` — main data loader, reads YAML/JSON files with vault decryption
-- `vault/` — Ansible Vault: AES256 encryption for secrets in playbooks
-- `yaml/` — custom YAML loader with Jinja2 support and line number tracking
-- `mod_args.py` — module argument parsing (free-form vs key=value vs dict)
-
-## Contracts
-
-- YAML loader preserves line numbers for error reporting
-- Vault-encrypted strings are transparently decrypted during loading
-- Jinja2 expressions in YAML values are preserved as-is (evaluated later by the templating engine)
-- `mod_args.py` handles the three module argument formats: `command: foo`, `command: key=val`, and dict form
-
-## Pitfalls
-
-- The custom YAML loader doesn't support all YAML spec features — designed for Ansible's subset
-- Vault operations require a vault password/identity — missing it causes silent failures in some codepaths
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/playbook/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/playbook/AGENTS.md
deleted file mode 100644
index 457834f..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/playbook/AGENTS.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Playbook
-
-Data structures representing Ansible playbook elements.
-
-## Key classes
-
-- `Playbook` — top-level container, holds list of `Play` objects
-- `Play` — a play: hosts, tasks, roles, vars, handlers
-- `Block` — groups tasks with rescue/always error handling
-- `Task` — a single task with module, args, conditionals, loops
-- `Role` — role abstraction with tasks, handlers, defaults, vars, files, templates
-- `RoleDefinition` — role metadata and dependency resolution
-
-## Data loading
-
-All classes use `load()` class methods that parse YAML dicts into validated objects. Field validation happens through the attribute descriptor system.
-
-## Contracts
-
-- Plays contain Blocks, Blocks contain Tasks (not Tasks directly in Plays)
-- Role dependencies are resolved recursively at load time
-- Variable precedence: extra vars > task vars > block vars > role vars > play vars > inventory vars
-- `when`, `loop`, `register`, `notify` are task-level attributes handled by the executor, not the playbook layer
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/plugins/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/plugins/AGENTS.md
deleted file mode 100644
index 17eadb2..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/plugins/AGENTS.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Plugins
-
-Controller-side plugin framework with multiple types.
-
-## Architecture
-
-Each plugin type lives in `plugins/<type>/` with its own base class in `__init__.py`.
-
-### Plugin types
-- **action** — run on controller, typically wrap a module (e.g., `action/copy.py` handles local file transfer then invokes `copy` module on target)
-- **connection** — transport (ssh, local, docker, etc.)
-- **callback** — event hooks for output/logging
-- **filter** — Jinja2 filter plugins for templates
-- **lookup** — data retrieval from external sources
-- **strategy** — execution strategies (linear, free, debug)
-- **become** — privilege escalation (sudo, su, etc.)
-- **cache** — fact caching backends
-- **inventory** — dynamic inventory sources
-
-## Contracts
-
-- Plugins must inherit from the type's base class
-- Plugin loading uses a finder/loader system, not direct imports
-- Action plugins are the bridge between controller and target — they prepare data, invoke the module, and process results
-- New plugins should go in collections, not ansible-core
-
-## Pitfalls
-
-- Action plugins with the same name as a module automatically wrap that module
-- Plugin base classes define required methods — must implement all abstract methods
-- `_execute_module()` in action plugins handles AnsiballZ packaging transparently
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/utils/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/utils/AGENTS.md
deleted file mode 100644
index ecbae95..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/lib/ansible/utils/AGENTS.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# Utils
-
-Controller-side utility functions. Unlike `module_utils`, these are NOT shipped to targets.
-
-## Key files
-
-- `display.py` — `Display` singleton (~10.8k tokens). All user-facing output goes through this: `display()`, `v()`-`vvvvvv()` (verbosity levels), `warning()`, `error()`, `deprecated()`. Uses ctypes `wcwidth`/`wcswidth` for terminal width. Thread-safe via locking. Supports worker-process queue forwarding.
-- `collection_loader/_collection_finder.py` — custom Python import system for Ansible collections (~13.7k tokens). Installs meta path finders so `import ansible_collections.ns.name` works across multiple filesystem roots. Used by both ansible-core and ansible-test.
-- `encrypt.py` — password hashing via passlib or stdlib crypt. `do_encrypt()` is the main entry point.
-- `vars.py` — `combine_vars()` for merging variable dicts with configurable merge behavior (replace vs recursive merge).
-- `plugin_docs.py` — plugin documentation extraction and formatting (~4k tokens).
-- `unsafe_proxy.py` — `AnsibleUnsafeText`/`AnsibleUnsafeBytes` wrappers that mark strings as untrusted for Jinja2 templating.
-- `path.py` — path utilities: `unfrackpath()` (normalize/resolve), `makedirs_safe()`, temp file cleanup.
-- `singleton.py` — `Singleton` metaclass used by `Display` and other single-instance classes.
-- `ssh_functions.py` — SSH key checking and host key management.
-
-## Contracts
-
-- `Display` is a singleton. Get it via `Display()` anywhere; all calls share state.
-- `collection_loader` must remain compatible with all Python versions supported on both controller and remote (used by ansible-test import sanity). No non-stdlib imports allowed in its code.
-- `combine_vars()` behavior depends on `DEFAULT_HASH_BEHAVIOUR` config: "replace" (default) or "merge" (recursive).
-- `unsafe_proxy` types must pass through Jinja2 without being auto-escaped but also without being treated as trusted template content.
-
-## Pitfalls
-
-- `getuser()` fallback error handling was broken (commit 4184d96). When `getpass.getuser()` fails, the fallback must not raise a secondary exception.
-- Post-fork deadlock: early Python writers (like pydevd debugger) can deadlock the logging system after `os.fork()` (commit 1d1bbe3). Display uses fork-safe locking.
-- `deprecated()` calls require a `help_text` argument as of recent versions (commit ea7ad90). Old-style calls without it will fail.
-- `PluginInfo` was switched to use `PluginType` enum (commit 43c0132). Code creating `PluginInfo` objects must use the enum, not raw strings.
-- `_collection_finder.py` comment warns: "DO NOT add new non-stdlib import deps here." This file is loaded by external tools (ansible-test import sanity) in restricted environments.
diff --git a/eval-harness/.index-cache-preserve/ansible-intent_layer/test/AGENTS.md b/eval-harness/.index-cache-preserve/ansible-intent_layer/test/AGENTS.md
deleted file mode 100644
index dc1cdd7..0000000
--- a/eval-harness/.index-cache-preserve/ansible-intent_layer/test/AGENTS.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Tests
-
-Test infrastructure using ansible-test (not pytest directly).
-
-## Structure
-
-- `test/units/` — mirrors `lib/ansible/` structure, pytest-style
-- `test/integration/targets/` — each target is a directory with tasks/runme.sh
-- `test/sanity/` — ignore files and code-smell scripts
-
-## Running tests
-
-```bash
-# Unit (single file)
-ansible-test units -v --docker default test/units/modules/test_command.py
-
-# Integration (needs distro container)
-ansible-test integration -v --docker ubuntu2404 setup_remote_tmp_dir
-
-# Sanity
-ansible-test sanity -v --docker default --test pep8
-```
-
-## Contracts
-
-- Unit tests: prefer functional tests over heavy mocking
-- Integration targets are named after the feature being tested
-- Container: `--docker default` for sanity/units, distro containers for integration
-- Available containers listed in `test/lib/ansible_test/_data/completion/docker.txt`
-
-## Pitfalls
-
-- `ansible-test` wraps pytest with its own discovery — don't run pytest directly
-- Integration tests may need specific OS features — wrong container = mysterious failures
-- Some sanity tests (validate-modules) parse module source via AST — dynamic DOCUMENTATION blocks break them
diff --git a/eval-harness/.index-cache-preserve/cache-manifest.json b/eval-harness/.index-cache-preserve/cache-manifest.json
deleted file mode 100644
index 0c54d75..0000000
--- a/eval-harness/.index-cache-preserve/cache-manifest.json
+++ /dev/null
@@ -1,112 +0,0 @@
-{
-  "entries": {
-    "pdm-flat_llm": {
-      "repo": "https://github.com/pdm-project/pdm",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/pdm-flat_llm",
-      "created_at": "2026-02-17T20:27:26Z",
-      "agents_files": [
-        "AGENTS.md",
-        "CLAUDE.md"
-      ]
-    },
-    "pdm-intent_layer": {
-      "repo": "https://github.com/pdm-project/pdm",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/pdm-intent_layer",
-      "created_at": "2026-02-17T20:38:01Z",
-      "agents_files": [
-        "CLAUDE.md",
-        "src/pdm/cli/AGENTS.md",
-        "src/pdm/formats/AGENTS.md",
-        "src/pdm/installers/AGENTS.md",
-        "src/pdm/models/AGENTS.md",
-        "src/pdm/project/AGENTS.md",
-        "src/pdm/resolver/AGENTS.md"
-      ]
-    },
-    "graphiti-flat_llm": {
-      "repo": "https://github.com/getzep/graphiti",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/graphiti-flat_llm",
-      "created_at": "2026-02-17T22:13:53Z",
-      "agents_files": [
-        "AGENTS.md",
-        "CLAUDE.md"
-      ]
-    },
-    "ansible-flat_llm": {
-      "repo": "https://github.com/ansible/ansible",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/ansible-flat_llm",
-      "created_at": "2026-02-17T22:18:47Z",
-      "agents_files": [
-        "AGENTS.md",
-        "CLAUDE.md"
-      ]
-    },
-    "graphiti-intent_layer": {
-      "repo": "https://github.com/getzep/graphiti",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/graphiti-intent_layer",
-      "created_at": "2026-02-17T22:25:00Z",
-      "agents_files": [
-        "CLAUDE.md",
-        "graphiti_core/AGENTS.md",
-        "graphiti_core/driver/AGENTS.md",
-        "graphiti_core/llm_client/AGENTS.md",
-        "graphiti_core/namespaces/AGENTS.md",
-        "graphiti_core/search/AGENTS.md",
-        "graphiti_core/utils/AGENTS.md",
-        "mcp_server/AGENTS.md",
-        "server/AGENTS.md",
-        "tests/AGENTS.md"
-      ]
-    },
-    "ansible-intent_layer": {
-      "repo": "https://github.com/ansible/ansible",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/ansible-intent_layer",
-      "created_at": "2026-02-17T22:25:00Z",
-      "agents_files": [
-        "CLAUDE.md",
-        "lib/ansible/_internal/AGENTS.md",
-        "lib/ansible/cli/AGENTS.md",
-        "lib/ansible/config/AGENTS.md",
-        "lib/ansible/executor/AGENTS.md",
-        "lib/ansible/galaxy/AGENTS.md",
-        "lib/ansible/module_utils/AGENTS.md",
-        "lib/ansible/modules/AGENTS.md",
-        "lib/ansible/parsing/AGENTS.md",
-        "lib/ansible/playbook/AGENTS.md",
-        "lib/ansible/plugins/AGENTS.md",
-        "lib/ansible/utils/AGENTS.md",
-        "test/AGENTS.md"
-      ]
-    },
-    "fastmcp-flat_llm": {
-      "repo": "https://github.com/jlowin/fastmcp",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/fastmcp-flat_llm",
-      "created_at": "2026-02-18T13:03:12Z",
-      "agents_files": [
-        "AGENTS.md",
-        "CLAUDE.md"
-      ]
-    },
-    "fastmcp-intent_layer": {
-      "repo": "https://github.com/jlowin/fastmcp",
-      "commit": "latest",
-      "workspace_path": "workspaces/.index-cache/fastmcp-intent_layer",
-      "created_at": "2026-02-18T13:10:59Z",
-      "agents_files": [
-        "AGENTS.md",
-        "CLAUDE.md",
-        "src/fastmcp/client/AGENTS.md",
-        "src/fastmcp/server/AGENTS.md",
-        "src/fastmcp/tools/AGENTS.md",
-        "tests/AGENTS.md"
-      ]
-    }
-  }
-}
\ No newline at end of file
diff --git a/eval-harness/.index-cache-preserve/fastmcp-flat_llm/AGENTS.md b/eval-harness/.index-cache-preserve/fastmcp-flat_llm/AGENTS.md
deleted file mode 100644
index ec6a741..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-flat_llm/AGENTS.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project Overview
-
-FastMCP is a Python framework (≥3.10) for building Model Context Protocol (MCP) servers and clients. It provides decorator-based registration of tools, resources, and prompts that LLMs can interact with.
-
-## Essential Commands
-
-```bash
-uv sync                              # Install/update dependencies (always run first)
-uv run pytest -n auto                # Run full test suite in parallel
-uv run pytest -xvs tests/path/to_test.py::test_name  # Run a single test
-uv run pytest -xvs tests/path/to_test.py  # Run a single test file
-uv run prek run --all-files          # Run all static checks (ruff + prettier + ty)
-```
-
-**All tests and static checks must pass before committing.** The justfile has shortcuts (`just test`, `just build`, `just typecheck`) but the commands above are the primary workflow.
-
-## Architecture
-
-### Four Core MCP Object Types
-
-Changes to one type almost always require parallel changes to the other three:
-
-- **Tools** (`src/fastmcp/tools/`) — callable functions exposed to LLMs
-- **Resources** (`src/fastmcp/resources/`) — data sources accessible by URI
-- **Resource Templates** (`src/fastmcp/resources/`) — parameterized resource URI patterns
-- **Prompts** (`src/fastmcp/prompts/`) — prompt templates
-
-### Provider System
-
-The `Provider` base class (`src/fastmcp/server/providers/base.py`) is the core abstraction for dynamically sourcing MCP components at runtime. Providers implement `_list_tools()`, `_get_tool()`, and equivalent methods for resources/prompts/templates. Key implementations: `LocalProvider` (decorator-registered components), `ProxyProvider` (proxies to another MCP server), `FileSystemProvider`, `OpenAPIProvider`, `AggregateProvider`.
-
-Static components (via decorators) always take precedence over providers. Providers are queried in registration order; first non-None result wins.
-
-### Transform Pipeline
-
-Transforms (`src/fastmcp/server/transforms/`) modify components as they flow through the system (namespacing, visibility filtering, tool parameter modification, version filtering, converting prompts/resources into tools). Each provider carries its own transform chain.
-
-### Middleware Pipeline
-
-Server middleware (`src/fastmcp/server/middleware/`) uses typed hooks (`on_call_tool`, `on_read_resource`, `on_list_tools`, `on_initialize`, etc.) following a Django-like dispatch pattern. Built-in middleware handles error handling, logging, rate limiting, caching, timing, authorization, and tool injection.
-
-### Server Mixins
-
-`FastMCP` (`src/fastmcp/server/server.py`) composes behavior from mixins in `src/fastmcp/server/mixins/`: `LifespanMixin`, `MCPOperationsMixin` (core protocol operations), and `TransportMixin` (stdio/HTTP/SSE transport handling).
-
-### Client Architecture
-
-`Client` (`src/fastmcp/client/client.py`) uses transport abstraction (stdio, HTTP, SSE, streamable HTTP, memory) and mixin pattern for protocol operations. Sessions support reentrant context managers with reference counting.
-
-### Public API
-
-The top-level `fastmcp` package exports only: `FastMCP`, `Client`, `Context`, `settings`. Be intentional about re-exports — specialized features should live in their submodules.
-
-## Testing Conventions
-
-- `asyncio_mode = "auto"` is set globally — **never add `@pytest.mark.asyncio`** decorators
-- Default test timeout is **5 seconds** — optimize or mark slow tests with `@pytest.mark.integration`
-- Tests use in-memory transport by default (no network I/O)
-- `tests/conftest.py` provides `fastmcp_server`, `tool_server`, `tagged_resources_server` fixtures, plus autouse fixtures for settings isolation and logger propagation
-- Inline snapshots are disabled by default; use `--inline-snapshot=create` or `--inline-snapshot=fix` when needed
-- Integration tests go in `tests/integration_tests/` and are auto-marked
-
-## Code Standards
-
-- Full type annotations required; type checker is **ty** (`uv run ty check`)
-- File sizes enforced by **loq** (default 1000 lines). Edit `loq.toml` to raise limits; run `loq baseline` to ratchet down
-- Linting/formatting via **Ruff** — extended rules (bugbear, comprehensions, simplify) apply only to `src/` code
-- Never use bare `except` — always specify exception types
-- `docs/python-sdk/` is auto-generated by a bot — do not manually edit
-
-## Git Rules
-
-- Never force-push on collaborative repos
-- Never amend commits to fix prek failures — make a new commit
-- Agents must self-identify (e.g., "🤖 Generated with Claude Code" in commits/PRs)
-- Keep commit messages brief — headlines, not paragraphs
-- PR bodies: 1-2 paragraphs with a code example, not bullet summaries
diff --git a/eval-harness/.index-cache-preserve/fastmcp-flat_llm/CLAUDE.md b/eval-harness/.index-cache-preserve/fastmcp-flat_llm/CLAUDE.md
deleted file mode 100644
index ec6a741..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-flat_llm/CLAUDE.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project Overview
-
-FastMCP is a Python framework (≥3.10) for building Model Context Protocol (MCP) servers and clients. It provides decorator-based registration of tools, resources, and prompts that LLMs can interact with.
-
-## Essential Commands
-
-```bash
-uv sync                              # Install/update dependencies (always run first)
-uv run pytest -n auto                # Run full test suite in parallel
-uv run pytest -xvs tests/path/to_test.py::test_name  # Run a single test
-uv run pytest -xvs tests/path/to_test.py  # Run a single test file
-uv run prek run --all-files          # Run all static checks (ruff + prettier + ty)
-```
-
-**All tests and static checks must pass before committing.** The justfile has shortcuts (`just test`, `just build`, `just typecheck`) but the commands above are the primary workflow.
-
-## Architecture
-
-### Four Core MCP Object Types
-
-Changes to one type almost always require parallel changes to the other three:
-
-- **Tools** (`src/fastmcp/tools/`) — callable functions exposed to LLMs
-- **Resources** (`src/fastmcp/resources/`) — data sources accessible by URI
-- **Resource Templates** (`src/fastmcp/resources/`) — parameterized resource URI patterns
-- **Prompts** (`src/fastmcp/prompts/`) — prompt templates
-
-### Provider System
-
-The `Provider` base class (`src/fastmcp/server/providers/base.py`) is the core abstraction for dynamically sourcing MCP components at runtime. Providers implement `_list_tools()`, `_get_tool()`, and equivalent methods for resources/prompts/templates. Key implementations: `LocalProvider` (decorator-registered components), `ProxyProvider` (proxies to another MCP server), `FileSystemProvider`, `OpenAPIProvider`, `AggregateProvider`.
-
-Static components (via decorators) always take precedence over providers. Providers are queried in registration order; first non-None result wins.
-
-### Transform Pipeline
-
-Transforms (`src/fastmcp/server/transforms/`) modify components as they flow through the system (namespacing, visibility filtering, tool parameter modification, version filtering, converting prompts/resources into tools). Each provider carries its own transform chain.
-
-### Middleware Pipeline
-
-Server middleware (`src/fastmcp/server/middleware/`) uses typed hooks (`on_call_tool`, `on_read_resource`, `on_list_tools`, `on_initialize`, etc.) following a Django-like dispatch pattern. Built-in middleware handles error handling, logging, rate limiting, caching, timing, authorization, and tool injection.
-
-### Server Mixins
-
-`FastMCP` (`src/fastmcp/server/server.py`) composes behavior from mixins in `src/fastmcp/server/mixins/`: `LifespanMixin`, `MCPOperationsMixin` (core protocol operations), and `TransportMixin` (stdio/HTTP/SSE transport handling).
-
-### Client Architecture
-
-`Client` (`src/fastmcp/client/client.py`) uses transport abstraction (stdio, HTTP, SSE, streamable HTTP, memory) and mixin pattern for protocol operations. Sessions support reentrant context managers with reference counting.
-
-### Public API
-
-The top-level `fastmcp` package exports only: `FastMCP`, `Client`, `Context`, `settings`. Be intentional about re-exports — specialized features should live in their submodules.
-
-## Testing Conventions
-
-- `asyncio_mode = "auto"` is set globally — **never add `@pytest.mark.asyncio`** decorators
-- Default test timeout is **5 seconds** — optimize or mark slow tests with `@pytest.mark.integration`
-- Tests use in-memory transport by default (no network I/O)
-- `tests/conftest.py` provides `fastmcp_server`, `tool_server`, `tagged_resources_server` fixtures, plus autouse fixtures for settings isolation and logger propagation
-- Inline snapshots are disabled by default; use `--inline-snapshot=create` or `--inline-snapshot=fix` when needed
-- Integration tests go in `tests/integration_tests/` and are auto-marked
-
-## Code Standards
-
-- Full type annotations required; type checker is **ty** (`uv run ty check`)
-- File sizes enforced by **loq** (default 1000 lines). Edit `loq.toml` to raise limits; run `loq baseline` to ratchet down
-- Linting/formatting via **Ruff** — extended rules (bugbear, comprehensions, simplify) apply only to `src/` code
-- Never use bare `except` — always specify exception types
-- `docs/python-sdk/` is auto-generated by a bot — do not manually edit
-
-## Git Rules
-
-- Never force-push on collaborative repos
-- Never amend commits to fix prek failures — make a new commit
-- Agents must self-identify (e.g., "🤖 Generated with Claude Code" in commits/PRs)
-- Keep commit messages brief — headlines, not paragraphs
-- PR bodies: 1-2 paragraphs with a code example, not bullet summaries
diff --git a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/AGENTS.md b/eval-harness/.index-cache-preserve/fastmcp-intent_layer/AGENTS.md
deleted file mode 100644
index 5883d1a..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/AGENTS.md
+++ /dev/null
@@ -1,133 +0,0 @@
-# FastMCP
-
-## Purpose
-
-FastMCP is a Python framework (>=3.10) for building MCP (Model Context Protocol) servers and clients — the ergonomic layer over the low-level `mcp` SDK.
-
-## Code Map
-
-| Path | Purpose |
-|------|---------|
-| `src/fastmcp/server/server.py` | `FastMCP` class — main server, 2100 lines |
-| `src/fastmcp/server/providers/` | Provider chain — how components are sourced |
-| `src/fastmcp/server/middleware/` | Request middleware pipeline |
-| `src/fastmcp/server/auth/` | Authentication (15+ OAuth providers, proxy, OIDC) |
-| `src/fastmcp/server/transforms/` | Component transforms (Namespace, Visibility, etc.) |
-| `src/fastmcp/server/tasks/` | Background tasks (SEP-1686, pydocket/Redis) |
-| `src/fastmcp/server/http.py` | Starlette ASGI app, StreamableHTTP + SSE |
-| `src/fastmcp/server/context.py` | `Context` — injected into handlers via ContextVar |
-| `src/fastmcp/client/client.py` | `Client` class — connects to MCP servers |
-| `src/fastmcp/client/transports/` | Transport layer (stdio, HTTP, SSE, memory) |
-| `src/fastmcp/tools/` | Tool definitions, function parsing, schema gen |
-| `src/fastmcp/resources/` | Resources and URI templates |
-| `src/fastmcp/prompts/` | Prompt templates |
-| `src/fastmcp/cli/` | CLI commands (cyclopts-based) |
-| `src/fastmcp/utilities/` | Shared utilities, OpenAPI parser, JSON schema |
-| `tests/` | Pytest suite mirroring src structure |
-
-## Build / Test / Run
-
-```bash
-uv sync                              # Install dependencies
-uv run pytest -n auto                # Full test suite (default timeout: 5s)
-uv run prek run --all-files          # Static checks: Ruff + Prettier + ty
-```
-
-- `asyncio_mode = "auto"` and `asyncio_default_fixture_loop_scope = "function"` in pytest config
-- Integration tests live in `tests/integration_tests/` and are auto-marked; deselect with `-m "not integration"`
-- Windows: uses `WindowsSelectorEventLoopPolicy` to avoid ProactorEventLoop crashes
-- File sizes enforced by [loq](https://github.com/jlowin/loq) — edit `loq.toml` to raise limits
-- CLI entry point: `fastmcp` (maps to `fastmcp.cli:app` via cyclopts)
-
-## Entry Points
-
-| Task | Start here |
-|------|-----------|
-| Create an MCP server | `src/fastmcp/server/server.py` → `FastMCP` class |
-| Add a tool/resource/prompt | `@mcp.tool()`, `@mcp.resource()`, `@mcp.prompt()` decorators on `FastMCP` |
-| Connect as client | `src/fastmcp/client/client.py` → `Client` class |
-| Add auth to a server | `src/fastmcp/server/auth/` — pick a provider from `auth/providers/` |
-| Add middleware | Subclass `Middleware` in `src/fastmcp/server/middleware/middleware.py` |
-| Mount/compose servers | `FastMCP.mount()` — creates providers with Namespace transforms |
-| Run via CLI | `src/fastmcp/cli/cli.py` — cyclopts-based CLI |
-| Background tasks | `src/fastmcp/server/tasks/` — requires pydocket + Redis |
-| OpenAPI → MCP | `src/fastmcp/server/providers/openapi/` |
-
-## Architecture
-
-### Core Data Model
-Four MCP component types flow through the system uniformly:
-- **Tools** (`src/fastmcp/tools/`) — callable functions with JSON schema input/output
-- **Resources** (`src/fastmcp/resources/`) — data endpoints with URIs; includes Templates for parameterized URIs
-- **Prompts** (`src/fastmcp/prompts/`) — templated prompt messages
-
-Each has a base class (`Tool`, `Resource`, `Prompt`) and a `Function*` variant created from decorated Python functions via `__fastmcp__` metadata on the callable.
-
-### Provider Chain (how components are sourced)
-`FastMCP` inherits from `Provider` (via `MCPOperationsMixin`). The resolution order:
-1. **LocalProvider** — components registered via decorators (`@mcp.tool()`)
-2. **Additional providers** — passed via `providers=[]` at construction (queried in registration order)
-3. **AggregateProvider** — merges results from all providers
-
-Provider semantics: `get_*` returns `None` to signal "not found" (search continues to next provider); first non-None wins.
-
-### Transform System (component modification)
-Transforms modify components in provider chains. Two patterns:
-- **List ops**: pure function — receive sequence, return transformed sequence
-- **Get ops**: middleware pattern with `call_next` for chaining lookups
-
-Built-in transforms: `Namespace`, `Visibility`, `ToolTransform`, `PromptsAsTools`, `ResourcesAsTools`, `VersionFilter`.
-
-### Middleware Pipeline (request modification)
-Middleware operates on MCP requests/responses (not components). Defined in `server/middleware/middleware.py`. Built-in: error handling, logging, rate limiting, caching, timing, authorization, tool injection, ping, dereference, response limiting.
-
-### Transport Layer
-- Server: Starlette-based HTTP (`server/http.py`) with StreamableHTTP + SSE; also stdio
-- Client: pluggable transports (`client/transports/`) — stdio, HTTP, SSE, memory, config-based
-
-### Auth System
-Server auth: `server/auth/` — providers (GitHub, Google, Azure, Auth0, Discord, etc.), OAuth proxy, OIDC proxy, JWT issuer. Client auth: `client/auth/` — OAuth flow, bearer tokens.
-
-### Background Tasks (SEP-1686)
-Async tool execution via pydocket (Redis-backed). Only async functions can be tasks (`task=True`). Task state tracked via key-value store with pub/sub notifications.
-
-### Server Mixins
-`FastMCP` is composed of three mixins: `LifespanMixin` (lifecycle), `MCPOperationsMixin` (list/get/call components), `TransportMixin` (run/http_app).
-
-## Contracts
-
-- **Four-type symmetry**: features touching MCP components MUST be applied to Tools, Resources, Resource Templates, AND Prompts. Forgetting one type is the #1 source of incomplete features.
-- **Provider get_* returns None, not raises**: returning `None` means "not found, keep searching"; raising is an error. `list_*` errors are logged and return empty (graceful degradation).
-- **Transforms vs Middleware**: Transforms modify _components_ (observable, used for task registration/tag filtering). Middleware modifies _requests_ (not visible to introspection). Never conflate them.
-- **Components own execution**: providers source components; components execute themselves via `run()`/`read()`/`render()`. Providers should NOT execute.
-- **ContextVar for request context**: `_current_context` ContextVar holds the `Context` object. Must be properly set before tool/resource execution. Background tasks must snapshot the access token before dispatching (stale context bug).
-- **Session isolation**: visibility marks, proxy client state, and auth tokens are per-session. Leaking state across sessions has caused multiple bugs.
-- **MCP spec compliance**: error codes must match spec (e.g., -32002 for resource not found). `additionalProperties: false` must be preserved in compressed schemas. Root-level `$ref` in outputSchema must be resolved.
-- **Re-exports are intentional**: only `FastMCP`, `Client`, `Context`, `settings` at top-level. Module-specific types import from their submodule.
-
-## Pitfalls
-
-- **Stale request context in proxy handlers**: `StatefulProxyClient` handlers captured the request context at creation time; background operations saw stale context. Fixed by snapshotting access tokens (#3138, #3172).
-- **Session visibility leaking**: Visibility marks (enable/disable components) leaked across sessions because they were stored globally. Must be per-session via ContextVar (#3132).
-- **OAuth token refresh races**: `get_access_token()` could return stale tokens after refresh. Token storage TTL calculation had off-by-one. Multi-instance deployments need refresh token stored in shared backend (#2505, #2796, #2483).
-- **Confused deputy in OAuth proxy**: consent page lacked binding cookie, allowing CSRF. Fixed with consent binding cookie (#3201).
-- **CIMD redirect allowlist bypass**: redirect URI validation could be bypassed. Cache revalidation also needed fixing (#3098).
-- **compress_schema drops additionalProperties**: schema compression stripped `additionalProperties: false`, breaking MCP validation (#3102).
-- **include_tags/exclude_tags silently ignored**: tag filtering in MCPConfig was skipped when no tools matched initial filter (#3186).
-- **OpenAPI $defs mutation**: `Tool.from_tool` transforms mutated the shared `$defs` dict. Must deep-copy before transforming (#2493).
-- **Content-type header in get_http_headers()**: including it caused HTTP 415 errors on upstream calls (#3104).
-- **Client pagination infinite loop**: misbehaving servers returning the same cursor caused infinite loops. Guard with seen-cursor set (#3167).
-- **Nested mount routing (3+ levels)**: routing broke for servers mounted more than 2 levels deep (#2586).
-- **functools.wraps + Context**: wrapped functions lost the `Context` parameter detection. `create_function_without_params` must modify the signature (#2563).
-- **Windows-specific**: use `SelectorEventLoop` (not Proactor); skip `wait_closed()` to avoid socket hang; SQLite locking causes test timeouts (#2368, #2607).
-- **Single-element list unwrapping**: tool results containing a single-element list were incorrectly unwrapped to a scalar (#1074).
-- **Resource/Prompt refactor reverted twice**: meta support refactors for resources and prompts were each reverted then re-applied — check both `meta` and `_meta` usage carefully (#2598, #2600, #2608-2611).
-
-## Downlinks
-
-| Path | Scope |
-|------|-------|
-| [`src/fastmcp/server/AGENTS.md`](src/fastmcp/server/AGENTS.md) | Server core: providers, transforms, middleware, auth, tasks, HTTP transport |
-| [`src/fastmcp/client/AGENTS.md`](src/fastmcp/client/AGENTS.md) | Client: transports, auth, mixins, session management |
-| [`src/fastmcp/tools/AGENTS.md`](src/fastmcp/tools/AGENTS.md) | Tool definitions, function parsing, schema generation, tool transforms |
-| [`tests/AGENTS.md`](tests/AGENTS.md) | Test organization, fixtures, patterns |
diff --git a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/CLAUDE.md b/eval-harness/.index-cache-preserve/fastmcp-intent_layer/CLAUDE.md
deleted file mode 100644
index 5883d1a..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/CLAUDE.md
+++ /dev/null
@@ -1,133 +0,0 @@
-# FastMCP
-
-## Purpose
-
-FastMCP is a Python framework (>=3.10) for building MCP (Model Context Protocol) servers and clients — the ergonomic layer over the low-level `mcp` SDK.
-
-## Code Map
-
-| Path | Purpose |
-|------|---------|
-| `src/fastmcp/server/server.py` | `FastMCP` class — main server, 2100 lines |
-| `src/fastmcp/server/providers/` | Provider chain — how components are sourced |
-| `src/fastmcp/server/middleware/` | Request middleware pipeline |
-| `src/fastmcp/server/auth/` | Authentication (15+ OAuth providers, proxy, OIDC) |
-| `src/fastmcp/server/transforms/` | Component transforms (Namespace, Visibility, etc.) |
-| `src/fastmcp/server/tasks/` | Background tasks (SEP-1686, pydocket/Redis) |
-| `src/fastmcp/server/http.py` | Starlette ASGI app, StreamableHTTP + SSE |
-| `src/fastmcp/server/context.py` | `Context` — injected into handlers via ContextVar |
-| `src/fastmcp/client/client.py` | `Client` class — connects to MCP servers |
-| `src/fastmcp/client/transports/` | Transport layer (stdio, HTTP, SSE, memory) |
-| `src/fastmcp/tools/` | Tool definitions, function parsing, schema gen |
-| `src/fastmcp/resources/` | Resources and URI templates |
-| `src/fastmcp/prompts/` | Prompt templates |
-| `src/fastmcp/cli/` | CLI commands (cyclopts-based) |
-| `src/fastmcp/utilities/` | Shared utilities, OpenAPI parser, JSON schema |
-| `tests/` | Pytest suite mirroring src structure |
-
-## Build / Test / Run
-
-```bash
-uv sync                              # Install dependencies
-uv run pytest -n auto                # Full test suite (default timeout: 5s)
-uv run prek run --all-files          # Static checks: Ruff + Prettier + ty
-```
-
-- `asyncio_mode = "auto"` and `asyncio_default_fixture_loop_scope = "function"` in pytest config
-- Integration tests live in `tests/integration_tests/` and are auto-marked; deselect with `-m "not integration"`
-- Windows: uses `WindowsSelectorEventLoopPolicy` to avoid ProactorEventLoop crashes
-- File sizes enforced by [loq](https://github.com/jlowin/loq) — edit `loq.toml` to raise limits
-- CLI entry point: `fastmcp` (maps to `fastmcp.cli:app` via cyclopts)
-
-## Entry Points
-
-| Task | Start here |
-|------|-----------|
-| Create an MCP server | `src/fastmcp/server/server.py` → `FastMCP` class |
-| Add a tool/resource/prompt | `@mcp.tool()`, `@mcp.resource()`, `@mcp.prompt()` decorators on `FastMCP` |
-| Connect as client | `src/fastmcp/client/client.py` → `Client` class |
-| Add auth to a server | `src/fastmcp/server/auth/` — pick a provider from `auth/providers/` |
-| Add middleware | Subclass `Middleware` in `src/fastmcp/server/middleware/middleware.py` |
-| Mount/compose servers | `FastMCP.mount()` — creates providers with Namespace transforms |
-| Run via CLI | `src/fastmcp/cli/cli.py` — cyclopts-based CLI |
-| Background tasks | `src/fastmcp/server/tasks/` — requires pydocket + Redis |
-| OpenAPI → MCP | `src/fastmcp/server/providers/openapi/` |
-
-## Architecture
-
-### Core Data Model
-Four MCP component types flow through the system uniformly:
-- **Tools** (`src/fastmcp/tools/`) — callable functions with JSON schema input/output
-- **Resources** (`src/fastmcp/resources/`) — data endpoints with URIs; includes Templates for parameterized URIs
-- **Prompts** (`src/fastmcp/prompts/`) — templated prompt messages
-
-Each has a base class (`Tool`, `Resource`, `Prompt`) and a `Function*` variant created from decorated Python functions via `__fastmcp__` metadata on the callable.
-
-### Provider Chain (how components are sourced)
-`FastMCP` inherits from `Provider` (via `MCPOperationsMixin`). The resolution order:
-1. **LocalProvider** — components registered via decorators (`@mcp.tool()`)
-2. **Additional providers** — passed via `providers=[]` at construction (queried in registration order)
-3. **AggregateProvider** — merges results from all providers
-
-Provider semantics: `get_*` returns `None` to signal "not found" (search continues to next provider); first non-None wins.
-
-### Transform System (component modification)
-Transforms modify components in provider chains. Two patterns:
-- **List ops**: pure function — receive sequence, return transformed sequence
-- **Get ops**: middleware pattern with `call_next` for chaining lookups
-
-Built-in transforms: `Namespace`, `Visibility`, `ToolTransform`, `PromptsAsTools`, `ResourcesAsTools`, `VersionFilter`.
-
-### Middleware Pipeline (request modification)
-Middleware operates on MCP requests/responses (not components). Defined in `server/middleware/middleware.py`. Built-in: error handling, logging, rate limiting, caching, timing, authorization, tool injection, ping, dereference, response limiting.
-
-### Transport Layer
-- Server: Starlette-based HTTP (`server/http.py`) with StreamableHTTP + SSE; also stdio
-- Client: pluggable transports (`client/transports/`) — stdio, HTTP, SSE, memory, config-based
-
-### Auth System
-Server auth: `server/auth/` — providers (GitHub, Google, Azure, Auth0, Discord, etc.), OAuth proxy, OIDC proxy, JWT issuer. Client auth: `client/auth/` — OAuth flow, bearer tokens.
-
-### Background Tasks (SEP-1686)
-Async tool execution via pydocket (Redis-backed). Only async functions can be tasks (`task=True`). Task state tracked via key-value store with pub/sub notifications.
-
-### Server Mixins
-`FastMCP` is composed of three mixins: `LifespanMixin` (lifecycle), `MCPOperationsMixin` (list/get/call components), `TransportMixin` (run/http_app).
-
-## Contracts
-
-- **Four-type symmetry**: features touching MCP components MUST be applied to Tools, Resources, Resource Templates, AND Prompts. Forgetting one type is the #1 source of incomplete features.
-- **Provider get_* returns None, not raises**: returning `None` means "not found, keep searching"; raising is an error. `list_*` errors are logged and return empty (graceful degradation).
-- **Transforms vs Middleware**: Transforms modify _components_ (observable, used for task registration/tag filtering). Middleware modifies _requests_ (not visible to introspection). Never conflate them.
-- **Components own execution**: providers source components; components execute themselves via `run()`/`read()`/`render()`. Providers should NOT execute.
-- **ContextVar for request context**: `_current_context` ContextVar holds the `Context` object. Must be properly set before tool/resource execution. Background tasks must snapshot the access token before dispatching (stale context bug).
-- **Session isolation**: visibility marks, proxy client state, and auth tokens are per-session. Leaking state across sessions has caused multiple bugs.
-- **MCP spec compliance**: error codes must match spec (e.g., -32002 for resource not found). `additionalProperties: false` must be preserved in compressed schemas. Root-level `$ref` in outputSchema must be resolved.
-- **Re-exports are intentional**: only `FastMCP`, `Client`, `Context`, `settings` at top-level. Module-specific types import from their submodule.
-
-## Pitfalls
-
-- **Stale request context in proxy handlers**: `StatefulProxyClient` handlers captured the request context at creation time; background operations saw stale context. Fixed by snapshotting access tokens (#3138, #3172).
-- **Session visibility leaking**: Visibility marks (enable/disable components) leaked across sessions because they were stored globally. Must be per-session via ContextVar (#3132).
-- **OAuth token refresh races**: `get_access_token()` could return stale tokens after refresh. Token storage TTL calculation had off-by-one. Multi-instance deployments need refresh token stored in shared backend (#2505, #2796, #2483).
-- **Confused deputy in OAuth proxy**: consent page lacked binding cookie, allowing CSRF. Fixed with consent binding cookie (#3201).
-- **CIMD redirect allowlist bypass**: redirect URI validation could be bypassed. Cache revalidation also needed fixing (#3098).
-- **compress_schema drops additionalProperties**: schema compression stripped `additionalProperties: false`, breaking MCP validation (#3102).
-- **include_tags/exclude_tags silently ignored**: tag filtering in MCPConfig was skipped when no tools matched initial filter (#3186).
-- **OpenAPI $defs mutation**: `Tool.from_tool` transforms mutated the shared `$defs` dict. Must deep-copy before transforming (#2493).
-- **Content-type header in get_http_headers()**: including it caused HTTP 415 errors on upstream calls (#3104).
-- **Client pagination infinite loop**: misbehaving servers returning the same cursor caused infinite loops. Guard with seen-cursor set (#3167).
-- **Nested mount routing (3+ levels)**: routing broke for servers mounted more than 2 levels deep (#2586).
-- **functools.wraps + Context**: wrapped functions lost the `Context` parameter detection. `create_function_without_params` must modify the signature (#2563).
-- **Windows-specific**: use `SelectorEventLoop` (not Proactor); skip `wait_closed()` to avoid socket hang; SQLite locking causes test timeouts (#2368, #2607).
-- **Single-element list unwrapping**: tool results containing a single-element list were incorrectly unwrapped to a scalar (#1074).
-- **Resource/Prompt refactor reverted twice**: meta support refactors for resources and prompts were each reverted then re-applied — check both `meta` and `_meta` usage carefully (#2598, #2600, #2608-2611).
-
-## Downlinks
-
-| Path | Scope |
-|------|-------|
-| [`src/fastmcp/server/AGENTS.md`](src/fastmcp/server/AGENTS.md) | Server core: providers, transforms, middleware, auth, tasks, HTTP transport |
-| [`src/fastmcp/client/AGENTS.md`](src/fastmcp/client/AGENTS.md) | Client: transports, auth, mixins, session management |
-| [`src/fastmcp/tools/AGENTS.md`](src/fastmcp/tools/AGENTS.md) | Tool definitions, function parsing, schema generation, tool transforms |
-| [`tests/AGENTS.md`](tests/AGENTS.md) | Test organization, fixtures, patterns |
diff --git a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/client/AGENTS.md b/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/client/AGENTS.md
deleted file mode 100644
index 6eb3c21..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/client/AGENTS.md
+++ /dev/null
@@ -1,79 +0,0 @@
-# Client Module
-
-## Purpose
-
-The `Client` class (`client.py`) connects to MCP servers over pluggable transports. It supports tools, resources, prompts, sampling, elicitation, roots, progress reporting, and background tasks. The class is composed of four mixins: `ClientToolsMixin`, `ClientResourcesMixin`, `ClientPromptsMixin`, `ClientTaskManagementMixin`.
-
-## Code Map
-
-| File/Dir | Role |
-|----------|------|
-| `client.py` (~700 lines) | `Client` class — session lifecycle, transport management, handler wiring |
-| `mixins/tools.py` | `call_tool()`, `list_tools()` — tool operations |
-| `mixins/resources.py` | `read_resource()`, `list_resources()` — resource operations |
-| `mixins/prompts.py` | `get_prompt()`, `list_prompts()` — prompt operations |
-| `mixins/task_management.py` | Task polling, notification handling for background tasks |
-| `elicitation.py` | Client-side elicitation handling |
-| `logging.py` | Log message handling from server |
-| `messages.py` | Message handler protocol |
-| `progress.py` | Progress reporting handler |
-| `roots.py` | Roots handler (expose filesystem roots to server) |
-| `sampling/` | Sampling handlers — Anthropic and OpenAI |
-| `tasks.py` | `ToolTask`, `ResourceTask`, `PromptTask` — task wrappers |
-| `telemetry.py` | Client-side OpenTelemetry spans |
-| `oauth_callback.py` | Local OAuth callback server for client auth flows |
-
-### transports/
-
-| File | Role |
-|------|------|
-| `__init__.py` | `infer_transport()` — auto-detects transport from connection string |
-| `base.py` | `ClientTransport` protocol |
-| `stdio.py` | `PythonStdioTransport`, `NodeStdioTransport`, `StdioTransport` |
-| `http.py` | `StreamableHttpTransport` — primary HTTP transport |
-| `sse.py` | `SSETransport` — legacy SSE transport |
-| `memory.py` | `FastMCPTransport` — in-process transport (for testing) |
-| `config.py` | `MCPConfigTransport` — multi-server from config file |
-| `inference.py` | Transport inference logic |
-
-### auth/
-
-| File | Role |
-|------|------|
-| `oauth.py` | Full OAuth 2.1 client flow with PKCE |
-| `bearer.py` | Simple bearer token auth |
-
-## Entry Points
-
-| Task | Start here |
-|------|-----------|
-| Connect to a server | `client.py` → `Client` class |
-| Add client auth | `auth/oauth.py` or `auth/bearer.py` |
-| Custom transport | Subclass `transports/base.py` → `ClientTransport` |
-| Handle sampling | `sampling/` → implement `SamplingHandler` |
-| Background task polling | `mixins/task_management.py` |
-
-## Key Exports
-
-- `Client` — re-exported at `fastmcp.Client`
-- `infer_transport()` — used by CLI and tests
-- Transport classes — `StreamableHttpTransport`, `PythonStdioTransport`, etc.
-
-## Contracts
-
-- **Client is an async context manager**: enter via `async with Client(...) as client:` or `client.connect()`. Session is not available until entered.
-- **Transport inference**: `infer_transport(target)` parses strings (`http://...` → HTTP, `path/to/server.py` → stdio, etc.) and `FastMCP` instances → `FastMCPTransport`.
-- **stdio transport is single-use**: cannot reuse a stdio transport after disconnect. Calling `.connect()` again on a used stdio client logs a warning.
-- **OAuth async_auth_flow**: the flow must not hold the MCP SDK's `context.lock` while awaiting. Previous implementations caused deadlocks (#2644).
-- **Pagination cursor tracking**: `list_*` methods must track seen cursors to avoid infinite loops from misbehaving servers (#3167).
-
-## Pitfalls
-
-- **Client concurrency**: `Client` context management was refactored to avoid concurrency issues. Multiple concurrent `async with` blocks on the same client instance caused problems (#1054).
-- **Proxy client session isolation**: multiple proxy clients sharing a session mixed up responses. Fixed with session isolation (#1083, #1245).
-- **HTTP 4xx/5xx hanging**: client would hang on HTTP error responses instead of raising. Transport must propagate errors (#2803).
-- **OAuth token stale after refresh**: `get_access_token()` could return the old token even after a successful refresh. Storage must be updated atomically (#2505).
-- **OAuth metadata discovery**: client must preserve the full URL path for RFC 8414 metadata discovery, not just the base (#2577, #2533).
-- **Azure scope mismatch**: Azure provider requires `offline_access` scope for token refresh. Client validation errors occurred when scope didn't match (#2243, #3001).
-- **Timeout not propagating**: timeout setting didn't propagate to proxy clients in multi-server MCPConfig (#2809).
-- **FastMCPTransport (memory)**: the in-process transport used for testing creates a real server session. State is shared — mutations in tools are visible across calls.
diff --git a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/server/AGENTS.md b/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/server/AGENTS.md
deleted file mode 100644
index ef06679..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/server/AGENTS.md
+++ /dev/null
@@ -1,132 +0,0 @@
-# Server Module
-
-## Purpose
-
-The server module implements the MCP server side of FastMCP. `FastMCP` (in `server.py`) is the main user-facing class — it composes `LifespanMixin`, `MCPOperationsMixin`, and `TransportMixin` and inherits from `Provider`.
-
-## Code Map
-
-| File/Dir | Role |
-|----------|------|
-| `server.py` (~2100 lines) | `FastMCP` class: registration decorators, mount(), provider management |
-| `context.py` | `Context` object — injected into tool/resource/prompt functions via ContextVar |
-| `mixins/mcp_operations.py` | Implements list/get/call for all four component types |
-| `mixins/transport.py` | `run()`, `run_http()`, `http_app()` — server startup |
-| `mixins/lifespan.py` | Lifecycle management for startup/shutdown hooks |
-| `http.py` | Starlette ASGI app construction, StreamableHTTP + SSE endpoints |
-| `low_level.py` | `LowLevelServer` wrapping the `mcp` SDK's server |
-| `proxy.py` | Deprecated `FastMCPProxy` — use `ProxyProvider` instead |
-| `apps.py` | MCP Apps config (CSP, permissions, UI) |
-| `elicitation.py` | Server-side elicitation relay for background tasks |
-| `event_store.py` | SSE polling support with EventStore |
-
-### providers/
-
-| File | Role |
-|------|------|
-| `base.py` | `Provider` abstract base — override `_list_*` / `_get_*` methods |
-| `local_provider/` | `LocalProvider` — stores components registered via decorators |
-| `fastmcp_provider.py` | `FastMCPProvider` — wraps a `FastMCP` instance as a provider (used by mount) |
-| `aggregate.py` | `AggregateProvider` — merges multiple providers |
-| `proxy.py` | `ProxyProvider` — proxies to remote MCP server via client factory |
-| `openapi/` | `OpenAPIProvider` — converts OpenAPI specs to MCP tools |
-| `filesystem.py` | `FileSystemProvider` — discovers components from Python files on disk |
-| `skills/` | Skill providers (Claude skills, directory-based, vendor) |
-| `wrapped_provider.py` | Base for providers that wrap another provider with transforms |
-
-### middleware/
-
-Middleware operates on MCP **requests** (not components). Subclass `Middleware` and override hooks like `on_call_tool`, `on_list_tools`, `on_read_resource`, etc.
-
-| File | Role |
-|------|------|
-| `middleware.py` | `Middleware` base class, `MiddlewareContext`, pipeline construction |
-| `error_handling.py` | Catches exceptions, converts to MCP error responses |
-| `logging.py` | Structured logging of MCP operations |
-| `rate_limiting.py` | Per-session rate limiting |
-| `caching.py` | Response caching (cache key includes mounted server prefix) |
-| `authorization.py` | Auth check enforcement |
-| `tool_injection.py` | Injects list/read resource/prompt tools for client compatibility |
-
-### auth/
-
-| File | Role |
-|------|------|
-| `auth.py` | `AuthProvider` protocol, `AuthContext`, `run_auth_checks` |
-| `providers/` | 15+ OAuth/OIDC providers (GitHub, Google, Azure, Auth0, etc.) |
-| `oauth_proxy/` | Full OAuth proxy server — issues its own tokens, consent page |
-| `oidc_proxy.py` | OIDC proxy (lighter than full OAuth proxy) |
-| `jwt_issuer.py` | JWT token issuance for OAuth proxy |
-| `middleware.py` | `RequireAuthMiddleware` for Starlette |
-| `redirect_validation.py` | Validates redirect URIs |
-| `ssrf.py` | SSRF protection for OAuth callbacks |
-
-### transforms/
-
-Transforms modify **components** in provider chains. Unlike middleware, transforms are visible to task registration and introspection.
-
-| File | Role |
-|------|------|
-| `namespace.py` | `Namespace` — prefixes tool/resource names for mounted servers |
-| `visibility.py` | `Visibility` — enable/disable components per session |
-| `tool_transform.py` | `ToolTransform` — rename, redescribe, filter args on tools |
-| `prompts_as_tools.py` | Exposes prompts as callable tools |
-| `resources_as_tools.py` | Exposes resources as callable tools |
-| `version_filter.py` | Filters components by version |
-
-### tasks/
-
-Background task execution (SEP-1686). Requires `pydocket` (Redis-backed).
-
-| File | Role |
-|------|------|
-| `config.py` | `TaskConfig`, `TaskMeta`, `TaskMode` |
-| `handlers.py` | Task execution handlers |
-| `keys.py` | Task key construction/parsing |
-| `notifications.py` | Pub/sub notification delivery |
-| `elicitation.py` | Elicitation relay for background tasks |
-
-## Entry Points
-
-| Task | Start here |
-|------|-----------|
-| Create a server | `server.py` → `FastMCP` class |
-| Add a custom provider | Subclass `providers/base.py` → `Provider` |
-| Add middleware | Subclass `middleware/middleware.py` → `Middleware` |
-| Add auth | Pick a provider from `auth/providers/` |
-| Mount another server | `FastMCP.mount()` in `server.py` |
-| Add a transform | Subclass `transforms/__init__.py` → `Transform` |
-| Background tasks | `tasks/config.py` → `TaskConfig` |
-
-## Key Exports Used by Other Modules
-
-- `FastMCP` — main server class (re-exported at `fastmcp.FastMCP`)
-- `Context` — request context (re-exported at `fastmcp.Context`)
-- `Provider` — base class for custom providers
-- `Middleware`, `MiddlewareContext` — middleware system
-- `Transform`, `Namespace`, `Visibility`, `ToolTransform` — transform system
-- `AuthProvider`, `AuthContext` — auth system
-
-## Contracts
-
-- **Provider resolution order**: LocalProvider first, then additional providers in registration order. Static components (decorators) always win.
-- **get_* returns None, never raises for "not found"**: returning None means "I don't have it, keep searching." Raising is an error.
-- **list_* errors degrade gracefully**: logged, returns empty. Other providers still contribute.
-- **Middleware call_next chain**: each middleware gets `(context, call_next)`. Must call `call_next(context)` to continue the chain.
-- **Transform list ops are pure functions**: receive sequence, return sequence. No side effects.
-- **Transform get ops use call_next pattern**: must call `call_next(name, version=version)` to delegate.
-- **Components execute themselves**: providers source components; `Tool.run()`, `Resource.read()`, `Prompt.render()` do execution.
-- **ASGI lifespan must be passed through**: when mounting FastMCP in FastAPI, the ASGI app's lifespan MUST be passed to the parent app. Missing this causes "Task group is not initialized" errors.
-
-## Pitfalls
-
-- **Stale request context in proxy**: `StatefulProxyClient` captured context at creation. Background tasks saw stale access tokens. Always snapshot tokens before dispatching to background (#3138, #3172).
-- **Session visibility leaks**: visibility marks stored globally leaked across sessions. Must use per-session ContextVar (#3132).
-- **Caching with mounted prefixes**: cache keys must include the mounted server prefix, otherwise different mounted servers share cache entries (#2762).
-- **OAuth proxy consent CSRF**: consent page needed binding cookie to prevent confused deputy attacks (#3201).
-- **CIMD redirect bypass**: redirect URI allowlist validation had bypass; ensure strict matching (#3098).
-- **content-type in get_http_headers()**: including it caused HTTP 415 on upstream calls. Excluded since #3104.
-- **Tags ignored without tools**: include_tags/exclude_tags in MCPConfig silently did nothing when no tools matched initial filter (#3186).
-- **Docket function name collisions**: multi-mount setups need prefixed Docket function names to avoid collisions (#2575).
-- **Race condition with mounted server task results**: tasks from mounted servers could race with result delivery (#2575).
-- **on_initialize middleware got wrong params**: was receiving the whole request instead of just params (#2357).
diff --git a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/tools/AGENTS.md b/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/tools/AGENTS.md
deleted file mode 100644
index 5e1765e..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/src/fastmcp/tools/AGENTS.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# Tools Module
-
-## Purpose
-
-Defines the MCP tool abstraction — from Python function to JSON-schema-validated MCP tool. This module handles function parsing, schema generation, tool transformation, and result serialization.
-
-## Code Map
-
-| File | Role |
-|------|------|
-| `tool.py` | `Tool` base class, `ToolResult` model, serialization |
-| `function_tool.py` | `FunctionTool` — created from decorated Python functions via `@tool` / `@mcp.tool()` |
-| `function_parsing.py` | `ParsedFunction` — inspects Python functions to extract JSON schema, handles Pydantic models, `Context` injection |
-| `tool_transform.py` | `TransformedTool`, `ToolTransformConfig` — rename, redescribe, add/remove/transform arguments |
-
-## Design
-
-### Function → Tool pipeline
-1. `@mcp.tool()` decorator attaches `ToolMeta` as `fn.__fastmcp__`
-2. `FunctionTool.from_function(fn)` uses `ParsedFunction` to extract parameter schema
-3. `ParsedFunction` introspects the function signature, builds JSON schema via Pydantic's `TypeAdapter`
-4. Schema generation handles: `Context` parameter removal, dependency injection, `exclude_args`, `SkipJsonSchema`
-5. `Tool.to_mcp_tool()` produces the MCP SDK `Tool` object with input/output schemas
-
-### Execution flow
-1. `FunctionTool.run(arguments, context)` is called by the server
-2. Arguments are validated against the JSON schema
-3. Injected parameters (Context, dependencies) are resolved
-4. Sync functions are run in a thread pool via `call_sync_fn_in_threadpool`
-5. Return value is serialized via `ToolResult` → `CallToolResult`
-
-### Result serialization
-- Return values are converted to `list[ContentBlock]` via `ToolResult`
-- `Image`, `Audio`, `File` types from `utilities/types.py` are converted to appropriate content blocks
-- Lists of content blocks are preserved; single values are wrapped
-- Custom serializer can be set per-tool via `serializer` parameter
-- Default serializer: `pydantic_core.to_json(data, fallback=str)`
-
-## Entry Points
-
-| Task | Start here |
-|------|-----------|
-| Define a tool | `function_tool.py` → `@tool` decorator or `FunctionTool.from_function()` |
-| Customize tool schema | `function_parsing.py` → `ParsedFunction` |
-| Transform a tool | `tool_transform.py` → `TransformedTool` |
-| Serialize tool results | `tool.py` → `ToolResult` |
-
-## Key Exports
-
-- `Tool` — base class (used by providers to source tools)
-- `FunctionTool` — the decorator-created variant
-- `ToolResult` — structured result model
-- `TransformedTool`, `ToolTransformConfig` — for tool transforms
-
-## Contracts
-
-- **Tool names must be MCP-valid**: validated by `mcp.shared.tool_name_validation.validate_and_warn_tool_name`
-- **Context parameter is auto-removed from schema**: `ParsedFunction` detects `Context` type-annotated params and excludes them from the JSON schema, injecting them at call time
-- **output_schema generation**: by default, tools get an output schema from their return type annotation. Set `output_schema=None` to disable. `output_schema=False` is legacy and still supported.
-- **$defs must not be mutated**: schema `$defs` are shared across tool instances. Transforms must deep-copy before modifying (#2493).
-- **Root-level $ref must be resolved**: MCP spec requires resolved schemas, not top-level `$ref` (#2720).
-- **additionalProperties: false must be preserved**: schema compression must not strip this (#3102).
-
-## Pitfalls
-
-- **$defs mutation in transforms**: `Tool.from_tool()` transforms shared the `$defs` dict. Transforms that modify `$defs` corrupt all tools sharing that schema. Always deep-copy (#2493).
-- **compress_schema stripping additionalProperties**: the schema compression utility removed `additionalProperties: false`, which MCP validation requires (#3102).
-- **Single-element list unwrapping**: a tool returning `[single_item]` was unwrapped to just `single_item`, changing the response structure. Lists must be preserved (#1074).
-- **Field() in function parameters**: using Pydantic `Field()` as default values in tool functions required special handling (#3050).
-- **title metadata in JSON schema**: schema generation added `title` fields that conflicted with parameters actually named `title`. Must strip metadata titles while preserving real ones (#1872).
-- **functools.wraps breaks Context detection**: wrapped functions lose their signature. `create_function_without_params` must also update the signature, not just the function body (#2563).
-- **Exclude_args with non-serializable types**: `exclude_args` failed when the excluded parameter had a non-serializable default value (#2440).
-- **Union type output schemas**: non-object union types needed special schema wrapping to produce valid MCP output schemas (#995).
-- **OpenAPI tool name registration**: tool names modified by `mcp_component_fn` weren't registered under the new name (#1096).
diff --git a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/tests/AGENTS.md b/eval-harness/.index-cache-preserve/fastmcp-intent_layer/tests/AGENTS.md
deleted file mode 100644
index 2c1106b..0000000
--- a/eval-harness/.index-cache-preserve/fastmcp-intent_layer/tests/AGENTS.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Tests
-
-## Purpose
-
-Pytest-based test suite mirroring the `src/fastmcp/` structure. Uses `pytest-asyncio` (auto mode), `pytest-xdist` for parallel execution, and `inline-snapshot` for snapshot testing.
-
-## Entry Points
-
-| Task | Start here |
-|------|-----------|
-| Run all tests | `uv run pytest -n auto` |
-| Run specific area | `uv run pytest tests/server/` or `tests/tools/` etc. |
-| Add a test | Mirror the `src/` path in `tests/`; use `FastMCPTransport` for in-process testing |
-| Shared fixtures | `conftest.py` → `fastmcp_server`, `tool_server`, `free_port` |
-
-## Code Map
-
-Same structure as `src/fastmcp/` — each subdirectory corresponds to the source module it tests.
-
-## Structure
-
-| Directory | Tests for |
-|-----------|----------|
-| `server/` | Server core — context, providers, middleware, auth, transforms, tasks, HTTP, mount, sampling, telemetry |
-| `client/` | Client operations, auth, transports, concurrent usage |
-| `tools/` | Tool creation, function parsing, schema generation, tool transforms |
-| `resources/` | Resource and template creation, URI handling |
-| `prompts/` | Prompt creation, argument handling |
-| `cli/` | CLI commands — run, install, config, discovery, generate |
-| `utilities/` | JSON schema, OpenAPI parsing, async utils |
-| `integration_tests/` | Auto-marked with `@pytest.mark.integration`; tests against real servers |
-| `contrib/` | Community contrib modules |
-| `deprecated/` | Tests for deprecated features (preserved for backwards-compat verification) |
-| `telemetry/` | OpenTelemetry tracing tests |
-
-## Key Fixtures (conftest.py)
-
-- **`isolate_settings_home`** (autouse): each test gets isolated `settings.home` in `tmp_path` — prevents file locking issues from shared OAuth storage
-- **`enable_fastmcp_logger_propagation`** (autouse): enables FastMCP logger propagation so `caplog` captures log messages (FastMCP loggers have `propagate=False` by default)
-- **`import_rich_rule`** (autouse): pre-imports `rich.rule` to avoid import timing issues
-- **`fastmcp_server`**: creates a standard `FastMCP` with tools, resources, prompts for reuse
-- **`tool_server`**: comprehensive tool set for provider tests (images, audio, files, mixed content, errors)
-- **`tagged_resources_server`**: server with tagged resources/templates
-- **`free_port` / `free_port_factory`**: allocates free TCP ports for HTTP server tests
-- **`otel_trace_provider` / `trace_exporter`**: session-scoped OpenTelemetry tracing for test spans
-
-## Config
-
-- `asyncio_mode = "auto"` — no `@pytest.mark.asyncio` needed
-- `asyncio_default_fixture_loop_scope = "function"` — fresh event loop per test
-- Default timeout: **5 seconds** — keep tests fast; use `@pytest.mark.timeout(30)` for slower tests
-- `FASTMCP_TEST_MODE=1` env var set in all tests
-- `addopts = ["--inline-snapshot=disable"]` — snapshots disabled by default; enable with `--inline-snapshot=update`
-- Unawaited coroutine warnings are treated as errors
-
-## Contracts
-
-- **Test isolation**: each test gets its own `settings.home` directory. Don't share state between tests via the filesystem.
-- **No subprocess-based servers in unit tests**: replaced with in-process async servers (#2006). Use `FastMCPTransport` for testing.
-- **Integration tests are separate**: anything in `tests/integration_tests/` is auto-marked and excluded from default runs.
-- **Windows compatibility**: `SelectorEventLoop` is forced on Windows. Some tests are skipped on Windows due to platform differences.
-
-## Pitfalls
-
-- **Shared FakeServer state**: tests using `FakeServer` must create a fresh instance per test to prevent shared state issues (#2540).
-- **Docket strike monitoring**: disable Docket strike monitoring in tests using `fakeredis` to avoid busy-loops (#2540).
-- **caplog not capturing**: FastMCP loggers have `propagate=False`. The `enable_fastmcp_logger_propagation` fixture handles this, but if you create a new logger in test code, ensure it propagates.
-- **OAuth proxy file locking**: tests that use OAuth proxy must use `MemoryStore` (not disk) to avoid SQLite locking issues on Windows (#2368, #3123).
-- **OTel TracerProvider can only be set once**: `otel_trace_provider` is session-scoped for this reason. Don't create new providers in test functions.
-- **Port conflicts**: use `free_port` fixture instead of hardcoded ports. The `--reload` flag had port conflict issues with explicit ports (#3070).
-- **inline-snapshot serialization**: MCP SDK version changes can alter snapshot serialization (e.g., field ordering). Update snapshots with `--inline-snapshot=update`.
diff --git a/eval-harness/.index-cache-preserve/graphiti-flat_llm/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-flat_llm/AGENTS.md
deleted file mode 100644
index f22ebc2..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-flat_llm/AGENTS.md
+++ /dev/null
@@ -1,161 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project overview
-
-Graphiti (`graphiti-core`) is a Python framework for building temporally-aware knowledge graphs for AI agents. Instead of static RAG, it continuously ingests "episodes" (messages, JSON, text), extracts entities and relationships via LLMs, and builds a queryable graph that tracks when facts were valid. Built by Zep Software.
-
-## Development commands
-
-```bash
-# Install
-uv sync --extra dev
-
-# Format (ruff import sort + formatting)
-make format
-
-# Lint (ruff + pyright type checking)
-make lint
-
-# Unit tests only (skips FalkorDB, Kuzu, Neptune backends)
-make test
-
-# All checks (format + lint + test)
-make check
-
-# Run a single test file
-uv run pytest tests/test_graphiti_mock.py
-
-# Run a single test method
-uv run pytest tests/test_graphiti_mock.py::test_method_name
-
-# Integration tests only (need running databases)
-uv run pytest tests/ -k "_int"
-
-# Unit tests only (explicit)
-uv run pytest tests/ -k "not _int"
-
-# Start databases for integration tests
-docker-compose -f docker-compose.test.yml up
-```
-
-### Server (FastAPI REST service)
-
-```bash
-cd server/
-uv sync --extra dev
-uvicorn graph_service.main:app --reload
-make format / make lint / make test
-```
-
-### MCP server
-
-```bash
-cd mcp_server/
-uv sync
-docker-compose up
-```
-
-## Architecture
-
-### Episode ingestion pipeline
-
-The core workflow — `Graphiti.add_episode()` — runs this pipeline:
-
-1. Retrieve recent episodes for context
-2. Extract entity nodes via LLM (`extract_nodes()`)
-3. Resolve/deduplicate against existing graph nodes (`resolve_extracted_nodes()`)
-4. Extract relationship edges via LLM (`extract_edges()`)
-5. Resolve/deduplicate edges (`resolve_extracted_edges()`)
-6. Generate embeddings for new nodes and edges
-7. Save everything to the graph database
-8. Optionally update community clusters
-
-Batch variant: `add_episode_bulk()` processes multiple episodes with cross-episode deduplication.
-
-### Key abstractions
-
-**`Graphiti`** (`graphiti.py`) — main entry point. Constructor takes optional `graph_driver`, `llm_client`, `embedder`, `cross_encoder`, `tracer`. Defaults to Neo4j + OpenAI.
-
-**`GraphitiClients`** (`graphiti_types.py`) — Pydantic model bundling `driver + llm_client + embedder + cross_encoder + tracer`. Passed as a single object through internal functions instead of threading individual clients.
-
-**`GraphDriver`** (`driver/driver.py`) — ABC for graph database backends. Four implementations: `Neo4jDriver`, `FalkorDriver`, `KuzuDriver`, `NeptuneDriver`. Each provides `ops` properties (e.g., `driver.entity_node_ops`, `driver.search_ops`) implementing abstract operation interfaces from `driver/operations/`.
-
-**Namespace accessors** — `graphiti.nodes.entity`, `graphiti.edges.entity`, etc. are typed namespace objects that wrap driver operations with embedding generation. Preferred API for direct node/edge CRUD (e.g., `graphiti.nodes.entity.save(node)`).
-
-**`group_id`** — partition key on every node and edge. Different users/agents get isolated graphs by using different group IDs. Maps to database name in Neo4j/FalkorDB, property filter in Kuzu.
-
-### Driver operations architecture (v0.28.0)
-
-```
-driver/
-├── driver.py                    # GraphDriver ABC, GraphProvider enum
-├── operations/                  # Abstract operation interfaces
-│   ├── entity_node_ops.py       # EntityNodeOperations ABC
-│   ├── entity_edge_ops.py       # EntityEdgeOperations ABC
-│   ├── search_ops.py            # SearchOperations ABC
-│   └── ...                      # 11 operation ABCs total
-├── neo4j/operations/            # Neo4j implementations
-├── falkordb/operations/         # FalkorDB implementations
-├── kuzu/operations/             # Kuzu implementations
-└── neptune/operations/          # Neptune implementations
-```
-
-Each driver instantiates its own implementations of the abstract operation interfaces. The `GraphDriver` base returns `None` from `ops` properties; concrete drivers override with real implementations.
-
-Transactions: `async with driver.transaction() as tx:` — drivers without real transaction support (FalkorDB, Kuzu) get a no-op wrapper where queries execute immediately.
-
-### Search pipeline
-
-`search/search.py` → `search()` function orchestrated by `SearchConfig`.
-
-- **Search methods per layer**: `cosine_similarity`, `bm25`, `bfs` (graph traversal)
-- **Rerankers**: `rrf` (Reciprocal Rank Fusion), `mmr` (Maximal Marginal Relevance), `node_distance`, `episode_mentions`, `cross_encoder`
-- **Pre-built recipes** in `search_config_recipes.py`: `EDGE_HYBRID_SEARCH_RRF`, `COMBINED_HYBRID_SEARCH_CROSS_ENCODER`, etc.
-- Four searchable layers: edges, nodes, episodes, communities — each independently configurable
-
-### LLM client pattern
-
-`LLMClient` ABC (`llm_client/client.py`) features:
-- Diskcache for optional response caching
-- Tenacity retry (4 attempts, exponential backoff) on rate limits and server errors
-- Structured output via Pydantic models appended as JSON schema to prompts
-- Two model sizes: `model` (medium, default `gpt-4.1-mini`) and `small_model` (small, default `gpt-4.1-nano`)
-
-### Content chunking
-
-`helpers.py` has density-based chunking — only chunks high-entity-density content (large JSON dumps), leaving prose unchanged. Controlled by env vars: `CHUNK_TOKEN_SIZE`, `CHUNK_DENSITY_THRESHOLD`, `CHUNK_MIN_TOKENS`, `CHUNK_OVERLAP_TOKENS`.
-
-### Graph data model
-
-Nodes: `EntityNode`, `EpisodicNode`, `CommunityNode`, `SagaNode`
-Edges: `EntityEdge`, `EpisodicEdge`, `CommunityEdge`, `HasEpisodeEdge`, `NextEpisodeEdge`
-
-All defined in `nodes.py` and `edges.py`. `EpisodeType` enum: `message`, `json`, `text`.
-
-## Code style
-
-- Ruff: 100-char lines, single quotes, rules E/F/UP/B/SIM/I
-- `typing.TypedDict` is banned — use `typing_extensions.TypedDict` (Pydantic requirement on Python <3.12)
-- Pyright: `typeCheckingMode = "basic"` for core library, `"standard"` for server
-- Python 3.10+ target
-
-## Testing
-
-- pytest with `pytest-asyncio` (`asyncio_mode = auto`) and `pytest-xdist`
-- Integration tests: `@pytest.mark.integration` decorator, filenames use `_int` suffix
-- `conftest.py` exports `graph_driver` and `mock_embedder` fixtures (from `tests/helpers_test.py`)
-- Driver selection via env vars: `DISABLE_NEO4J`, `DISABLE_FALKORDB`, `DISABLE_KUZU`, `DISABLE_NEPTUNE`
-- `make test` disables FalkorDB, Kuzu, Neptune by default — only runs Neo4j + unit tests
-- Neptune is force-disabled in test fixtures (`os.environ['DISABLE_NEPTUNE'] = 'True'`)
-
-## Environment variables
-
-Required: `OPENAI_API_KEY` (or equivalent for your LLM provider)
-
-Database connection: `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD`, `FALKORDB_HOST`, `FALKORDB_PORT`
-
-Tuning: `SEMAPHORE_LIMIT` (default 20, controls async concurrency), `USE_PARALLEL_RUNTIME` (Neo4j enterprise only)
-
-Provider keys: `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `GROQ_API_KEY`, `VOYAGE_API_KEY`
diff --git a/eval-harness/.index-cache-preserve/graphiti-flat_llm/CLAUDE.md b/eval-harness/.index-cache-preserve/graphiti-flat_llm/CLAUDE.md
deleted file mode 100644
index 938c3e6..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-flat_llm/CLAUDE.md
+++ /dev/null
@@ -1,120 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project overview
-
-Graphiti (`graphiti-core`) is a Python framework for building temporally-aware knowledge graphs for AI agents. Instead of static RAG, it continuously ingests "episodes" (messages, JSON, text), extracts entities and relationships via LLMs, and builds a queryable graph that tracks when facts were valid. Built by Zep Software.
-
-## Testing
-
-Run only the specific test file relevant to the bug, not the full test suite. For example:
-
-```bash
-uv run pytest tests/test_specific_file.py -x --tb=short
-uv run pytest tests/test_specific_file.py::test_method_name
-```
-
-Environment variables needed for unit tests: `DISABLE_NEPTUNE=1 DISABLE_NEO4J=1 DISABLE_FALKORDB=1 DISABLE_KUZU=1`
-
-## Architecture
-
-### Episode ingestion pipeline
-
-The core workflow — `Graphiti.add_episode()` — runs this pipeline:
-
-1. Retrieve recent episodes for context
-2. Extract entity nodes via LLM (`extract_nodes()`)
-3. Resolve/deduplicate against existing graph nodes (`resolve_extracted_nodes()`)
-4. Extract relationship edges via LLM (`extract_edges()`)
-5. Resolve/deduplicate edges (`resolve_extracted_edges()`)
-6. Generate embeddings for new nodes and edges
-7. Save everything to the graph database
-8. Optionally update community clusters
-
-Batch variant: `add_episode_bulk()` processes multiple episodes with cross-episode deduplication.
-
-### Key abstractions
-
-**`Graphiti`** (`graphiti.py`) — main entry point. Constructor takes optional `graph_driver`, `llm_client`, `embedder`, `cross_encoder`, `tracer`. Defaults to Neo4j + OpenAI.
-
-**`GraphitiClients`** (`graphiti_types.py`) — Pydantic model bundling `driver + llm_client + embedder + cross_encoder + tracer`. Passed as a single object through internal functions instead of threading individual clients.
-
-**`GraphDriver`** (`driver/driver.py`) — ABC for graph database backends. Four implementations: `Neo4jDriver`, `FalkorDriver`, `KuzuDriver`, `NeptuneDriver`. Each provides `ops` properties (e.g., `driver.entity_node_ops`, `driver.search_ops`) implementing abstract operation interfaces from `driver/operations/`.
-
-**Namespace accessors** — `graphiti.nodes.entity`, `graphiti.edges.entity`, etc. are typed namespace objects that wrap driver operations with embedding generation. Preferred API for direct node/edge CRUD (e.g., `graphiti.nodes.entity.save(node)`).
-
-**`group_id`** — partition key on every node and edge. Different users/agents get isolated graphs by using different group IDs. Maps to database name in Neo4j/FalkorDB, property filter in Kuzu.
-
-### Driver operations architecture (v0.28.0)
-
-```
-driver/
-├── driver.py                    # GraphDriver ABC, GraphProvider enum
-├── operations/                  # Abstract operation interfaces
-│   ├── entity_node_ops.py       # EntityNodeOperations ABC
-│   ├── entity_edge_ops.py       # EntityEdgeOperations ABC
-│   ├── search_ops.py            # SearchOperations ABC
-│   └── ...                      # 11 operation ABCs total
-├── neo4j/operations/            # Neo4j implementations
-├── falkordb/operations/         # FalkorDB implementations
-├── kuzu/operations/             # Kuzu implementations
-└── neptune/operations/          # Neptune implementations
-```
-
-Each driver instantiates its own implementations of the abstract operation interfaces. The `GraphDriver` base returns `None` from `ops` properties; concrete drivers override with real implementations.
-
-Transactions: `async with driver.transaction() as tx:` — drivers without real transaction support (FalkorDB, Kuzu) get a no-op wrapper where queries execute immediately.
-
-### Search pipeline
-
-`search/search.py` → `search()` function orchestrated by `SearchConfig`.
-
-- **Search methods per layer**: `cosine_similarity`, `bm25`, `bfs` (graph traversal)
-- **Rerankers**: `rrf` (Reciprocal Rank Fusion), `mmr` (Maximal Marginal Relevance), `node_distance`, `episode_mentions`, `cross_encoder`
-- **Pre-built recipes** in `search_config_recipes.py`: `EDGE_HYBRID_SEARCH_RRF`, `COMBINED_HYBRID_SEARCH_CROSS_ENCODER`, etc.
-- Four searchable layers: edges, nodes, episodes, communities — each independently configurable
-
-### LLM client pattern
-
-`LLMClient` ABC (`llm_client/client.py`) features:
-- Diskcache for optional response caching
-- Tenacity retry (4 attempts, exponential backoff) on rate limits and server errors
-- Structured output via Pydantic models appended as JSON schema to prompts
-- Two model sizes: `model` (medium, default `gpt-4.1-mini`) and `small_model` (small, default `gpt-4.1-nano`)
-
-### Content chunking
-
-`helpers.py` has density-based chunking — only chunks high-entity-density content (large JSON dumps), leaving prose unchanged. Controlled by env vars: `CHUNK_TOKEN_SIZE`, `CHUNK_DENSITY_THRESHOLD`, `CHUNK_MIN_TOKENS`, `CHUNK_OVERLAP_TOKENS`.
-
-### Graph data model
-
-Nodes: `EntityNode`, `EpisodicNode`, `CommunityNode`, `SagaNode`
-Edges: `EntityEdge`, `EpisodicEdge`, `CommunityEdge`, `HasEpisodeEdge`, `NextEpisodeEdge`
-
-All defined in `nodes.py` and `edges.py`. `EpisodeType` enum: `message`, `json`, `text`.
-
-## Code style
-
-- Ruff: 100-char lines, single quotes, rules E/F/UP/B/SIM/I
-- `typing.TypedDict` is banned — use `typing_extensions.TypedDict` (Pydantic requirement on Python <3.12)
-- Pyright: `typeCheckingMode = "basic"` for core library, `"standard"` for server
-- Python 3.10+ target
-
-## Test infrastructure
-
-- pytest with `pytest-asyncio` (`asyncio_mode = auto`) and `pytest-xdist`
-- Integration tests: `@pytest.mark.integration` decorator, filenames use `_int` suffix
-- `conftest.py` exports `graph_driver` and `mock_embedder` fixtures (from `tests/helpers_test.py`)
-- Driver selection via env vars: `DISABLE_NEO4J`, `DISABLE_FALKORDB`, `DISABLE_KUZU`, `DISABLE_NEPTUNE`
-- Neptune is force-disabled in test fixtures (`os.environ['DISABLE_NEPTUNE'] = 'True'`)
-
-## Environment variables
-
-Required: `OPENAI_API_KEY` (or equivalent for your LLM provider)
-
-Database connection: `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD`, `FALKORDB_HOST`, `FALKORDB_PORT`
-
-Tuning: `SEMAPHORE_LIMIT` (default 20, controls async concurrency), `USE_PARALLEL_RUNTIME` (Neo4j enterprise only)
-
-Provider keys: `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `GROQ_API_KEY`, `VOYAGE_API_KEY`
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/CLAUDE.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/CLAUDE.md
deleted file mode 100644
index 3f65af1..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/CLAUDE.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Overview
-
-Graphiti (`graphiti-core`) is a Python framework for building temporally-aware knowledge graphs for AI agents. It ingests "episodes" (messages, JSON, text), extracts entities and relationships via LLMs, and builds a queryable graph that tracks when facts were valid. Built by Zep Software.
-
-## Testing
-
-Run only the specific test file relevant to the bug, not the full test suite. For example:
-
-```bash
-uv run pytest tests/test_specific_file.py -x --tb=short
-uv run pytest tests/test_specific_file.py::test_method_name
-```
-
-Environment variables needed for unit tests: `DISABLE_NEPTUNE=1 DISABLE_NEO4J=1 DISABLE_FALKORDB=1 DISABLE_KUZU=1`
-
-## Architecture
-
-### Episode ingestion pipeline
-
-`Graphiti.add_episode()` runs: retrieve context → extract nodes via LLM → resolve/dedup nodes → extract edges via LLM → resolve/dedup edges → generate embeddings → save to graph → update communities.
-
-### Key abstractions
-
-- **`Graphiti`** (`graphiti.py`) — main entry point, constructor takes `graph_driver`, `llm_client`, `embedder`, `cross_encoder`, `tracer`
-- **`GraphitiClients`** (`graphiti_types.py`) — bundles all clients into one Pydantic model passed through internal functions
-- **`GraphDriver`** (`driver/driver.py`) — ABC for graph databases: Neo4j, FalkorDB, Kuzu, Neptune
-- **`group_id`** — partition key on every node/edge, isolates graphs per user/agent
-
-### Graph data model
-
-Nodes: `EntityNode`, `EpisodicNode`, `CommunityNode`, `SagaNode` (in `nodes.py`)
-Edges: `EntityEdge`, `EpisodicEdge`, `CommunityEdge`, `HasEpisodeEdge`, `NextEpisodeEdge` (in `edges.py`)
-
-## Contracts
-
-- Modules use ruff: 100-char lines, single quotes, rules E/F/UP/B/SIM/I
-- `typing.TypedDict` is banned — use `typing_extensions.TypedDict` (Pydantic on Python <3.12)
-- Integration tests use `_int` suffix and `@pytest.mark.integration`
-- `module_utils` cannot import from outside itself (remote execution boundary)
-
-## Pitfalls
-
-- `make test` disables FalkorDB, Kuzu, Neptune via env vars — only runs Neo4j + unit tests
-- Neptune is force-disabled in test fixtures (`os.environ['DISABLE_NEPTUNE'] = 'True'`)
-- Content chunking is density-based (large JSON only), controlled by env vars: `CHUNK_TOKEN_SIZE`, `CHUNK_DENSITY_THRESHOLD`
-- Namespace accessors (`graphiti.nodes.entity`) wrap driver operations with embedding generation — preferred API for CRUD
-
-## Downlinks
-
-| Area | Node |
-|------|------|
-| Core package | `graphiti_core/AGENTS.md` |
-| Driver system | `graphiti_core/driver/AGENTS.md` |
-| Search pipeline | `graphiti_core/search/AGENTS.md` |
-| LLM client | `graphiti_core/llm_client/AGENTS.md` |
-| Namespaces | `graphiti_core/namespaces/AGENTS.md` |
-| Utils & maintenance | `graphiti_core/utils/AGENTS.md` |
-| MCP server | `mcp_server/AGENTS.md` |
-| Tests | `tests/AGENTS.md` |
-| Server | `server/AGENTS.md` |
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/AGENTS.md
deleted file mode 100644
index 01f989c..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/AGENTS.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# graphiti_core
-
-> Core Python package: episode ingestion, entity/edge extraction, deduplication, and graph persistence.
-
-## Entry points
-
-- `graphiti.py` — `Graphiti` class is the only public export (`__init__.py` re-exports it). Constructor takes `graph_driver`, `llm_client`, `embedder`, `cross_encoder`, `tracer`.
-- `nodes.py` — `Node` (ABC), `EntityNode`, `EpisodicNode`, `CommunityNode`, `SagaNode`. All use Pydantic `BaseModel`.
-- `edges.py` — `Edge` (ABC), `EntityEdge`, `EpisodicEdge`, `CommunityEdge`, `HasEpisodeEdge`, `NextEpisodeEdge`.
-- `helpers.py` — `semaphore_gather()`, `parse_db_date()`, `lucene_sanitize()`, chunking env vars.
-
-## Module layout
-
-| Module | Purpose |
-|--------|---------|
-| `graphiti.py` | Orchestration: `add_episode`, `add_episode_bulk`, `search`, `search_`, `add_triplet`, `remove_episode` |
-| `nodes.py` / `edges.py` | Data models + per-driver CRUD (save/get/delete dispatch via `match driver.provider`) |
-| `helpers.py` | Shared utilities, env-based config (`SEMAPHORE_LIMIT`, `CHUNK_*` vars) |
-| `graphiti_types.py` | `GraphitiClients` Pydantic model bundling driver+llm+embedder+cross_encoder+tracer |
-| `decorators.py` | `@handle_multiple_group_ids` — auto-iterates over group_ids list |
-| `errors.py` | `NodeNotFoundError`, `EdgeNotFoundError`, `GroupIdValidationError` |
-| `prompts/` | LLM prompt templates (extract_nodes, extract_edges, dedupe_nodes, dedupe_edges, summarize) |
-| `tracer.py` | OpenTelemetry integration, `create_tracer()` |
-| `telemetry/` | PostHog-based usage telemetry |
-
-## Contracts
-
-- Every node/edge has `uuid` (str, auto-generated UUID4) and `group_id` (partition key).
-- `group_id` must match `^[a-zA-Z0-9_-]+$` or be empty string. FalkorDB default is `"\_"`, others use `""`.
-- Driver dispatch uses `match driver.provider` pattern in `save()`/`delete()`. Kuzu stores edges as nodes (`RelatesToNode_`) — special delete logic required.
-- `graph_operations_interface` on driver is checked first (try/except `NotImplementedError` fallback) for all CRUD operations.
-- `semaphore_gather()` bounds all concurrent coroutines. Default limit is `SEMAPHORE_LIMIT=20` env var.
-
-## Ingestion pipeline
-
-`add_episode()` sequence:
-1. Validate entity types, resolve group_id
-2. Retrieve previous episodes for context
-3. Extract nodes via LLM (`extract_nodes`)
-4. Resolve/dedup nodes against existing graph (`resolve_extracted_nodes`)
-5. Extract edges via LLM (`extract_edges`)
-6. Resolve/dedup edges (`resolve_extracted_edges`)
-7. Extract node attributes + summaries (only new edges to avoid duplicating summaries)
-8. Save all via `add_nodes_and_edges_bulk` (single transaction)
-9. Optionally update communities
-
-`add_episode_bulk()` differs: no edge invalidation, no date extraction. For those, use single `add_episode()`.
-
-## Pitfalls
-
-- `add_episode` episodes must be added sequentially per group_id. Parallel adds to the same group cause race conditions.
-- `add_triplet` checks if an edge UUID already exists with different source/target nodes. If so, it generates a new UUID to avoid overwriting unrelated edges.
-- FalkorDB fulltext queries need pipe and slash character sanitization (see `lucene_sanitize`).
-- Datetime comparison requires UTC normalization. Use `utc_now()` and `ensure_utc()` from `datetime_utils`.
-- Neptune stores embeddings as comma-separated strings, requiring `split()/toFloat()` conversion on read.
-- Content chunking is density-based, not size-based. Controlled by `CHUNK_TOKEN_SIZE`, `CHUNK_MIN_TOKENS`, `CHUNK_DENSITY_THRESHOLD` env vars.
-
-## Downlinks
-
-| Area | Node |
-|------|------|
-| Driver system | `graphiti_core/driver/AGENTS.md` |
-| LLM clients | `graphiti_core/llm_client/AGENTS.md` |
-| Namespaces | `graphiti_core/namespaces/AGENTS.md` |
-| Search | `graphiti_core/search/AGENTS.md` |
-| Utils | `graphiti_core/utils/AGENTS.md` |
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/driver/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/driver/AGENTS.md
deleted file mode 100644
index f08219a..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/driver/AGENTS.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Driver System
-
-Graph database abstraction layer with four backends.
-
-## Architecture
-
-`GraphDriver` (ABC in `driver.py`) defines abstract `ops` properties returning operation interfaces. Each backend implements all 11 operation ABCs.
-
-```
-driver.py                    # GraphDriver ABC, GraphProvider enum
-operations/                  # Abstract operation interfaces (11 ABCs)
-  entity_node_ops.py         # EntityNodeOperations
-  entity_edge_ops.py         # EntityEdgeOperations
-  search_ops.py              # SearchOperations
-  ...
-neo4j/operations/            # Neo4j implementations
-falkordb/operations/         # FalkorDB implementations
-kuzu/operations/             # Kuzu implementations
-neptune/operations/          # Neptune implementations
-```
-
-## Contracts
-
-- `GraphDriver` base returns `None` from `ops` properties — concrete drivers override
-- Transactions: `async with driver.transaction() as tx:` — FalkorDB and Kuzu get no-op wrappers where queries execute immediately
-- Each driver instantiates its own operation implementations
-
-## Pitfalls
-
-- Neo4j `USE_PARALLEL_RUNTIME` env var only works with Neo4j Enterprise — causes errors on Community Edition
-- FalkorDB and Kuzu lack real transaction support — the `transaction()` context manager is a no-op
-- Neptune requires AWS credentials and specific IAM permissions, not just host/port
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/llm_client/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/llm_client/AGENTS.md
deleted file mode 100644
index a3cbb08..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/llm_client/AGENTS.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# LLM Client
-
-Pluggable LLM abstraction with caching and retry logic.
-
-## Architecture
-
-`LLMClient` ABC (`client.py`) with implementations for OpenAI, Anthropic, Google, Groq, Voyage.
-
-### Features
-- Diskcache for optional response caching
-- Tenacity retry: 4 attempts, exponential backoff on rate limits and server errors
-- Structured output via Pydantic models appended as JSON schema to prompts
-- Two model sizes: `model` (medium, default `gpt-4.1-mini`) and `small_model` (small, default `gpt-4.1-nano`)
-
-## Contracts
-
-- All implementations must handle both `model` and `small_model` calls
-- Structured output requires a Pydantic model class — schema is auto-appended to prompt
-- Cache key includes model name + prompt text — different models get separate cache entries
-
-## Pitfalls
-
-- Default models are OpenAI-specific names — other providers need explicit model configuration
-- Cache is per-model, so switching models doesn't use cached results from the old model
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/namespaces/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/namespaces/AGENTS.md
deleted file mode 100644
index 6dcebce..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/namespaces/AGENTS.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Namespaces
-
-Typed accessor objects for entity/edge CRUD with automatic embedding generation.
-
-## Architecture
-
-`graphiti.nodes.entity`, `graphiti.edges.entity`, etc. are namespace objects that wrap driver operations. They handle embedding generation before save, so callers don't need to manage embeddings manually.
-
-## Contracts
-
-- Namespace accessors are the preferred public API for direct CRUD operations
-- Always generates embeddings before persisting — don't bypass to driver directly
-- Available via `Graphiti` instance properties (e.g., `graphiti.nodes`, `graphiti.edges`)
-
-## Key files
-
-- `nodes.py` — `EntityNodeNamespace`, `EpisodicNodeNamespace`, etc.
-- `edges.py` — `EntityEdgeNamespace`, `EpisodicEdgeNamespace`, etc.
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/search/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/search/AGENTS.md
deleted file mode 100644
index 5c69961..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/search/AGENTS.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Search Pipeline
-
-Hybrid search across four graph layers with configurable reranking.
-
-## Architecture
-
-`search/search.py` → `search()` orchestrated by `SearchConfig`.
-
-Four searchable layers: edges, nodes, episodes, communities — each independently configurable.
-
-### Search methods (per layer)
-- `cosine_similarity` — embedding-based
-- `bm25` — text-based
-- `bfs` — graph traversal
-
-### Rerankers
-- `rrf` — Reciprocal Rank Fusion
-- `mmr` — Maximal Marginal Relevance
-- `node_distance` — graph proximity
-- `episode_mentions` — recency weighting
-- `cross_encoder` — ML reranking
-
-### Pre-built recipes
-
-`search_config_recipes.py` has ready-made configs: `EDGE_HYBRID_SEARCH_RRF`, `COMBINED_HYBRID_SEARCH_CROSS_ENCODER`, etc.
-
-## Contracts
-
-- Search configs specify methods and rerankers per layer
-- Results are unified across layers before final reranking
-- `cross_encoder` reranker requires a `CrossEncoderClient` in `GraphitiClients`
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/utils/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/utils/AGENTS.md
deleted file mode 100644
index 94377d5..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/graphiti_core/utils/AGENTS.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# graphiti_core/utils
-
-> Maintenance operations, content chunking, bulk graph operations, and deduplication logic.
-
-## Entry points
-
-- `maintenance/` — node extraction, edge extraction, dedup, community ops, graph data ops
-- `bulk_utils.py` — `RawEpisode`, `add_nodes_and_edges_bulk`, `dedupe_nodes_bulk`, `dedupe_edges_bulk`, `extract_nodes_and_edges_bulk`
-- `content_chunking.py` — density-based chunking for JSON, text, and message content
-
-## Module layout
-
-| File | Purpose |
-|------|---------|
-| `maintenance/node_operations.py` | `extract_nodes()`, `resolve_extracted_nodes()`, `extract_attributes_from_nodes()` |
-| `maintenance/edge_operations.py` | `extract_edges()`, `resolve_extracted_edges()`, `resolve_extracted_edge()`, `build_episodic_edges()` |
-| `maintenance/community_operations.py` | `build_communities()`, `remove_communities()`, `update_community()` |
-| `maintenance/graph_data_operations.py` | `retrieve_episodes()`, `clear_data()`, `EPISODE_WINDOW_LEN` |
-| `maintenance/dedup_helpers.py` | `_build_candidate_indexes()`, `_resolve_with_similarity()`, MinHash + Jaccard similarity |
-| `bulk_utils.py` | Bulk save/extract/dedup with transaction support, union-find for UUID map compression |
-| `content_chunking.py` | `should_chunk()`, `chunk_json_content()`, `chunk_text_content()`, `chunk_message_content()`, `generate_covering_chunks()` |
-| `datetime_utils.py` | `utc_now()`, `ensure_utc()`, `convert_datetimes_to_strings()` |
-| `text_utils.py` | `truncate_at_sentence()`, `MAX_SUMMARY_CHARS` |
-| `ontology_utils/entity_types_utils.py` | `validate_entity_types()` — checks Pydantic model constraints |
-
-## Contracts
-
-- `maintenance/__init__.py` exports: `extract_edges`, `build_episodic_edges`, `extract_nodes`, `clear_data`, `retrieve_episodes`
-- Bulk operations use `GraphDriverSession.execute_write()` for transactional saves. Kuzu falls back to one-by-one inserts (no `UNWIND` support for `STRUCT[]`).
-- `dedupe_nodes_bulk` runs a two-pass strategy: (1) resolve each episode against the live graph in parallel, (2) cross-dedupe the batch using deterministic similarity heuristics + union-find.
-- Union-find for UUID maps uses directed path compression: `_build_directed_uuid_map()` in `bulk_utils.py`.
-
-## Chunking behavior
-
-Content is only chunked when **both** conditions hold:
-1. Token count >= `CHUNK_MIN_TOKENS` (default 1000)
-2. Entity density exceeds `CHUNK_DENSITY_THRESHOLD` (default 0.15)
-
-JSON density = elements per 1000 tokens. Text density = capitalized words per 1000 tokens (half the threshold).
-
-`generate_covering_chunks()` solves the Handshake Flights / Covering Design problem: given N items and chunk size K, greedily selects chunks to cover all pairs. Falls back to random sampling when C(n,k) > 1000.
-
-## Pitfalls
-
-- `dedupe_nodes_bulk` rebuilds the MinHash candidate index for each new node against the canonical pool. O(n^2) in batch size. Fine for typical batches (<=10) but would need incremental indexing for larger ones.
-- `add_nodes_and_edges_bulk_tx` generates embeddings inline if missing. This means embedding generation happens inside the DB transaction on Kuzu.
-- Edge dedup uses word-overlap + cosine similarity (min 0.6) as a pre-filter before LLM resolution. Edges between different node pairs are never compared.
-- `clear_data()` deletes by group_id. Calling without group_ids deletes everything in the database.
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/mcp_server/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/mcp_server/AGENTS.md
deleted file mode 100644
index aacbbde..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/mcp_server/AGENTS.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# mcp_server
-
-> MCP (Model Context Protocol) server exposing Graphiti as tools for AI agents. Separate package with its own pyproject.toml.
-
-## Entry points
-
-- `main.py` — entry point, calls `graphiti_mcp_server.main()`
-- `src/graphiti_mcp_server.py` — FastMCP server definition, all MCP tools, `initialize_server()`, CLI arg parsing
-- `config/config.yaml` — YAML-based configuration (providers, models, transport)
-
-## Module layout
-
-| Path | Purpose |
-|------|---------|
-| `src/graphiti_mcp_server.py` | MCP tool definitions, `GraphitiService` wrapper, server startup |
-| `src/config/schema.py` | `GraphitiConfig`, `ServerConfig` Pydantic settings with env var + YAML support |
-| `src/models/response_types.py` | `SuccessResponse`, `ErrorResponse`, `FactSearchResponse`, `NodeSearchResponse`, `StatusResponse` |
-| `src/models/entity_types.py` | Dynamic entity type model generation |
-| `src/services/factories.py` | `LLMClientFactory`, `EmbedderFactory`, `DatabaseDriverFactory` |
-| `src/services/queue_service.py` | `QueueService` — per-group_id sequential episode processing |
-| `src/utils/formatting.py` | `format_fact_result()` — strips embeddings from edge responses |
-| `docker/` | Dockerfiles and compose files for Neo4j and FalkorDB deployments |
-| `config/` | YAML configs for different deployment scenarios |
-
-## MCP tools exposed
-
-| Tool | Description |
-|------|-------------|
-| `add_memory` | Queue an episode for background processing |
-| `search_nodes` | Search entity nodes by natural language query |
-| `search_memory_facts` | Search entity edges (facts/relationships) |
-| `get_entity_edge` | Get a single edge by UUID |
-| `get_episodes` | List episodes by group_id |
-| `delete_entity_edge` | Delete a single edge |
-| `delete_episode` | Delete an episode and its exclusive nodes/edges |
-| `clear_graph` | Delete all data for specified group_ids |
-| `get_status` | Health check for DB connection |
-
-## Contracts
-
-- `add_memory` is async: returns immediately, processes in background via `QueueService`. Episodes within the same `group_id` are processed sequentially.
-- Config resolution order: YAML file -> env vars -> CLI args (CLI wins).
-- `SEMAPHORE_LIMIT` (default 10) controls concurrent Graphiti operations. Each episode involves multiple LLM calls.
-- Transport options: `stdio` (default for local), `sse` (deprecated), `http` (recommended for deployment, streamable HTTP).
-- `/health` endpoint available at all times (returns `{"status": "healthy"}`).
-
-## Pitfalls
-
-- The `graphiti_service` and `queue_service` are module-level globals. All tools check `if graphiti_service is None` before proceeding.
-- FalkorDB default group_id is `"\_"` (escaped underscore). Don't confuse with empty string (Neo4j default).
-- Docker: FalkorDB Browser UI runs on port 3000 by default when `BROWSER=1`.
-- `source` parameter in `add_memory` must be one of `text`, `json`, `message`. Unknown values fall back to `text` with a warning.
-- Dependency pinning: `mcp_server` pins its own `graphiti-core` version in pyproject.toml. Version mismatches between the server and core library can cause subtle failures.
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/server/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/server/AGENTS.md
deleted file mode 100644
index 8b310e1..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/server/AGENTS.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Server
-
-FastAPI REST service exposing Graphiti functionality.
-
-## Development
-
-```bash
-cd server/
-uv sync --extra dev
-uvicorn graph_service.main:app --reload
-make format / make lint / make test
-```
-
-## Architecture
-
-- `graph_service/main.py` — FastAPI app entry point
-- Separate pyproject.toml and dev dependencies from the core library
-- Wraps `Graphiti` client for HTTP access
-
-## Contracts
-
-- Pyright `typeCheckingMode = "standard"` (stricter than core library's `"basic"`)
-- Has its own test suite separate from the core library
diff --git a/eval-harness/.index-cache-preserve/graphiti-intent_layer/tests/AGENTS.md b/eval-harness/.index-cache-preserve/graphiti-intent_layer/tests/AGENTS.md
deleted file mode 100644
index 8b16743..0000000
--- a/eval-harness/.index-cache-preserve/graphiti-intent_layer/tests/AGENTS.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# tests
-
-> Test suite for graphiti_core. Mix of unit tests, mock-based tests, and integration tests requiring live databases.
-
-## Entry points
-
-- `test_graphiti_mock.py` — largest test file (65k), comprehensive mock-based tests of the full pipeline
-- `test_graphiti_int.py` — integration test for end-to-end episode ingestion
-- `test_add_triplet.py` — tests for `add_triplet()` flow
-
-## Test organization
-
-| Path | Type | What it tests |
-|------|------|---------------|
-| `test_graphiti_mock.py` | Unit (mocked) | Full pipeline: add_episode, add_episode_bulk, search, add_triplet, remove_episode |
-| `test_graphiti_int.py` | Integration | End-to-end with live DB |
-| `test_add_triplet.py` | Unit + Integration | Triplet creation, dedup, edge resolution |
-| `test_edge_int.py` | Integration | Edge CRUD operations per driver |
-| `test_node_int.py` | Integration | Node CRUD operations per driver |
-| `test_entity_exclusion_int.py` | Integration | Entity type exclusion filtering |
-| `test_text_utils.py` | Unit | Text truncation utilities |
-| `helpers_test.py` | Unit | `semaphore_gather`, `validate_group_id`, `lucene_sanitize` |
-| `llm_client/` | Unit | Per-provider LLM client tests (OpenAI, Anthropic, Gemini, Groq, Azure) |
-| `embedder/` | Unit | Per-provider embedder tests (OpenAI, Gemini, Voyage) |
-| `driver/` | Unit | FalkorDB driver tests |
-| `cross_encoder/` | Unit | BGE reranker, Gemini reranker |
-| `utils/` | Unit | Content chunking, maintenance operations |
-| `evals/` | Evaluation | LongMemEval benchmark (large JSON data file in `evals/data/`) |
-
-## Running tests
-
-```bash
-# Unit tests only (disable all DB drivers)
-DISABLE_NEPTUNE=1 DISABLE_NEO4J=1 DISABLE_FALKORDB=1 DISABLE_KUZU=1 uv run pytest tests/ -x --tb=short
-
-# Single test file
-uv run pytest tests/test_graphiti_mock.py -x
-
-# Integration tests (need live Neo4j)
-uv run pytest tests/test_graphiti_int.py -x --tb=short
-```
-
-## Contracts
-
-- Integration tests use `_int` suffix in filename and `@pytest.mark.integration` decorator.
-- Tests separated into unit, database, and API integration categories (see commit `e72f810`).
-- `evals/data/longmemeval_data/longmemeval_oracle.json` is ~3.8M tokens. Don't try to read or process it in normal test runs.
-
-## Pitfalls
-
-- `make test` disables FalkorDB, Kuzu, Neptune via env vars. Only runs Neo4j + pure unit tests.
-- Neptune is force-disabled in test fixtures (`os.environ['DISABLE_NEPTUNE'] = 'True'`).
-- Mock tests patch at the module level where functions are imported, not where they're defined. Watch import paths.
-- FalkorDB driver tests need a running FalkorDB instance (default port 6379).
diff --git a/eval-harness/.index-cache-preserve/pdm-flat_llm/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-flat_llm/AGENTS.md
deleted file mode 100644
index 70477c4..0000000
--- a/eval-harness/.index-cache-preserve/pdm-flat_llm/AGENTS.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project
-
-PDM is a Python package and dependency manager supporting PEP 517/621. It provides dependency resolution (via resolvelib or uv), virtual environment management, a plugin system, and a build frontend. Entry point: `pdm.core:main`.
-
-## Development commands
-
-```bash
-# Install all dev dependencies
-pdm install
-
-# Run full test suite
-pdm run test
-
-# Run tests in parallel (much faster)
-pdm run test -n auto
-
-# Skip slow integration tests
-pdm run test -n auto -m "not integration"
-
-# Run a single test file
-pdm run test tests/cli/test_add.py
-
-# Run a single test by name
-pdm run test -k "test_add_package"
-
-# Run tests with coverage
-pdm run coverage
-
-# Lint (ruff format + ruff lint + codespell + mypy)
-pdm run lint
-
-# Serve docs locally
-pdm run doc
-```
-
-Linting uses `prek` (pre-commit runner). Install it separately, then `prek install` to set up hooks.
-
-### News fragments
-
-Every PR needs a news fragment in `news/` named `<issue_num>.<type>.md` where type is one of: `feature`, `bugfix`, `refactor`, `doc`, `dep`, `removal`, `misc`. Content is a single imperative-mood sentence.
-
-## Architecture
-
-### Core → Project → Environment pipeline
-
-The central flow is: `Core` creates a `Project`, which manages a `BaseEnvironment`, which the resolver and installer operate against.
-
-- **`Core`** (`src/pdm/core.py`): Top-level DI container. Holds `project_class`, `repository_class`, `install_manager_class` as swappable class attributes. Auto-discovers CLI commands via `pkgutil.iter_modules` on `pdm.cli.commands`. Loads plugins from `pdm` and `pdm.plugin` entry point groups.
-
-- **`Project`** (`src/pdm/project/core.py`): Represents a PDM project. Owns `PyProject` (toml parsing), lockfile, config, environment, and Python info. The `root` path is discovered by `find_project_root()` walking up the directory tree.
-
-- **Environments** (`src/pdm/environments/`): `BaseEnvironment` → `PythonEnvironment` (venv-based) and `PythonLocalEnvironment` (PEP 582). `BareEnvironment` is for operations that don't need a real Python.
-
-### Command system
-
-All commands live in `src/pdm/cli/commands/`, one file per command (or directory for sub-commands like `venv/`, `fix/`, `publish/`). Each exports a `Command` class inheriting `BaseCommand`. Registration is automatic — just create the file.
-
-`BaseCommand.arguments` controls which standard options (verbose, project, global) are attached. Override `add_arguments()` for custom args. Override `handle(project, options)` for logic.
-
-The `Option` class (`src/pdm/cli/options.py`) wraps argparse args as reusable objects. The `CallbackAction` pattern lets options register deferred callbacks that run after project creation.
-
-### Resolver
-
-Two resolver backends behind the `Resolver` ABC (`src/pdm/resolver/base.py`):
-- `RLResolver` — resolvelib-based, the default
-- `UvResolver` — delegates to uv (experimental, set `use_uv = true` in config)
-
-Both produce a `Resolution` containing `Package` entries with pinned `Candidate` objects.
-
-### Installer / Synchronizer
-
-- `InstallManager` (`src/pdm/installers/manager.py`): Handles individual package install/uninstall
-- `Synchronizer` / `UvSynchronizer` (`src/pdm/installers/synchronizers.py`, `uv.py`): Orchestrates syncing the full environment against a lockfile
-- `install_wheel` in `installers/installers.py`: Low-level wheel installation
-
-### Plugin system
-
-Plugins are callables loaded from entry points (`pdm` or `pdm.plugin` groups). They receive the `Core` instance and can:
-- Register new commands via `core.register_command()`
-- Add config items via `core.add_config()`
-- Connect to signals for lifecycle hooks
-- Replace `core.project_class`, `core.repository_class`, or `core.install_manager_class`
-
-Project-local plugins go in `.pdm-plugins/` directory.
-
-### Signal system
-
-`src/pdm/signals.py` uses `blinker.NamedSignal`. Key signals: `pre_lock`, `post_lock`, `pre_install`, `post_install`, `pre_build`, `post_build`, `pre_publish`, `post_publish`, `pre_run`, `post_run`, `pre_invoke`. The `HookManager` (`src/pdm/cli/hooks.py`) wraps signal emission with skip logic (`:all`, `:pre`, `:post`, or individual names).
-
-### Lockfile formats
-
-Two lockfile implementations behind `Lockfile` ABC (`src/pdm/project/lockfile/base.py`):
-- `PDMLock` — the default `pdm.lock` format
-- `PyLock` — PEP pylock.toml format
-
-### Format converters
-
-`src/pdm/formats/` contains importers/exporters for pipfile, poetry, flit, setup.py, requirements.txt, uv, and pylock. Each module implements `check_fingerprint()`, `convert()`, and `export()`.
-
-### Group selection
-
-`GroupSelection` (`src/pdm/cli/filters.py`) handles dependency group filtering (default, dev, optional groups, exclusions). Many commands accept `--group`, `--dev`, `--no-default` flags that feed into this.
-
-## Test infrastructure
-
-Tests use `pdm.pytest` (`src/pdm/pytest.py`), a public fixture module also usable by plugin developers. Key fixtures:
-- `pdm` callable fixture: invokes CLI commands programmatically and captures output
-- `project`: a pre-configured test `Project` with mocked PyPI indexes
-- `pypi_indexes` / `index`: mock package index serving from `tests/fixtures/`
-- `build_env_wheels`: pre-built wheels for build backends
-
-Test data lives in `tests/fixtures/` — artifacts (wheels/tarballs), mock index HTML, sample projects, and lockfiles.
-
-Markers: `@pytest.mark.network` (needs internet), `@pytest.mark.integration` (run with all Python versions), `@pytest.mark.path` (system path comparison), `@pytest.mark.uv` (needs uv installed).
-
-## Key config
-
-- **ruff**: line-length 120, target py38, isort + bugbear + comprehensions enabled. `tests/fixtures` excluded.
-- **mypy**: strict (disallow_untyped_defs/decorators), namespace packages, `src/` as mypy_path. Excludes `pep582/`, `models/in_process/`, `misc/`.
-- Python support: 3.9+
diff --git a/eval-harness/.index-cache-preserve/pdm-flat_llm/CLAUDE.md b/eval-harness/.index-cache-preserve/pdm-flat_llm/CLAUDE.md
deleted file mode 100644
index 70477c4..0000000
--- a/eval-harness/.index-cache-preserve/pdm-flat_llm/CLAUDE.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project
-
-PDM is a Python package and dependency manager supporting PEP 517/621. It provides dependency resolution (via resolvelib or uv), virtual environment management, a plugin system, and a build frontend. Entry point: `pdm.core:main`.
-
-## Development commands
-
-```bash
-# Install all dev dependencies
-pdm install
-
-# Run full test suite
-pdm run test
-
-# Run tests in parallel (much faster)
-pdm run test -n auto
-
-# Skip slow integration tests
-pdm run test -n auto -m "not integration"
-
-# Run a single test file
-pdm run test tests/cli/test_add.py
-
-# Run a single test by name
-pdm run test -k "test_add_package"
-
-# Run tests with coverage
-pdm run coverage
-
-# Lint (ruff format + ruff lint + codespell + mypy)
-pdm run lint
-
-# Serve docs locally
-pdm run doc
-```
-
-Linting uses `prek` (pre-commit runner). Install it separately, then `prek install` to set up hooks.
-
-### News fragments
-
-Every PR needs a news fragment in `news/` named `<issue_num>.<type>.md` where type is one of: `feature`, `bugfix`, `refactor`, `doc`, `dep`, `removal`, `misc`. Content is a single imperative-mood sentence.
-
-## Architecture
-
-### Core → Project → Environment pipeline
-
-The central flow is: `Core` creates a `Project`, which manages a `BaseEnvironment`, which the resolver and installer operate against.
-
-- **`Core`** (`src/pdm/core.py`): Top-level DI container. Holds `project_class`, `repository_class`, `install_manager_class` as swappable class attributes. Auto-discovers CLI commands via `pkgutil.iter_modules` on `pdm.cli.commands`. Loads plugins from `pdm` and `pdm.plugin` entry point groups.
-
-- **`Project`** (`src/pdm/project/core.py`): Represents a PDM project. Owns `PyProject` (toml parsing), lockfile, config, environment, and Python info. The `root` path is discovered by `find_project_root()` walking up the directory tree.
-
-- **Environments** (`src/pdm/environments/`): `BaseEnvironment` → `PythonEnvironment` (venv-based) and `PythonLocalEnvironment` (PEP 582). `BareEnvironment` is for operations that don't need a real Python.
-
-### Command system
-
-All commands live in `src/pdm/cli/commands/`, one file per command (or directory for sub-commands like `venv/`, `fix/`, `publish/`). Each exports a `Command` class inheriting `BaseCommand`. Registration is automatic — just create the file.
-
-`BaseCommand.arguments` controls which standard options (verbose, project, global) are attached. Override `add_arguments()` for custom args. Override `handle(project, options)` for logic.
-
-The `Option` class (`src/pdm/cli/options.py`) wraps argparse args as reusable objects. The `CallbackAction` pattern lets options register deferred callbacks that run after project creation.
-
-### Resolver
-
-Two resolver backends behind the `Resolver` ABC (`src/pdm/resolver/base.py`):
-- `RLResolver` — resolvelib-based, the default
-- `UvResolver` — delegates to uv (experimental, set `use_uv = true` in config)
-
-Both produce a `Resolution` containing `Package` entries with pinned `Candidate` objects.
-
-### Installer / Synchronizer
-
-- `InstallManager` (`src/pdm/installers/manager.py`): Handles individual package install/uninstall
-- `Synchronizer` / `UvSynchronizer` (`src/pdm/installers/synchronizers.py`, `uv.py`): Orchestrates syncing the full environment against a lockfile
-- `install_wheel` in `installers/installers.py`: Low-level wheel installation
-
-### Plugin system
-
-Plugins are callables loaded from entry points (`pdm` or `pdm.plugin` groups). They receive the `Core` instance and can:
-- Register new commands via `core.register_command()`
-- Add config items via `core.add_config()`
-- Connect to signals for lifecycle hooks
-- Replace `core.project_class`, `core.repository_class`, or `core.install_manager_class`
-
-Project-local plugins go in `.pdm-plugins/` directory.
-
-### Signal system
-
-`src/pdm/signals.py` uses `blinker.NamedSignal`. Key signals: `pre_lock`, `post_lock`, `pre_install`, `post_install`, `pre_build`, `post_build`, `pre_publish`, `post_publish`, `pre_run`, `post_run`, `pre_invoke`. The `HookManager` (`src/pdm/cli/hooks.py`) wraps signal emission with skip logic (`:all`, `:pre`, `:post`, or individual names).
-
-### Lockfile formats
-
-Two lockfile implementations behind `Lockfile` ABC (`src/pdm/project/lockfile/base.py`):
-- `PDMLock` — the default `pdm.lock` format
-- `PyLock` — PEP pylock.toml format
-
-### Format converters
-
-`src/pdm/formats/` contains importers/exporters for pipfile, poetry, flit, setup.py, requirements.txt, uv, and pylock. Each module implements `check_fingerprint()`, `convert()`, and `export()`.
-
-### Group selection
-
-`GroupSelection` (`src/pdm/cli/filters.py`) handles dependency group filtering (default, dev, optional groups, exclusions). Many commands accept `--group`, `--dev`, `--no-default` flags that feed into this.
-
-## Test infrastructure
-
-Tests use `pdm.pytest` (`src/pdm/pytest.py`), a public fixture module also usable by plugin developers. Key fixtures:
-- `pdm` callable fixture: invokes CLI commands programmatically and captures output
-- `project`: a pre-configured test `Project` with mocked PyPI indexes
-- `pypi_indexes` / `index`: mock package index serving from `tests/fixtures/`
-- `build_env_wheels`: pre-built wheels for build backends
-
-Test data lives in `tests/fixtures/` — artifacts (wheels/tarballs), mock index HTML, sample projects, and lockfiles.
-
-Markers: `@pytest.mark.network` (needs internet), `@pytest.mark.integration` (run with all Python versions), `@pytest.mark.path` (system path comparison), `@pytest.mark.uv` (needs uv installed).
-
-## Key config
-
-- **ruff**: line-length 120, target py38, isort + bugbear + comprehensions enabled. `tests/fixtures` excluded.
-- **mypy**: strict (disallow_untyped_defs/decorators), namespace packages, `src/` as mypy_path. Excludes `pep582/`, `models/in_process/`, `misc/`.
-- Python support: 3.9+
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/CLAUDE.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/CLAUDE.md
deleted file mode 100644
index 5b5462a..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/CLAUDE.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# CLAUDE.md
-
-## Purpose
-
-PDM — a Python package and dependency manager supporting PEP standards. Dual resolver (resolvelib + uv), dual lockfile format (pdm.lock + pylock.toml), plugin system via entry points.
-
-## Build, Test, Run
-
-```bash
-# Install dev dependencies
-pdm install -d
-
-# Run tests (uses pytest, pdm.pytest plugin provides fixtures)
-pdm run pytest tests/ -x
-pdm run pytest tests/cli/test_add.py -k "test_add_package"  # single test
-
-# Tests requiring network: marked with @pytest.mark.network
-# Tests requiring uv: marked with @pytest.mark.uv (skipped if uv not installed)
-
-# Build
-pdm build
-
-# Lint (pre-commit)
-pre-commit run --all-files
-```
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Add a new CLI command | `src/pdm/cli/commands/` — create module with `Command` class |
-| Fix dependency resolution | `src/pdm/resolver/` — `providers.py` for strategy, `resolvelib.py`/`uv.py` for backends |
-| Fix install/sync issues | `src/pdm/installers/synchronizers.py` — diff logic in `compare_with_working_set` |
-| Fix lockfile read/write | `src/pdm/project/lockfile/` — `pdmlock.py` or `pylock.py` |
-| Fix requirement parsing | `src/pdm/models/requirements.py` — `parse_line()` / `parse_requirement()` |
-| Fix marker/specifier logic | `src/pdm/models/markers.py` and `specifiers.py` |
-| Fix project config | `src/pdm/project/config.py` — `Config._config_map` has all keys |
-| Fix format import/export | `src/pdm/formats/` — protocol: `check_fingerprint`, `convert`, `export` |
-| Write a plugin | `src/pdm/core.py` — entry point group `"pdm"`, receives `Core` instance |
-| Understand test fixtures | `src/pdm/pytest.py` — `pdm`, `project`, `repository`, `working_set` fixtures |
-
-## Code Map
-
-```
-CLI Layer (cli/)
-  ├── commands/*.py      argparse subcommands
-  ├── actions.py         shared operations (do_lock, do_sync)
-  ├── hooks.py           signal-based lifecycle hooks
-  └── filters.py         group selection logic
-
-Domain Layer
-  ├── models/            requirements, candidates, markers, specifiers, caches
-  ├── project/           Project class, config, pyproject, lockfile
-  ├── resolver/          resolvelib + uv backends, providers, graph
-  └── formats/           import from poetry/flit/pipfile, export to requirements.txt/pylock
-
-Execution Layer
-  ├── installers/        synchronizers (diff + apply), wheel install, uninstall
-  ├── environments/      venv, PEP 582, bare env
-  └── builders/          sdist, wheel, editable builds
-```
-
-Data flows top-to-bottom: CLI calls `actions.py`, which uses `resolver` to produce `Resolution`, then `installers` to apply it. `models/` and `project/` are shared across all layers.
-
-### Plugin System
-
-Plugins load from `"pdm"` and `"pdm.plugin"` entry point groups. A plugin is a callable receiving `Core`:
-
-```python
-def my_plugin(core):
-    core.register_command(MyCommand)        # add CLI command
-    Config.add_config("key", ConfigItem())  # add config key (modifies class-level dict)
-    Core.project_class = MyProject          # swap project class
-```
-
-Failures are logged but don't abort startup.
-
-### Dual Resolver
-
-- `RLResolver` — wraps `resolvelib`. Four strategies: `all`, `reuse`, `eager`, `reuse-installed`.
-- `UvResolver` — shells out to `uv lock`. Only supports `all` and `reuse`; warns on others.
-
-### Dual Lockfile
-
-- `pdm.lock` — PDM native format. Detected by `metadata.lock_version` key.
-- `pylock.toml` — PEP 751 format. Detected by `lock-version` key. Requires `FLAG_INHERIT_METADATA`.
-- Format detection is content-based, not filename-based.
-
-## Contracts
-
-- **Command registration**: auto-discovery looks for `module.Command` exactly. Other names are silently skipped.
-- **`Requirement` identity**: `__eq__` and `__hash__` use `(key, extras, marker)` — NOT version specifier. Two reqs for the same package at different versions are "equal."
-- **`Requirement.key` is always lowercase-normalized**: never call `.lower()` on it again.
-- **`TOMLFile` write gate**: `open_for_write()` must be called before `write()`. Read uses fast `tomllib`; write re-parses with `tomlkit` to preserve formatting.
-- **`Config._config_map` is a class variable**: `add_config()` affects all instances globally.
-- **`content_hash()` scope**: covers dependencies, requires-python, resolution config. Does NOT include build backend config.
-- **Group name `"default"` is reserved**: `GroupSelection` always sorts it first. Can't be used as a dependency group name.
-
-## Pitfalls
-
-### Lockfile hash clearing on env_spec append (#3611)
-
-Appending to a lockfile with a new `env_spec` was clearing hashes for existing entries. The fix ensures hashes are preserved when merging lock targets.
-
-### Circular file dependencies cause infinite recursion (#3539)
-
-`FileRequirement.__post_init__` calls `Setup.from_directory()` which walks the filesystem. A project depending on itself via path creates infinite recursion. Guarded by `_checked_paths` module-level set — but it's not thread-safe and not cleared between test runs.
-
-### Adding dependency duplicates lockfile entries (#3546)
-
-`do_lock` with `--update-reuse` could duplicate entries when appending. Fixed by deduplicating on the candidate key.
-
-### Resolution overrides drop extra dependencies (#3428)
-
-Using `[tool.pdm.resolution.overrides]` could silently drop extras from transitive dependencies. The override was applied too broadly.
-
-### `pdm add`/`update` remove dependency groups incorrectly (#3419)
-
-Group manipulation in add/update was using incorrect group filtering, removing groups that shouldn't be touched.
-
-### UV mode: transitive extras not installed (#3559)
-
-When `USE_UV=true`, extra dependencies of transitive dependencies weren't properly forwarded to `uv sync`.
-
-### `packaging` 26 compatibility (#3730)
-
-`packaging` 26 changed APIs. PDM's `specifiers.py` and related code needed updates to handle the new version.
-
-### pylock.toml + git dependency lock failure (#3695)
-
-Git dependencies caused `format_lockfile()` to fail because the pylock converter didn't handle VCS URLs.
-
-### `PdmUsageError` suppresses tracebacks
-
-Any subclass of `PdmUsageError` prints without traceback at normal verbosity. Use `-v` to see the stack. Other `PdmException` subclasses show "add -v" hint.
-
-### `Project` stored as `weakref.proxy` in environments
-
-`isinstance(env.project, Project)` returns `False`. Don't pass `env.project` to functions that check `isinstance`.
-
-## Intent Layer
-
-### Downlinks
-
-| Area | Node | Description |
-|------|------|-------------|
-| CLI | `src/pdm/cli/AGENTS.md` | Command registration, actions, hooks, group selection |
-| Models | `src/pdm/models/AGENTS.md` | Requirements, candidates, markers, specifiers, caches, repositories |
-| Project | `src/pdm/project/AGENTS.md` | Project core, config, pyproject, lockfile formats |
-| Resolver | `src/pdm/resolver/AGENTS.md` | Dual resolver (resolvelib + uv), providers, graph |
-| Installers | `src/pdm/installers/AGENTS.md` | Synchronizers, wheel install, uninstall, caching |
-| Formats | `src/pdm/formats/AGENTS.md` | Import/export: poetry, flit, pipfile, requirements.txt, pylock |
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/cli/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/cli/AGENTS.md
deleted file mode 100644
index 0827710..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/cli/AGENTS.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# CLI — AGENTS.md
-
-## Purpose
-
-Commands, argument parsing, shared actions, hook lifecycle, and group selection.
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Add a new CLI command | `commands/` — create module with `Command` class |
-| Fix locking behavior | `actions.py` — `do_lock` |
-| Fix install/sync flow | `actions.py` — `do_sync` |
-| Fix group selection | `filters.py` — `GroupSelection` |
-| Fix hook lifecycle | `hooks.py` — `HookManager` |
-| Fix CLI option behavior | `options.py` — shared option definitions |
-
-## Code Map
-
-| Looking for... | Go to |
-|---|---|
-| Shared CLI option definitions (`-G`, `--lockfile`, etc.) | `options.py` |
-| Dependency group resolution (`:all`, `--dev`, `--prod`) | `filters.py` — `GroupSelection` |
-| Pre/post hook lifecycle | `hooks.py` — `HookManager.try_emit` |
-| Locking algorithm entry point | `actions.py` — `do_lock` |
-| Install/sync entry point | `actions.py` — `do_sync` |
-| Lockfile staleness check | `actions.py` — `check_lockfile` |
-| Dependency graph for `pdm list` | `utils.py` — `build_dependency_graph` |
-| How commands are auto-registered | `core.py` — `pkgutil.iter_modules` + `register_command` |
-| Venv sub-commands | `commands/venv/__init__.py` |
-| Script runner / composite tasks | `commands/run.py` — `TaskRunner` |
-| Save strategy (compatible/wildcard/exact) | `utils.py` — `save_version_specifiers` |
-
-## Key Relationships
-
-```
-commands/*.py  →  actions.py  →  models/, project/, installers/
-                  filters.py
-                  hooks.py
-                  options.py
-```
-
-Commands are consumers of `actions.py`, never the reverse. `actions.py` calls into `resolver`, `installers`, and `project` layers.
-
-**Command registration**: `Core.init_parser()` auto-discovers modules in `pdm.cli.commands` via `pkgutil.iter_modules`. Looks for `module.Command` — other names silently skipped. The command instance is stored via `parser.set_defaults(command=cmd)`.
-
-**Options callback flow**: Some options (`--frozen-lockfile`, `--no-isolation`) queue callbacks in `namespace.callbacks`. `Core.main()` runs these AFTER project creation. Options that mutate the project (like `enable_write_lockfile`) won't take effect if `do_lock` is called before callbacks run.
-
-## Contracts
-
-- `Command.arguments` is a tuple of `Option`/`ArgumentGroup` instances. Order matters for help output.
-- `HookManager` skip semantics: `:all` skips everything, `:pre`/`:post` skip by prefix, individual names are exact matches. OR-combined.
-- `GroupSelection.all()` returns `None` (meaning "all groups") vs `list(selection)` which returns concrete groups. Passing `None` to `do_lock(groups=...)` triggers different behavior than passing an explicit list.
-- `check_lockfile` returns `"all"` (missing), `"reuse"` (incompatible), or `None` (up to date). Not a boolean.
-- `do_add`/`do_remove` on command classes are `@staticmethod` — stable public API despite being on `Command` classes.
-
-## Pitfalls
-
-### `GroupSelection.all()` vs `list(selection)` are semantically different
-
-`all()` returns `None` when unset (means "use all project groups"). `list(selection)` always returns a concrete list. Several commands pass `selection.all()` deliberately for the fallback behavior.
-
-### `save_version_specifiers` mutates `Requirement` objects in place
-
-Modifies `r.specifier` directly. If the same requirement objects are used elsewhere, both callers see the mutated value.
-
-### `PdmFormatter` is skipped on Python 3.14+
-
-`utils.py` switches to `RawDescriptionHelpFormatter` on 3.14+. Don't add logic to `PdmFormatter` expecting it to run everywhere.
-
-### `GroupSelection.validate()` raises on extra groups, not missing ones
-
-It compares requested groups against lockfile groups. A group in `pyproject.toml` but not in the lockfile is silently dropped when `exclude_non_existing=True`.
-
-### `pdm add`/`update` removed dependency groups incorrectly (#3419, #3454)
-
-Group filtering in add/update was removing groups that shouldn't be touched. The override URL and some groups were being dropped from `pdm.lock` upon adding a new dependency.
-
-### `ExtendMapAction` builds a dict, not a list
-
-`--config-setting key=value` produces `{"key": "value"}`, not a list. Repeated keys become `{"key": ["v1", "v2"]}`.
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/formats/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/formats/AGENTS.md
deleted file mode 100644
index 32c9d56..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/formats/AGENTS.md
+++ /dev/null
@@ -1,84 +0,0 @@
-# Formats — AGENTS.md
-
-## Purpose
-
-Import from poetry/flit/pipfile, export to requirements.txt/pylock.toml.
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Fix requirements.txt export | `requirements.py` — `export()` |
-| Fix poetry import | `poetry.py` — `PoetryMetaConverter` |
-| Fix flit import | `flit.py` — `FlitMetaConverter` |
-| Fix pylock.toml output | `pylock.py` — `PyLockConverter` |
-| Add new format | `__init__.py` — add to `FORMATS` dict |
-
-## Design Rationale
-
-**Protocol-based dispatch**: `FORMATS` dict maps names to modules conforming to an informal protocol: `check_fingerprint`, `convert`, `export`. Import-only formats raise `NotImplementedError` in `export`.
-
-**`MetaConverter` metaclass pattern**: Subclasses decorate methods with `@convert_from(field, name)`. The metaclass collects these into `._converters`. `convert()` iterates them, collecting errors rather than aborting, then raises `MetaConvertError` with partial results.
-
-## Code Map
-
-| Looking for... | Go to |
-|---|---|
-| Registered import/export formats | `__init__.py` — `FORMATS` dict |
-| Base metaclass converter | `base.py` — `class MetaConverter` |
-| `@convert_from` decorator | `base.py` — `convert_from()` |
-| Parse requirements.txt | `requirements.py` — `RequirementParser` |
-| Export to requirements.txt | `requirements.py` — `export()` |
-| Import from Poetry | `poetry.py` — `PoetryMetaConverter` |
-| Import from flit | `flit.py` — `FlitMetaConverter` |
-| Write pylock.toml content | `pylock.py` — `PyLockConverter` |
-
-## Key Relationships
-
-- `FORMATS` registers: `pipfile`, `poetry`, `flit`, `setup_py`, `requirements`. **`pylock` and `uv` are NOT in `FORMATS`** — used internally only.
-- `PyProject._convert_pyproject()` imports `flit` and `poetry` directly for auto-conversion at parse time.
-- `PyLock.format_lockfile()` instantiates `PyLockConverter` — only hard dependency from lockfile layer into formats.
-- `@convert_from(field=None)` means the method receives the entire source dict and is responsible for `.pop()`ing what it consumes.
-
-## Contracts
-
-- Every format module must implement `check_fingerprint`, `convert`, `export`. No enforcement at import time.
-- `convert()` returns `(metadata, settings)` tuple. `metadata` → `[project]`, `settings` → `[tool.pdm]`.
-- `MetaConverter` on error raises `MetaConvertError` with `.data` and `.settings` holding partial results.
-- `@convert_from` methods that raise `Unset` produce no output key.
-- `PyLockConverter.convert()` requires `FLAG_INHERIT_METADATA`. Raises `ProjectError` without it.
-- `PyLockConverter._populate_hashes()` makes network calls. Runs inside a spinner.
-
-## Pitfalls
-
-### `check_fingerprint` called with `project=None`
-
-`PyProject._convert_pyproject()` passes `None` as project. Implementations that access `project.something` will raise `AttributeError`.
-
-### Poetry `^` operator is fully expanded
-
-`_convert_specifier("^1.2.3")` → `>=1.2.3,<2.0.0`. No way to recover original constraint. Exporting back to poetry gives expanded form.
-
-### `PoetryMetaConverter.requires_python()` has side effects
-
-It pops `"python"` from `source["dependencies"]` dict. The `dependencies` converter runs after and sees `python` already removed. Converter ordering matters.
-
-### `FORMATS` doesn't include `pylock` or `uv`
-
-Code iterating `FORMATS` for "all supported formats" misses these. The pylock format is write-only. The uv format is invoked directly by import, not registered.
-
-### `RequirementParser` silently drops per-requirement options
-
-Lines like `requests --global-option="..."` — anything after ` -` is stripped. Parser does `line.split(" -", 1)[0]`.
-
-### pylock.toml + git dependency lock failure (#3695)
-
-Git dependencies caused `format_lockfile()` to fail because the pylock converter didn't handle VCS URLs.
-
-### Export from pylock produces empty requirements.txt (#3573)
-
-`pdm export -f pylock` then re-export to requirements.txt produced empty output due to missing URL population.
-
-### Editable local packages cause empty URLs in pylock (#3566)
-
-Editable local packages weren't getting proper URLs when pylock format was used.
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/installers/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/installers/AGENTS.md
deleted file mode 100644
index 9fd56ae..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/installers/AGENTS.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# Installers — AGENTS.md
-
-## Purpose
-
-Synchronizers (diff + apply), wheel installation, uninstallation, install caching.
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Fix install diff logic | `base.py` — `BaseSynchronizer.compare_with_working_set` |
-| Fix parallel install | `synchronizers.py` — `Synchronizer` |
-| Fix uv sync | `uv.py` — `UvSynchronizer` |
-| Fix wheel install | `installers.py` — `install_wheel` |
-| Fix uninstall | `uninstallers.py` — `StashedRemovePaths` |
-| Fix install caching | `manager.py` — `InstallManager` |
-
-## Design Rationale
-
-Two parallel hierarchies:
-1. **Synchronizer**: computes diff between resolved candidates and working set, dispatches install/update/remove. `Synchronizer` adds Rich UI + parallel execution. `UvSynchronizer` delegates to `uv sync`.
-2. **InstallManager**: handles individual wheel install/uninstall. Used by `Synchronizer` but NOT by `UvSynchronizer`.
-
-Uninstallation is transactional via `StashedRemovePaths` — files moved to temp dir before deletion, rollback on failure.
-
-## Code Map
-
-| File | Purpose |
-|---|---|
-| `base.py` | `BaseSynchronizer` — diff logic (`compare_with_working_set`) |
-| `synchronizers.py` | `Synchronizer` — Rich UI, parallel install via `ThreadPoolExecutor` |
-| `uv.py` | `UvSynchronizer` — delegates to `uv sync` subprocess |
-| `manager.py` | `InstallManager` — single-dist install/uninstall/overwrite |
-| `installers.py` | `install_wheel`, `InstallDestination` — low-level wheel install |
-| `uninstallers.py` | `StashedRemovePaths` — transactional file removal |
-| `core.py` | `install_requirements` — convenience: resolve + sync in one call |
-
-## Contracts
-
-- `BaseSynchronizer.synchronize()` must be called after construction — construction doesn't touch filesystem.
-- `compare_with_working_set` returns `(to_add, to_update, to_remove)` as sorted lists of string keys.
-- `InstallManager.overwrite` installs new first, then removes only non-overlapping old files. Not remove-then-install.
-- `StashedRemovePaths`: must call `remove()` then `commit()` in sequence. `rollback()` without prior `remove()` is a no-op.
-- `UvSynchronizer` requires a venv. Raises `ProjectError` without one. Also rejects PEP 582 mode.
-- `install_wheel` writes `INSTALLER: pdm` metadata and optionally `direct_url.json`.
-- Sequential packages: `pip`, `setuptools`, `wheel` — always installed sequentially, never in parallel.
-- Editable packages are always installed sequentially.
-
-## Pitfalls
-
-### `.pdmtmp` pth files persist on crash
-
-Parallel installation uses `.pdmtmp` suffix on `.pth` files. If process is killed before `_fix_pth_files` runs, packages won't be importable until suffix is stripped. Running `pdm install` again fixes it.
-
-### `editables` package bypasses install cache
-
-`InstallManager.NO_CACHE_PACKAGES = ("editables",)`. The `editables` helper writes `.pth` files referencing paths — caching it causes incorrect behavior.
-
-### `overwrite` leaves orphan files
-
-`StashedRemovePaths.difference_update` excludes directories containing new install files from removal. Old files in those directories that the new install doesn't cover are silently left behind.
-
-### `UvSynchronizer` with `dry_run=True` provides no output
-
-uv has no dry-run mode. The synchronizer prints a warning and exits — callers get no information.
-
-### `compare_with_working_set` uses `locked_repository.all_candidates`
-
-A package removed from resolution but still in old lockfile won't be cleaned unless `--clean` or `--only-keep` is used.
-
-### Install self for BaseSynchronizer (#3491)
-
-Self-installation logic had a bug where the project itself wasn't being installed in certain synchronizer configurations.
-
-### Non-existent library paths skipped (#3561)
-
-The synchronizer was failing on non-existent library paths. Fixed by adding existence checks before attempting to process them.
-
-### Reinstalling local wheel should check signature (#3514)
-
-Local wheel reinstalls weren't checking the package signature, potentially leaving stale installs.
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/models/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/models/AGENTS.md
deleted file mode 100644
index 8f0039b..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/models/AGENTS.md
+++ /dev/null
@@ -1,98 +0,0 @@
-# Models — AGENTS.md
-
-## Purpose
-
-Core domain types: requirements, candidates, markers, specifiers, caches, repositories, working set.
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Fix requirement parsing | `requirements.py` — `parse_line()` / `parse_requirement()` |
-| Fix marker evaluation | `markers.py` — `Marker.matches()` / `split_pyspec()` |
-| Fix version specifiers | `specifiers.py` — `PySpecSet` |
-| Fix package discovery | `repositories/pypi.py` — `PyPIRepository` |
-| Fix lockfile candidate reading | `repositories/lock.py` — `LockedRepository` |
-| Fix HTTP/auth issues | `session.py` / `auth.py` |
-
-## Design Rationale
-
-Three-tier split: `Requirement` (spec) → `Candidate` (concrete match) → `PreparedCandidate` (ready to install). This governs most interactions between resolver, installers, and lockfile.
-
-Marker/specifier logic wraps `dep_logic` and `packaging` with PDM-specific merge operators and Python-version splitting.
-
-## Code Map
-
-| Looking for... | Go to |
-|---|---|
-| Parse requirement string `"requests>=2.0"` | `requirements.py` → `parse_line()` / `parse_requirement()` |
-| Parse `-e ./local/path` | `requirements.py` → `parse_line()` (strips `-e`, sets `editable=True`) |
-| VCS URL normalization | `requirements.py` → `VcsRequirement._parse_url()` |
-| Combine Python version constraints | `specifiers.py` → `PySpecSet.__and__` / `__or__` |
-| Convert specifier to marker string | `specifiers.py` → `PySpecSet.as_marker_string()` |
-| Split `python_version` from marker | `markers.py` → `Marker.split_pyspec()` |
-| Candidate → lockfile dict | `candidates.py` → `Candidate.as_lockfile_entry()` |
-| Build/download a candidate | `candidates.py` → `PreparedCandidate.build()` / `_obtain()` |
-| HTTP client with caching | `session.py` → `PDMPyPIClient` |
-| Credential resolution (netrc, keyring) | `auth.py` → `PdmBasicAuth` |
-| Installed packages (sys.path) | `working_set.py` → `WorkingSet` |
-| Read lockfile into candidates | `repositories/lock.py` → `LockedRepository` |
-| Fetch candidates from PyPI | `repositories/pypi.py` → `PyPIRepository` |
-
-## Type Hierarchy
-
-```
-Requirement (base, @dataclass eq=False)
-  ├── NamedRequirement       versioned deps
-  ├── FileRequirement        local paths + URLs
-  └── VcsRequirement         git/hg/svn/bzr
-
-BaseRepository
-  ├── PyPIRepository         live index lookups
-  └── LockedRepository       reads pdm.lock or pylock.toml
-```
-
-## Contracts
-
-- **Requirement identity**: `__eq__`/`__hash__` based on `(key, extras, marker)` — NOT version. Two reqs for same package at different versions hash the same. Intentional for resolver identity.
-- **`Requirement.key` is always lowercase-normalized**.
-- **`FileRequirement` has filesystem side effects**: `__post_init__` calls `Setup.from_directory()`. Guarded by `_checked_paths` set (not thread-safe, not cleared between tests).
-- **`PreparedCandidate.metadata` triggers build on first access**: accessing `.metadata` may download, unpack, and PEP 517 build. Must be inside an active environment context.
-- **`CandidateInfoCache` key requires both name and version**: missing either always causes a cache miss (intentional).
-- **`WorkingSet` normalizes names on insertion**: keys are `normalize_name(dist.metadata["Name"])`. Lookups must use normalized form.
-- **`LockedRepository` format detection**: checks for `"lock-version"` key (pylock) vs `"metadata.lock_version"` (pdm native). Absence of `"lock-version"` signals pdm format.
-- **`PySpecSet("<empty>")` is a sentinel**: round-trips through `str()` + constructor, but NOT through `packaging.SpecifierSet`.
-
-## Pitfalls
-
-### `FileRequirement` constructor walks the filesystem
-
-Constructing a `FileRequirement` with a local path triggers `Setup.from_directory()` during `__post_init__`. The `_checked_paths` guard is a module-level set — not thread-safe and not cleared between test runs without module reload.
-
-### Requirement equality ignores specifier version
-
-`parse_requirement("requests>=1.0")` and `parse_requirement("requests>=2.0")` are equal and produce the same hash. Storing in a `dict` keyed by the requirement object silently loses version constraints.
-
-### `PySpecSet.as_marker_string()` raises on empty specifiers
-
-Callers must check `is_empty()` first. The resolver guards this, but code in formats or CLI may not.
-
-### `BaseRepository.get_dependencies` swallows intermediate errors
-
-`dependency_generators()` defines a priority chain. All `CandidateInfoNotFound` exceptions except the final one are silently retried with the next getter. Debugging why a build was triggered is non-obvious.
-
-### `PDMPackageFinder` with `minimal_version=True` uses `ReverseVersion`
-
-Subclasses `packaging.version.Version` to flip comparison. Mixing `ReverseVersion` with normal `Version` comparisons gives inverted results silently.
-
-### `Marker.split_pyspec()` LRU cache is per-class, not per-instance
-
-1024-slot cache fills with stale entries under long-running processes.
-
-### Huge debug logging with keyring + AWS index (#3642)
-
-`PdmBasicAuth` was generating excessive debug logs when keyring was active with private AWS indexes.
-
-### `resolution.excludes` not applied to lock candidates (#3727)
-
-Lock file candidate evaluation wasn't respecting `resolution.excludes`, allowing excluded packages through during reuse.
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/project/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/project/AGENTS.md
deleted file mode 100644
index b6245b8..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/project/AGENTS.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Project — AGENTS.md
-
-## Purpose
-
-Project core class, configuration, pyproject.toml handling, lockfile formats (pdm.lock + pylock.toml).
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Fix config key behavior | `config.py` — `Config._config_map` |
-| Fix pyproject.toml parsing | `project_file.py` — `PyProject` |
-| Fix lockfile read/write | `lockfile/pdmlock.py` or `lockfile/pylock.py` |
-| Fix lockfile format detection | `lockfile/__init__.py` — `load_lockfile()` |
-| Fix content hash staleness | `project_file.py` — `content_hash()` |
-
-## Design Rationale
-
-`Project` is the central service locator — holds config, pyproject, lockfile, environment, and cache objects. It doesn't resolve or install; it provides the configured objects that do.
-
-**Two-tier config**: `Config` uses `ChainMap` layering (env vars → file → defaults). Env vars always win.
-
-**Lazy TOML parsing**: `TOMLFile` uses fast `tomllib` for reads, switches to `tomlkit` only on `open_for_write()` to preserve formatting.
-
-**Auto-conversion**: `PyProject._parse()` silently converts flit/poetry formats at read time. Consumers never see the original.
-
-## Code Map
-
-| Looking for... | Go to |
-|---|---|
-| Main project object | `core.py` — `class Project` |
-| Read pyproject.toml | `project_file.py` — `class PyProject` |
-| All config keys and defaults | `config.py` — `Config._config_map` |
-| Config key lookup with env var fallback | `config.py` — `class EnvMap` |
-| Load lockfile (auto-detect format) | `lockfile/__init__.py` — `load_lockfile()` |
-| PDM native lockfile | `lockfile/pdmlock.py` — `class PDMLock` |
-| PEP 751 lockfile | `lockfile/pylock.py` — `class PyLock` |
-| Content hash for staleness | `project_file.py` — `PyProject.content_hash()` |
-| All dependencies by group | `core.py` — `Project.get_dependencies()` |
-
-## Contracts
-
-- `TOMLFile.open_for_write()` must precede `write()`. Read mode uses `tomllib`; write re-parses with `tomlkit`.
-- `Config._config_map` is a class variable. `add_config()` modifies it globally.
-- `Config.__getitem__` applies `ConfigItem.coerce` on every get, not on set.
-- `load_lockfile()` detects format from content, not filename. `config["lock.format"]` only applies to new lockfiles.
-- `Lockfile.format_lockfile()` replaces all content, not a merge.
-- `FLAG_INHERIT_METADATA` is required for pylock format — `PyLock.format_lockfile()` raises without it.
-- `content_hash()` covers: source, dependencies, dev-dependencies, optional-dependencies, requires-python, resolution. Build config is NOT included.
-
-## Pitfalls
-
-### `dev_dependencies` merges two sources silently
-
-`PyProject.dev_dependencies` reads from both `[dependency-groups]` (PEP 735) and `[tool.pdm.dev-dependencies]` (legacy). Same normalized name in both sections? Cross-section merges silently stack via `setdefault().extend()`. Only `[dependency-groups]` internal duplicates raise `ProjectError`.
-
-### `_convert_pyproject()` runs on `open_for_write()`
-
-Opening a flit/poetry project for write mutates the in-memory tomlkit doc. `write()` produces a PDM-format file. No opt-out.
-
-### `Lockfile.compatibility()` returns `SAME` when file doesn't exist
-
-A missing lockfile is treated as "up to date." Callers must check existence separately.
-
-### Config env var shadowing is a warning, not an error
-
-Setting a config key that has an active env var writes the value but the env var still wins on next read.
-
-### Project config has no defaults layer
-
-Global `Config` ChainMap has three layers; project config only has the file layer. Missing keys fall through to `NoConfigError` even if `_config_map` has a default.
-
-### Hash clearing on lockfile env_spec append (#3611)
-
-Appending to a lockfile with a new `env_spec` was clearing hashes for existing entries.
-
-### Package metadata missing from lockfile after 2.25 (#3547)
-
-Reading locked candidates after version 2.25 format changes could lose metadata. Fixed by searching lock file metadata first when reusing.
-
-### Adding dependency duplicates lockfile entries (#3546)
-
-`do_lock` with `--update-reuse` could duplicate entries. Fixed by deduplicating on candidate key.
-
-### `pdm.toml` not found during pre_build hook (#3621)
-
-`pdm.toml` created on-the-fly in a `pre_build` hook wasn't picked up by `pdm build` because config was loaded before hooks ran.
diff --git a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/resolver/AGENTS.md b/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/resolver/AGENTS.md
deleted file mode 100644
index e7f3279..0000000
--- a/eval-harness/.index-cache-preserve/pdm-intent_layer/src/pdm/resolver/AGENTS.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Resolver — AGENTS.md
-
-## Purpose
-
-Dual resolver: `RLResolver` (resolvelib) and `UvResolver` (uv lock subprocess). Provider strategies, graph post-processing.
-
-## Entry Points
-
-| Task | Start Here |
-|------|------------|
-| Fix resolution strategy | `providers.py` — `BaseProvider` + `_PROVIDER_REGISTRY` |
-| Fix resolvelib integration | `resolvelib.py` — `RLResolver` |
-| Fix uv resolver | `uv.py` — `UvResolver` |
-| Fix marker propagation | `graph.py` — `merge_markers` |
-| Fix Python constraint handling | `python.py` — `PythonRequirement` |
-
-## Code Map
-
-| File | Purpose |
-|---|---|
-| `base.py` | Abstract `Resolver`, `Resolution` named tuple |
-| `providers.py` | `BaseProvider` + strategy subclasses, `_PROVIDER_REGISTRY` |
-| `resolvelib.py` | `RLResolver` — calls resolvelib, builds `Package` list |
-| `uv.py` | `UvResolver` — shells out to `uv lock`, parses `uv.lock` |
-| `python.py` | `PythonRequirement`/`PythonCandidate` — Python as a synthetic dep |
-| `graph.py` | `merge_markers`, `populate_groups` — post-resolution marker propagation |
-| `reporters.py` | `LockReporter` (log) and `RichLockReporter` (progress UI) |
-
-## Key Relationships
-
-- `RLResolver.__post_init__` calls `project.get_provider(...)` — provider construction is delegated.
-- `BaseProvider` takes a `repository: BaseRepository`. Provider doesn't touch network directly.
-- `ReusePinProvider` checks `locked_repository` for cached deps before hitting live repo.
-- `UvResolver` uses `formats/uv.py:uv_file_builder` to generate temp pyproject + uv.lock.
-- Python interpreter is a first-class synthetic requirement — allows resolvelib to backtrack on Python conflicts.
-
-## Contracts
-
-- `update_strategy` must be in `_PROVIDER_REGISTRY`: `"all"`, `"reuse"`, `"eager"`, `"reuse-installed"`.
-- `UvResolver` only supports `all` and `reuse`. Others fallback to `reuse` with warning.
-- `BaseProvider.get_preference` always prioritizes Python (`not is_python` is first tuple element).
-- `find_matches` returns a callable returning an iterator (resolvelib's lazy contract).
-- `merge_markers` handles circular deps with a two-pass approach. Don't assume single-pass resolution.
-- `UvResolver` requires a virtual environment. Sets `UV_PROJECT_ENVIRONMENT` in subprocess.
-
-## Pitfalls
-
-### `UvResolver` doesn't support cross-platform resolution
-
-When `target.platform` differs from current machine, uv resolves against the *current* machine. Warns but doesn't error — result may be incorrect.
-
-### `eager` strategy mutates `tracked_names` as side effect
-
-`EagerUpdateProvider.get_dependencies` adds dep keys to `self.tracked_names` cumulatively. Multiple calls expand tracking.
-
-### `BaseProvider.overrides` is a `cached_property`
-
-Override files parsed once and cached. State changes after construction aren't reflected.
-
-### `:empty:` key in resolver mapping
-
-Source distributions whose name wasn't known until after build get a `:empty:` key. `RLResolver._do_resolve` renames them, but code consuming `result.mapping` directly will see `:empty:`.
-
-### Resolution overrides drop extra dependencies (#3428)
-
-Using `[tool.pdm.resolution.overrides]` could silently drop extras from transitive deps.
-
-### UV mode: transitive extras not installed (#3559)
-
-Extra dependencies of transitive deps weren't forwarded to `uv sync` properly.
-
-### Prerelease condition logic (#3645)
-
-`BaseProvider` had incorrect prerelease condition logic — prereleases were allowed/disallowed in wrong contexts.
-
-### `pdm lock --update-reuse` with URL deps (#3463)
-
-URL dependencies generated invalid lock files when using `--update-reuse`.
diff --git a/eval-harness/lib/agentbench_runner.py b/eval-harness/lib/agentbench_runner.py
index 679f3a3..800ba61 100644
--- a/eval-harness/lib/agentbench_runner.py
+++ b/eval-harness/lib/agentbench_runner.py
@@ -41,7 +41,11 @@ def _build_docker_cmd(setup_commands: list[str], commands: list[str]) -> str:
 
 
 def strip_docs(workspace: Path) -> int:
-    """Delete all .md files, .github/, docs/, .claude/, .cursor/, .codex/.
+    """Remove context/doc files that could leak hints, preserving build-required files.
+
+    Keeps README.md (and readme.md variants) because many setup.py/pyproject.toml
+    files read them for long_description.  Only strips known AI-context files and
+    documentation directories.
 
     Returns count of files/dirs removed.
     """
@@ -53,8 +57,11 @@ def strip_docs(workspace: Path) -> int:
             shutil.rmtree(dirpath)
             count += 1
 
-    # Remove markdown files
+    # Remove markdown files, but preserve README variants that builds depend on
+    _readme_names = {"readme.md", "readme.rst", "readme.txt", "readme"}
     for md_file in workspace.rglob("*.md"):
+        if md_file.name.lower() in _readme_names:
+            continue
         md_file.unlink()
         count += 1
 
@@ -180,20 +187,26 @@ def evaluate_instance(
     repo_total = len(repo_results)
     repo_passed = sum(1 for v in repo_results.values() if v)
 
-    # Check which tests flipped compared to expected (repo_test_after_pr_patch)
+    # Match paper's evaluation: only fail if a test that PASSED in the golden
+    # baseline (repo_test_after_pr_patch) now FAILS.  Pre-existing failures
+    # and tests missing from the golden baseline are ignored.
     expected = instance.repo_test_after_pr_patch
-    flipped = []
-    for test_id, actual_pass in repo_results.items():
-        expected_pass = expected.get(test_id)
-        if expected_pass is not None and actual_pass != expected_pass:
-            flipped.append(test_id)
-
-    repo_all_pass = repo_passed == repo_total and not flipped
-    parts.append(f"REGRESSION: {repo_passed}/{repo_total} passed")
-    if flipped:
-        parts.append(f"{len(flipped)} flipped: {', '.join(flipped[:5])}")
-
-    return repo_all_pass, " | ".join(parts)
+    regressions = []
+    for test_id, expected_pass in expected.items():
+        if not expected_pass:
+            continue  # already failing in golden — ignore
+        actual_pass = repo_results.get(test_id, True)  # missing defaults to True (paper behavior)
+        if not actual_pass:
+            regressions.append(test_id)
+
+    golden_pass = sum(1 for v in expected.values() if v)
+    golden_total = len(expected)
+    repo_ok = not regressions
+    parts.append(f"REGRESSION: {repo_passed}/{repo_total} passed (golden: {golden_pass}/{golden_total})")
+    if regressions:
+        parts.append(f"{len(regressions)} regressions: {', '.join(regressions[:5])}")
+
+    return repo_ok, " | ".join(parts)
 
 
 def build_prompt(
diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index 002895d..a154479 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -1,5 +1,6 @@
 # lib/docker_runner.py
 from __future__ import annotations
+import logging
 import os
 import subprocess
 import threading
@@ -8,6 +9,14 @@
 from pathlib import Path
 from typing import Callable
 
+logger = logging.getLogger(__name__)
+
+# Remote Docker host for native x86 execution (avoids QEMU on Apple Silicon).
+# Set EVAL_DOCKER_HOST=ryo@chronos to enable.  Workspace files are rsynced
+# to/from the remote host; Docker runs there with native bind mounts.
+REMOTE_DOCKER_HOST: str | None = os.environ.get("EVAL_DOCKER_HOST")
+REMOTE_WORKSPACE_BASE: str = os.environ.get("EVAL_REMOTE_WORKSPACE_BASE", "/tmp/eval-workspaces")
+
 
 @dataclass
 class DockerResult:
@@ -17,6 +26,71 @@ class DockerResult:
     timed_out: bool = False
 
 
+# ---------------------------------------------------------------------------
+# Remote helpers (rsync workspace ↔ remote host)
+# ---------------------------------------------------------------------------
+
+def _rsync_to_remote(local_path: str, remote_host: str, remote_path: str) -> None:
+    """Sync local workspace to remote host. Creates remote dir if needed."""
+    subprocess.run(
+        ["ssh", remote_host, "mkdir", "-p", remote_path],
+        check=True, capture_output=True,
+    )
+    subprocess.run(
+        ["rsync", "-az", "--delete", f"{local_path}/", f"{remote_host}:{remote_path}/"],
+        check=True, capture_output=True,
+    )
+
+
+def _rsync_from_remote(remote_host: str, remote_path: str, local_path: str) -> None:
+    """Sync remote workspace back to local (picks up test result files etc)."""
+    subprocess.run(
+        ["rsync", "-az", f"{remote_host}:{remote_path}/", f"{local_path}/"],
+        check=True, capture_output=True,
+    )
+
+
+def _remote_docker_cmd(
+    remote_host: str,
+    remote_workspace: str,
+    image: str,
+    command: str,
+    memory: str,
+    cpus: str,
+    cache_volume: str | None,
+    network: str,
+) -> list[str]:
+    """Build an ssh command that runs `docker run` on the remote host."""
+    docker_parts = [
+        "docker", "run", "--rm",
+        "-v", f"{remote_workspace}:/work",
+    ]
+    if cache_volume:
+        docker_parts.extend(["-v", f"{cache_volume}:/root/.cache"])
+    docker_parts.extend([
+        "-w", "/work",
+        "--network", network,
+        "--memory", memory,
+        "--cpus", cpus,
+        image,
+        "bash", "-lc", command,
+    ])
+    # Shell-quote each arg for ssh
+    escaped = " ".join(_shell_quote(p) for p in docker_parts)
+    return ["ssh", remote_host, escaped]
+
+
+def _shell_quote(s: str) -> str:
+    """Simple single-quote escaping for ssh remote commands."""
+    if not s or any(c in s for c in " \t\n'\"\\$`!#&|;(){}[]<>?*~"):
+        return "'" + s.replace("'", "'\"'\"'") + "'"
+    return s
+
+
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+
 def run_in_docker(
     workspace: str,
     image: str,
@@ -32,13 +106,110 @@ def run_in_docker(
 ) -> DockerResult:
     """Run a command in a Docker container with workspace mounted.
 
+    When EVAL_DOCKER_HOST is set, the workspace is rsynced to the remote host
+    and Docker runs there natively (no QEMU).  Results are rsynced back.
+
     Args:
         cache_volume: Docker named volume for pip/uv cache persistence.
             Survives across container runs, so ``uv sync`` only downloads
             packages once. Set to None to disable.
     """
-    # Docker requires absolute paths for bind mounts
+    remote_host = REMOTE_DOCKER_HOST
     abs_workspace = os.path.abspath(workspace)
+
+    if remote_host:
+        return _run_remote(
+            abs_workspace, remote_host, image, command,
+            timeout=timeout, memory=memory, cpus=cpus,
+            cache_volume=cache_volume, network=network,
+            stream_log=stream_log,
+            heartbeat_interval=heartbeat_interval,
+            heartbeat_callback=heartbeat_callback,
+        )
+
+    return _run_local(
+        abs_workspace, image, command,
+        timeout=timeout, memory=memory, cpus=cpus,
+        cache_volume=cache_volume, network=network,
+        stream_log=stream_log,
+        heartbeat_interval=heartbeat_interval,
+        heartbeat_callback=heartbeat_callback,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Remote execution path
+# ---------------------------------------------------------------------------
+
+def _run_remote(
+    abs_workspace: str,
+    remote_host: str,
+    image: str,
+    command: str,
+    *,
+    timeout: int,
+    memory: str,
+    cpus: str,
+    cache_volume: str | None,
+    network: str,
+    stream_log: str | Path | None,
+    heartbeat_interval: int,
+    heartbeat_callback: Callable[[float, int, int], None] | None,
+) -> DockerResult:
+    """Rsync workspace to remote, run Docker there, rsync results back."""
+    workspace_name = Path(abs_workspace).name
+    remote_path = f"{REMOTE_WORKSPACE_BASE}/{workspace_name}"
+
+    # 1. Sync workspace to remote
+    try:
+        _rsync_to_remote(abs_workspace, remote_host, remote_path)
+    except subprocess.CalledProcessError as e:
+        return DockerResult(
+            exit_code=-1, stdout="",
+            stderr=f"rsync to remote failed: {e.stderr if e.stderr else e}",
+        )
+
+    # 2. Run Docker on remote via SSH
+    cmd = _remote_docker_cmd(
+        remote_host, remote_path, image, command,
+        memory=memory, cpus=cpus, cache_volume=cache_volume, network=network,
+    )
+
+    result = _exec_cmd(
+        cmd, timeout=timeout,
+        stream_log=stream_log,
+        heartbeat_interval=heartbeat_interval,
+        heartbeat_callback=heartbeat_callback,
+    )
+
+    # 3. Sync results back (even on failure — we want test_results.json)
+    try:
+        _rsync_from_remote(remote_host, remote_path, abs_workspace)
+    except subprocess.CalledProcessError as e:
+        logger.warning("rsync from remote failed: %s", e)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Local execution path
+# ---------------------------------------------------------------------------
+
+def _run_local(
+    abs_workspace: str,
+    image: str,
+    command: str,
+    *,
+    timeout: int,
+    memory: str,
+    cpus: str,
+    cache_volume: str | None,
+    network: str,
+    stream_log: str | Path | None,
+    heartbeat_interval: int,
+    heartbeat_callback: Callable[[float, int, int], None] | None,
+) -> DockerResult:
+    """Run Docker locally with bind mount (original behavior)."""
     cmd = [
         "docker", "run", "--rm",
         "-v", f"{abs_workspace}:/work",
@@ -51,32 +222,49 @@ def run_in_docker(
         "--memory", memory,
         "--cpus", cpus,
         image,
-        "bash", "-lc", command
+        "bash", "-lc", command,
     ])
 
-    # Fast path: keep existing behavior when no streaming/heartbeat is needed.
+    return _exec_cmd(
+        cmd, timeout=timeout,
+        stream_log=stream_log,
+        heartbeat_interval=heartbeat_interval,
+        heartbeat_callback=heartbeat_callback,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Shared command execution (handles timeout, streaming, heartbeat)
+# ---------------------------------------------------------------------------
+
+def _exec_cmd(
+    cmd: list[str],
+    *,
+    timeout: int,
+    stream_log: str | Path | None,
+    heartbeat_interval: int,
+    heartbeat_callback: Callable[[float, int, int], None] | None,
+) -> DockerResult:
+    """Execute a command with optional streaming and heartbeat."""
+
+    # Fast path: no streaming/heartbeat needed
     if stream_log is None and heartbeat_callback is None:
         try:
             result = subprocess.run(
-                cmd,
-                capture_output=True,
-                text=True,
-                timeout=timeout
+                cmd, capture_output=True, text=True, timeout=timeout,
             )
             return DockerResult(
                 exit_code=result.returncode,
                 stdout=result.stdout,
                 stderr=result.stderr,
-                timed_out=False
             )
         except subprocess.TimeoutExpired:
             return DockerResult(
-                exit_code=-1,
-                stdout="",
-                stderr="Command timed out",
-                timed_out=True
+                exit_code=-1, stdout="",
+                stderr="Command timed out", timed_out=True,
             )
 
+    # Streaming path with heartbeat support
     log_file = None
     if stream_log is not None:
         stream_path = Path(stream_log)
@@ -99,18 +287,15 @@ def _drain(stream, target: list[str], key: str):
 
     try:
         proc = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            bufsize=1
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+            text=True, bufsize=1,
         )
 
         out_thread = threading.Thread(
-            target=_drain, args=(proc.stdout, stdout_lines, "stdout"), daemon=True
+            target=_drain, args=(proc.stdout, stdout_lines, "stdout"), daemon=True,
         )
         err_thread = threading.Thread(
-            target=_drain, args=(proc.stderr, stderr_lines, "stderr"), daemon=True
+            target=_drain, args=(proc.stderr, stderr_lines, "stderr"), daemon=True,
         )
         out_thread.start()
         err_thread.start()
@@ -149,22 +334,19 @@ def _drain(stream, target: list[str], key: str):
             return DockerResult(
                 exit_code=-1,
                 stdout="".join(stdout_lines),
-                stderr=("".join(stderr_lines) or "Command timed out"),
-                timed_out=True
+                stderr="".join(stderr_lines) or "Command timed out",
+                timed_out=True,
             )
 
         return DockerResult(
             exit_code=proc.returncode,
             stdout="".join(stdout_lines),
             stderr="".join(stderr_lines),
-            timed_out=False
         )
     except OSError as e:
         return DockerResult(
-            exit_code=-1,
-            stdout="",
-            stderr=f"Failed to start docker process: {e}",
-            timed_out=False
+            exit_code=-1, stdout="",
+            stderr=f"Failed to start process: {e}",
         )
     finally:
         if log_file:
diff --git a/eval-harness/tests/test_agentbench.py b/eval-harness/tests/test_agentbench.py
index dcf55d1..fa65c01 100644
--- a/eval-harness/tests/test_agentbench.py
+++ b/eval-harness/tests/test_agentbench.py
@@ -168,8 +168,9 @@ def test_strips_md_files(self):
 
             count = strip_docs(ws)
 
-            assert count == 3  # README.md, CONTRIBUTING.md, src/notes.md
-            assert not (ws / "README.md").exists()
+            assert count == 2  # CONTRIBUTING.md, src/notes.md (README.md preserved)
+            assert (ws / "README.md").exists()  # preserved for setup.py
+            assert not (ws / "CONTRIBUTING.md").exists()
             assert not (ws / "src" / "notes.md").exists()
             assert (ws / "src" / "main.py").exists()
 
@@ -356,7 +357,7 @@ def test_regression_flipped_tests(self, mock_docker):
 
             success, output = evaluate_instance(ws, inst, "test-image")
             assert success is False
-            assert "flipped" in output
+            assert "regressions" in output
 
 
 # ── Runner: build_prompt ───────────────────────────────────────────────

From d3ecadcf159cb697219f91f0368192bbf07a6834 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Fri, 27 Feb 2026 13:17:36 -0800
Subject: [PATCH 14/21] replace rsync with tar-over-SSH for remote docker
 workspace sync

UGREEN NAS (chronos) runs an rsync daemon that intercepts all rsync
connections and rejects paths outside configured modules. tar piped
through SSH bypasses this entirely and works reliably.

Entire-Checkpoint: e86fbbd52da2
---
 eval-harness/lib/docker_runner.py | 54 ++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index a154479..eb8116a 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -12,8 +12,8 @@
 logger = logging.getLogger(__name__)
 
 # Remote Docker host for native x86 execution (avoids QEMU on Apple Silicon).
-# Set EVAL_DOCKER_HOST=ryo@chronos to enable.  Workspace files are rsynced
-# to/from the remote host; Docker runs there with native bind mounts.
+# Set EVAL_DOCKER_HOST=chronos to enable.  Workspace files are synced via
+# tar-over-SSH to the remote host; Docker runs there with native bind mounts.
 REMOTE_DOCKER_HOST: str | None = os.environ.get("EVAL_DOCKER_HOST")
 REMOTE_WORKSPACE_BASE: str = os.environ.get("EVAL_REMOTE_WORKSPACE_BASE", "/tmp/eval-workspaces")
 
@@ -27,29 +27,44 @@ class DockerResult:
 
 
 # ---------------------------------------------------------------------------
-# Remote helpers (rsync workspace ↔ remote host)
+# Remote helpers (tar-over-SSH workspace ↔ remote host)
 # ---------------------------------------------------------------------------
 
-def _rsync_to_remote(local_path: str, remote_host: str, remote_path: str) -> None:
-    """Sync local workspace to remote host. Creates remote dir if needed."""
+def _sync_to_remote(local_path: str, remote_host: str, remote_path: str) -> None:
+    """Sync local workspace to remote host via tar-over-SSH.
+
+    Uses tar piped through SSH instead of rsync, because some hosts
+    (e.g. UGREEN NAS) run an rsync daemon that intercepts all rsync
+    connections and rejects paths outside configured modules.
+    """
     subprocess.run(
         ["ssh", remote_host, "mkdir", "-p", remote_path],
         check=True, capture_output=True,
     )
+    # rsync --delete equivalent: recreate the remote dir so stale entries (dotfiles too, which `*` skips) are gone
     subprocess.run(
-        ["rsync", "-az", "--delete", f"{local_path}/", f"{remote_host}:{remote_path}/"],
+        ["ssh", remote_host, f"rm -rf {_sh_quote(remote_path)} && mkdir -p {_sh_quote(remote_path)}"],
         check=True, capture_output=True,
     )
+    subprocess.run(
+        f"tar -cf - -C {_sh_quote(local_path)} . | ssh {_sh_quote(remote_host)} {_sh_quote('tar -xf - -C ' + _sh_quote(remote_path))}",
+        shell=True, check=True, capture_output=True,
+    )
 
 
-def _rsync_from_remote(remote_host: str, remote_path: str, local_path: str) -> None:
-    """Sync remote workspace back to local (picks up test result files etc)."""
+def _sync_from_remote(remote_host: str, remote_path: str, local_path: str) -> None:
+    """Unpack the remote workspace over the local one via tar-over-SSH (remote command quoted once per shell)."""
     subprocess.run(
-        ["rsync", "-az", f"{remote_host}:{remote_path}/", f"{local_path}/"],
-        check=True, capture_output=True,
+        f"ssh {_sh_quote(remote_host)} {_sh_quote('tar -cf - -C ' + _sh_quote(remote_path) + ' .')} | tar -xf - -C {_sh_quote(local_path)}",
+        shell=True, check=True, capture_output=True,
     )
 
 
+def _sh_quote(s: str) -> str:
+    """Return s wrapped in single quotes, with each embedded single quote closed, double-quoted, and reopened."""
+    return "'" + s.replace("'", "'\"'\"'") + "'"
+
+
 def _remote_docker_cmd(
     remote_host: str,
     remote_workspace: str,
@@ -76,17 +91,10 @@ def _remote_docker_cmd(
         "bash", "-lc", command,
     ])
     # Shell-quote each arg for ssh
-    escaped = " ".join(_shell_quote(p) for p in docker_parts)
+    escaped = " ".join(_sh_quote(p) for p in docker_parts)
     return ["ssh", remote_host, escaped]
 
 
-def _shell_quote(s: str) -> str:
-    """Simple single-quote escaping for ssh remote commands."""
-    if not s or any(c in s for c in " \t\n'\"\\$`!#&|;(){}[]<>?*~"):
-        return "'" + s.replace("'", "'\"'\"'") + "'"
-    return s
-
-
 # ---------------------------------------------------------------------------
 # Main entry point
 # ---------------------------------------------------------------------------
@@ -156,17 +164,17 @@ def _run_remote(
     heartbeat_interval: int,
     heartbeat_callback: Callable[[float, int, int], None] | None,
 ) -> DockerResult:
-    """Rsync workspace to remote, run Docker there, rsync results back."""
+    """Sync workspace to remote via tar-over-SSH, run Docker there, sync back."""
     workspace_name = Path(abs_workspace).name
     remote_path = f"{REMOTE_WORKSPACE_BASE}/{workspace_name}"
 
     # 1. Sync workspace to remote
     try:
-        _rsync_to_remote(abs_workspace, remote_host, remote_path)
+        _sync_to_remote(abs_workspace, remote_host, remote_path)
     except subprocess.CalledProcessError as e:
         return DockerResult(
             exit_code=-1, stdout="",
-            stderr=f"rsync to remote failed: {e.stderr if e.stderr else e}",
+            stderr=f"sync to remote failed: {e.stderr if e.stderr else e}",
         )
 
     # 2. Run Docker on remote via SSH
@@ -184,9 +192,9 @@ def _run_remote(
 
     # 3. Sync results back (even on failure — we want test_results.json)
     try:
-        _rsync_from_remote(remote_host, remote_path, abs_workspace)
+        _sync_from_remote(remote_host, remote_path, abs_workspace)
     except subprocess.CalledProcessError as e:
-        logger.warning("rsync from remote failed: %s", e)
+        logger.warning("sync from remote failed: %s", e)
 
     return result
 

From 2c9ea2ee2019c484026e6545449c22d2ac3e13cf Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Fri, 27 Feb 2026 13:22:19 -0800
Subject: [PATCH 15/21] pull docker images on remote host when EVAL_DOCKER_HOST
 is set

The pre-pull step was running `docker pull` locally even when Docker
execution happens on a remote host via SSH. Now uses `ssh $host docker
pull` when EVAL_DOCKER_HOST is configured. Also raises the per-pull
timeout from 300s to 600s.

Entire-Checkpoint: cfe3bf0bccca
---
 eval-harness/lib/cli.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 8e9a993..3feada2 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -1037,14 +1037,22 @@ def run_agentbench(parallel, output, timeout, verbose, condition, model, repetit
                f"({len(work_queue)} total) with {parallel} workers")
 
     # --- Pre-pull Docker images ---
+    from lib.docker_runner import REMOTE_DOCKER_HOST
     unique_images = sorted(set(i.docker_image for i in instances))
-    click.echo(f"Pre-pulling {len(unique_images)} Docker image(s)...")
+    remote_host = REMOTE_DOCKER_HOST
+    pull_target = f" on {remote_host}" if remote_host else ""
+    click.echo(f"Pre-pulling {len(unique_images)} Docker image(s){pull_target}...")
     for image in unique_images:
+        pull_cmd = (
+            ["ssh", remote_host, "docker", "pull", image]
+            if remote_host else
+            ["docker", "pull", image]
+        )
         for attempt in range(3):
             try:
                 subprocess.run(
-                    ["docker", "pull", image],
-                    capture_output=True, timeout=300, check=True,
+                    pull_cmd,
+                    capture_output=True, timeout=600, check=True,
                 )
                 click.echo(f"  {image}: ready")
                 break

From fb0e5b68b1c58e2836ef407fc046b0bf64d039de Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Sun, 1 Mar 2026 11:53:38 -0800
Subject: [PATCH 16/21] add SSH timeouts and sync-back excludes to docker
 runner

Prevents two issues from overnight eval runs:
- SSH agent key expiry caused all workers to hang indefinitely on
  stale connections. Added ConnectTimeout, ServerAliveInterval,
  ServerAliveCountMax, and a 300s subprocess timeout on the sync
  operations so failures surface within 5 minutes.
- _sync_from_remote was transferring .venv dirs (4GB+) back from
  the remote Docker host (chronos). Added excludes for .venv,
  node_modules, __pycache__, etc.

Entire-Checkpoint: aa17a443b57d
---
 eval-harness/lib/docker_runner.py | 53 +++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index eb8116a..b1ef5bf 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -17,6 +17,20 @@
 REMOTE_DOCKER_HOST: str | None = os.environ.get("EVAL_DOCKER_HOST")
 REMOTE_WORKSPACE_BASE: str = os.environ.get("EVAL_REMOTE_WORKSPACE_BASE", "/tmp/eval-workspaces")
 
+# SSH options to prevent indefinite hangs when the agent drops keys or the
+# remote becomes unreachable.  ConnectTimeout caps the initial handshake,
+# ServerAlive detects dead connections mid-transfer.
+_SSH_OPTS = [
+    "-o", "ConnectTimeout=10",
+    "-o", "ServerAliveInterval=30",
+    "-o", "ServerAliveCountMax=3",
+]
+
+# Timeout (seconds) for sync operations (mkdir, tar upload/download).
+# These are workspace-sized transfers (~20-50 MB), not Docker execution,
+# so 5 minutes is generous.
+_SYNC_TIMEOUT = 300
+
 
 @dataclass
 class DockerResult:
@@ -38,25 +52,38 @@ def _sync_to_remote(local_path: str, remote_host: str, remote_path: str) -> None
     connections and rejects paths outside configured modules.
     """
     subprocess.run(
-        ["ssh", remote_host, "mkdir", "-p", remote_path],
-        check=True, capture_output=True,
+        ["ssh", *_SSH_OPTS, remote_host, "mkdir", "-p", remote_path],
+        check=True, capture_output=True, timeout=_SYNC_TIMEOUT,
     )
     # tar from local, extract on remote — --delete equivalent via rm first
     subprocess.run(
-        ["ssh", remote_host, "rm", "-rf", f"{remote_path}/*"],
-        check=True, capture_output=True,
+        ["ssh", *_SSH_OPTS, remote_host, "rm", "-rf", f"{remote_path}/*"],
+        check=True, capture_output=True, timeout=_SYNC_TIMEOUT,
     )
+    ssh_opts_str = " ".join(_SSH_OPTS)
     subprocess.run(
-        f"tar -cf - -C {_sh_quote(local_path)} . | ssh {remote_host} 'tar -xf - -C {_sh_quote(remote_path)}'",
-        shell=True, check=True, capture_output=True,
+        f"tar -cf - -C {_sh_quote(local_path)} . | ssh {ssh_opts_str} {remote_host} 'tar -xf - -C {_sh_quote(remote_path)}'",
+        shell=True, check=True, capture_output=True, timeout=_SYNC_TIMEOUT,
     )
 
 
+_SYNC_BACK_EXCLUDES = [
+    ".venv", "node_modules", "__pycache__", ".tox",
+    "*.pyc", ".mypy_cache", ".pytest_cache",
+]
+
+
 def _sync_from_remote(remote_host: str, remote_path: str, local_path: str) -> None:
-    """Sync remote workspace back to local via tar-over-SSH."""
+    """Sync remote workspace back to local via tar-over-SSH.
+
+    Excludes large build artifacts (.venv, node_modules, etc.) that Claude may
+    have created during execution — we only need source changes and test results.
+    """
+    excludes = " ".join(f"--exclude={_sh_quote(e)}" for e in _SYNC_BACK_EXCLUDES)
+    ssh_opts_str = " ".join(_SSH_OPTS)
     subprocess.run(
-        f"ssh {remote_host} 'tar -cf - -C {_sh_quote(remote_path)} .' | tar -xf - -C {_sh_quote(local_path)}",
-        shell=True, check=True, capture_output=True,
+        f"ssh {ssh_opts_str} {remote_host} 'tar -cf - {excludes} -C {_sh_quote(remote_path)} .' | tar -xf - -C {_sh_quote(local_path)}",
+        shell=True, check=True, capture_output=True, timeout=_SYNC_TIMEOUT,
     )
 
 
@@ -92,7 +119,7 @@ def _remote_docker_cmd(
     ])
     # Shell-quote each arg for ssh
     escaped = " ".join(_sh_quote(p) for p in docker_parts)
-    return ["ssh", remote_host, escaped]
+    return ["ssh", *_SSH_OPTS, remote_host, escaped]
 
 
 # ---------------------------------------------------------------------------
@@ -171,10 +198,10 @@ def _run_remote(
     # 1. Sync workspace to remote
     try:
         _sync_to_remote(abs_workspace, remote_host, remote_path)
-    except subprocess.CalledProcessError as e:
+    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
         return DockerResult(
             exit_code=-1, stdout="",
-            stderr=f"sync to remote failed: {e.stderr if e.stderr else e}",
+            stderr=f"sync to remote failed: {e}",
         )
 
     # 2. Run Docker on remote via SSH
@@ -193,7 +220,7 @@ def _run_remote(
     # 3. Sync results back (even on failure — we want test_results.json)
     try:
         _sync_from_remote(remote_host, remote_path, abs_workspace)
-    except subprocess.CalledProcessError as e:
+    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
         logger.warning("sync from remote failed: %s", e)
 
     return result

From 03c1ecedbbf649a538ab5a765f16184c7bfbc932 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Sun, 1 Mar 2026 20:24:15 -0800
Subject: [PATCH 17/21] switch agentbench to persistent containers, fix tar
 overlay UID bug

Replace ephemeral docker run with persistent containers (docker run -d +
docker exec) so setup runs once per task instead of 3x. Add
start_container, exec_in_container, stop_container, copy_into_container
to docker_runner.py.

Fix git "dubious ownership" error: the tar overlay created on macOS
changes file UIDs inside the container, which trips git's ownership
check (introduced for CVE-2022-24765). Mark /testbed as a
safe.directory before the overlay. Include stderr/stdout tail in setup
error messages.

Entire-Checkpoint: c11a4a4ced3d
---
 eval-harness/lib/agentbench_runner.py | 369 +++++++++++++++++++-------
 eval-harness/lib/cli.py               |  13 +-
 eval-harness/lib/docker_runner.py     | 129 +++++++++
 eval-harness/tests/test_agentbench.py | 125 ++++-----
 4 files changed, 476 insertions(+), 160 deletions(-)

diff --git a/eval-harness/lib/agentbench_runner.py b/eval-harness/lib/agentbench_runner.py
index 800ba61..f6b0826 100644
--- a/eval-harness/lib/agentbench_runner.py
+++ b/eval-harness/lib/agentbench_runner.py
@@ -13,6 +13,7 @@ class doesn't know about AGENTbench.
 import json
 import logging
 import os
+import re
 import shutil
 import subprocess
 import threading
@@ -22,7 +23,14 @@ class doesn't know about AGENTbench.
 
 from lib.agentbench_loader import AgentbenchInstance
 from lib.claude_runner import run_claude
-from lib.docker_runner import run_in_docker
+from lib.docker_runner import (
+    REMOTE_DOCKER_HOST,
+    _SSH_OPTS,
+    _sh_quote,
+    copy_into_container,
+    exec_in_container,
+    persistent_container,
+)
 from lib.git_ops import clone_repo, checkout_commit, create_baseline_commit, get_diff_stats
 from lib.prompt_builder import FLAT_PREAMBLE, INTENT_LAYER_PREAMBLE
 from lib.task_runner import Condition, TaskResult, SkillGenerationMetrics
@@ -35,11 +43,38 @@ class doesn't know about AGENTbench.
 
 
 def _build_docker_cmd(setup_commands: list[str], commands: list[str]) -> str:
-    """Join setup + test commands into a single shell command string."""
+    """Join setup + test commands into a single shell command string.
+
+    Strips leading 'sudo' from commands since Docker containers run as root.
+    """
     parts = setup_commands + commands
+    parts = [cmd.removeprefix("sudo ") for cmd in parts]
     return " && ".join(parts)
 
 
+def _build_activate_cmd(setup_commands: list[str], test_commands: list[str]) -> str:
+    """Prepend venv activation (if any) to test commands.
+
+    Setup commands often include `source venv/bin/activate`, which doesn't
+    persist across docker exec calls. Extract it and prepend to test commands.
+    """
+    activate = ""
+    for cmd in setup_commands:
+        if re.match(r'^\s*(source|\.)\s+\S*activate', cmd):
+            activate = cmd.removeprefix("sudo ") + " && "
+            break
+    parts = [c.removeprefix("sudo ") for c in test_commands]
+    return activate + " && ".join(parts)
+
+
+# Paths that should never be copied from the local workspace into the
+# container after Claude's edit pass — they'd clobber installed deps.
+_DIFF_EXCLUDES = {
+    ".venv", "venv", "node_modules", "__pycache__",
+    ".pytest_cache", ".mypy_cache", ".tox",
+}
+
+
 def strip_docs(workspace: Path) -> int:
     """Remove context/doc files that could leak hints, preserving build-required files.
 
@@ -143,23 +178,43 @@ def _parse_test_results(workspace: Path, filename: str) -> dict[str, bool] | Non
 
 
 def evaluate_instance(
-    workspace: Path,
+    container_name: str,
     instance: AgentbenchInstance,
-    docker_image: str,
+    remote_host: str | None = None,
     timeout: int = DOCKER_STEP_TIMEOUT,
 ) -> tuple[bool, str]:
-    """Two-tier evaluation. Returns (success, test_output_summary).
+    """Two-tier evaluation inside a persistent container.
+
+    Returns (success, test_output_summary).
 
-    Tier 1: Instance tests (test_file_runner → pr_test_results.json)
-    Tier 2: Repo regression tests (repo_test_runner → test_results.json)
+    Tier 1: Instance tests (test_file_runner -> pr_test_results.json)
+    Tier 2: Repo regression tests (repo_test_runner -> test_results.json)
     Both must pass for success=True.
     """
     parts = []
 
+    # Clear stale results from pre-validation phase
+    exec_in_container(
+        container_name,
+        "rm -f /testbed/pr_test_results.json /testbed/test_results.json",
+        timeout=10, remote_host=remote_host,
+    )
+
     # Tier 1: Instance tests
-    test_cmd = _build_docker_cmd(instance.setup_commands, instance.test_commands)
-    result = run_in_docker(str(workspace), docker_image, test_cmd, timeout=timeout)
-    pr_results = _parse_test_results(workspace, "pr_test_results.json")
+    test_cmd = _build_activate_cmd(instance.setup_commands, instance.test_commands)
+    result = exec_in_container(
+        container_name, test_cmd, timeout=timeout, remote_host=remote_host,
+    )
+    pr_json = exec_in_container(
+        container_name, "cat /testbed/pr_test_results.json",
+        timeout=10, remote_host=remote_host,
+    )
+    try:
+        pr_results = json.loads(pr_json.stdout) if pr_json.exit_code == 0 else None
+        if not isinstance(pr_results, dict):
+            pr_results = None
+    except (json.JSONDecodeError, ValueError):
+        pr_results = None
 
     if pr_results is None:
         return False, f"INSTANCE: pr_test_results.json missing or corrupt | docker exit={result.exit_code}"
@@ -175,12 +230,22 @@ def evaluate_instance(
         return False, " | ".join(parts)
 
     # Tier 2: Regression tests
-    repo_test_cmd = _build_docker_cmd(instance.setup_commands, instance.repo_test_commands)
-    result = run_in_docker(str(workspace), docker_image, repo_test_cmd, timeout=timeout)
-    repo_results = _parse_test_results(workspace, "test_results.json")
+    repo_test_cmd = _build_activate_cmd(instance.setup_commands, instance.repo_test_commands)
+    result = exec_in_container(
+        container_name, repo_test_cmd, timeout=timeout, remote_host=remote_host,
+    )
+    repo_json = exec_in_container(
+        container_name, "cat /testbed/test_results.json",
+        timeout=10, remote_host=remote_host,
+    )
+    try:
+        repo_results = json.loads(repo_json.stdout) if repo_json.exit_code == 0 else None
+        if not isinstance(repo_results, dict):
+            repo_results = None
+    except (json.JSONDecodeError, ValueError):
+        repo_results = None
 
     if repo_results is None:
-        # Regression runner failed — instance tests passed though
         parts.append("REGRESSION: test_results.json missing or corrupt")
         return False, " | ".join(parts)
 
@@ -301,6 +366,59 @@ def _inject_cached_context(
     )
 
 
+def _copy_diffs_into_container(
+    workspace: Path,
+    container_name: str,
+    remote_host: str | None,
+) -> list[str]:
+    """Copy only Claude's changed files into the container.
+
+    Uses git diff to find changed files, filters out build artifacts that
+    could clobber installed deps, then sends a minimal tar (<100 KB typical)
+    instead of the full workspace (20-50 MB).
+
+    Returns list of files copied.
+    """
+    # Stage everything so diff --cached sees new untracked files too
+    subprocess.run(["git", "add", "-A"], cwd=workspace, capture_output=True)
+    diff_result = subprocess.run(
+        ["git", "diff", "--cached", "--name-only", "HEAD"],
+        cwd=workspace, capture_output=True, text=True,
+    )
+
+    changed = []
+    for f in diff_result.stdout.strip().split("\n"):
+        if not f:
+            continue
+        if any(part in _DIFF_EXCLUDES for part in Path(f).parts):
+            continue
+        if f.endswith((".pyc", ".egg-info")):
+            continue
+        changed.append(f)
+
+    if not changed:
+        return []
+
+    tar_cmd = ["tar", "-cf", "-", "-C", str(workspace)] + changed
+    if remote_host:
+        ssh_opts_str = " ".join(_SSH_OPTS)
+        subprocess.run(
+            f"{' '.join(_sh_quote(p) for p in tar_cmd)} | "
+            f"ssh {ssh_opts_str} {remote_host} "
+            f"'docker exec -i {_sh_quote(container_name)} tar -xf - -C /testbed'",
+            shell=True, check=True, capture_output=True, timeout=60,
+        )
+    else:
+        tar_proc = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE)
+        subprocess.run(
+            ["docker", "exec", "-i", container_name, "tar", "-xf", "-", "-C", "/testbed"],
+            stdin=tar_proc.stdout, capture_output=True, timeout=60,
+        )
+        tar_proc.wait()
+
+    return changed
+
+
 def run_single(
     instance: AgentbenchInstance,
     condition: Condition,
@@ -312,28 +430,49 @@ def run_single(
     model: str = "sonnet",
     progress_callback=None,
 ) -> TaskResult:
-    """Full per-instance execution. Returns a standard TaskResult.
+    """Full per-instance execution using a persistent Docker container.
 
     Steps:
-    1. Clone repo at base_sha (from reference clone)
-    2. strip_docs()
-    3. Inject condition context
-    4. write_test_infrastructure()
-    5. Pre-validate: instance tests should fail
-    6. Create baseline commit
-    7. build_prompt() + run_claude()
-    8. evaluate_instance()
-    9. Return TaskResult
+     1. Clone repo locally at base_sha
+     2. strip_docs()
+     3. Inject condition context
+     4. write_test_infrastructure()
+     5. Start persistent container
+     6. git checkout base_sha + strip docs INSIDE container
+     7. Overlay context + test files onto /testbed (excludes .git)
+     8. Run setup_commands ONCE in container
+     9. Pre-validate: run test_commands, read results via exec+cat
+    10. Create baseline commit (local workspace)
+    11. run_claude() (operates on local workspace)
+    12. Copy ONLY Claude's changed files into container
+    13. Evaluate: run test commands, read results via exec+cat
+    14. Collect diff stats
+    15. Container auto-cleanup via context manager
     """
     task_id = instance.instance_id
     start = time.time()
     docker_image = instance.docker_image
+    remote_host = REMOTE_DOCKER_HOST
 
     def _progress(step: str, msg: str = ""):
         if progress_callback:
             progress_callback(task_id, condition.value, step, msg)
 
-    # --- Step 1: Setup workspace ---
+    def _fail(error: str, **kwargs) -> TaskResult:
+        return TaskResult(
+            task_id=task_id, condition=condition, success=False,
+            test_output="", wall_clock_seconds=time.time() - start,
+            input_tokens=kwargs.get("input_tokens", 0),
+            output_tokens=kwargs.get("output_tokens", 0),
+            tool_calls=kwargs.get("tool_calls", 0),
+            lines_changed=0, files_touched=[], rep=rep,
+            error=error,
+            skill_generation=kwargs.get("skill_generation"),
+            exit_code=kwargs.get("exit_code"),
+            is_timeout=kwargs.get("is_timeout", False),
+        )
+
+    # --- Step 1: Setup local workspace ---
     _progress("setup", "cloning workspace")
     task_hash = format(hash(task_id) % 0xFFFF, '04x')
     workspace_name = f"{instance.repo}-{instance.base_sha[:8]}-{task_hash}-{condition.value}-r{rep}"
@@ -379,92 +518,142 @@ def _progress(step: str, msg: str = ""):
     elif condition == Condition.INTENT_LAYER:
         plugin_root = os.environ.get("INTENT_LAYER_PLUGIN_ROOT", "")
         if not plugin_root:
-            return TaskResult(
-                task_id=task_id, condition=condition, success=False,
-                test_output="", wall_clock_seconds=time.time() - start,
-                input_tokens=0, output_tokens=0, tool_calls=0,
-                lines_changed=0, files_touched=[], rep=rep,
-                error="[infrastructure] INTENT_LAYER_PLUGIN_ROOT not set",
-            )
+            return _fail("[infrastructure] INTENT_LAYER_PLUGIN_ROOT not set")
         skill_metrics = _inject_cached_context(
             instance, workspace, workspaces_dir, index_cache, model,
             cache_key="intent_layer",
-            generate_fn=lambda ws, inst, ws_dir, mdl: _generate_il_context(ws, plugin_root, mdl),
+            generate_fn=lambda ws, _inst, _ws_dir, mdl: _generate_il_context(ws, plugin_root, mdl),
         )
 
     # --- Step 4: Write test infrastructure ---
     _progress("test-infra", "writing test files")
     write_test_infrastructure(workspace, instance)
 
-    # --- Step 5: Pre-validation ---
-    _progress("pre-validate", "checking instance tests fail at base")
-    test_cmd = _build_docker_cmd(instance.setup_commands, instance.test_commands)
-    run_in_docker(  # side effect: writes pr_test_results.json
-        str(workspace), docker_image, test_cmd, timeout=PRE_VALIDATION_TIMEOUT
-    )
-    pre_pr_results = _parse_test_results(workspace, "pr_test_results.json")
+    # --- Steps 5-13: Persistent container ---
+    _progress("container", "starting persistent container")
+    with persistent_container(docker_image, remote_host=remote_host) as ctr:
+
+        # Mark /testbed as safe — the tar overlay from macOS writes files
+        # with a different UID, triggering git's CVE-2022-24765 ownership
+        # check ("dubious ownership").  Without this, git commands and
+        # SCM-based version detection (pdm-backend, setuptools_scm) fail.
+        exec_in_container(
+            ctr, "git config --global --add safe.directory /testbed",
+            timeout=10, remote_host=remote_host,
+        )
 
-    if pre_pr_results is not None and pre_pr_results and all(pre_pr_results.values()):
-        return TaskResult(
-                task_id=task_id, condition=condition, success=False,
-                test_output="", wall_clock_seconds=time.time() - start,
-                input_tokens=0, output_tokens=0, tool_calls=0,
-                lines_changed=0, files_touched=[], rep=rep,
-                error="[pre-validation] instance tests already pass at base_sha",
+        # Step 6: Bring container's /testbed to exact base_sha state
+        checkout_result = exec_in_container(
+            ctr,
+            f"git checkout {instance.base_sha} 2>/dev/null || "
+            f"(git fetch origin {instance.base_sha} --depth=1 && "
+            f"git checkout {instance.base_sha}) && "
+            f"git reset --hard",
+            timeout=60, remote_host=remote_host,
+        )
+        if checkout_result.exit_code != 0:
+            return _fail(
+                f"[setup] git checkout failed in container: {checkout_result.stderr[:200]}",
                 skill_generation=skill_metrics,
             )
 
-    # --- Step 6: Baseline commit ---
-    _progress("baseline", "creating baseline commit")
-    create_baseline_commit(str(workspace))
+        # Strip docs inside container (matches what we did on local workspace)
+        exec_in_container(
+            ctr,
+            "find . -name '*.md' ! -iname 'readme*' -delete && "
+            "rm -rf .github docs .claude .cursor .codex",
+            timeout=30, remote_host=remote_host,
+        )
 
-    # --- Step 7: Run Claude ---
-    _progress("claude", "running claude")
-    prompt = build_prompt(instance.problem_description, condition)
-    log_dir = workspaces_dir.parent / "logs"
-    log_dir.mkdir(parents=True, exist_ok=True)
-    stderr_log = log_dir / f"{task_id}-{condition.value}-r{rep}.log"
+        # Step 7: Overlay context files + test infrastructure onto /testbed
+        copy_into_container(ctr, str(workspace), "/testbed", remote_host=remote_host)
 
-    claude_result = run_claude(
-        str(workspace), prompt, timeout=claude_timeout, model=model,
-        stderr_log=str(stderr_log),
-    )
+        # Step 8: Run setup commands ONCE
+        _progress("setup-docker", "installing dependencies (once)")
+        setup_cmd = " && ".join(
+            cmd.removeprefix("sudo ") for cmd in instance.setup_commands
+        )
+        setup_result = exec_in_container(
+            ctr, setup_cmd, timeout=DOCKER_STEP_TIMEOUT, remote_host=remote_host,
+        )
+        if setup_result.exit_code != 0:
+            stderr_tail = (setup_result.stderr or "").strip()[-300:]
+            stdout_tail = (setup_result.stdout or "").strip()[-300:]
+            detail = stderr_tail or stdout_tail or "(no output)"
+            return _fail(
+                f"[setup] docker exit={setup_result.exit_code}: {detail}",
+                skill_generation=skill_metrics,
+            )
 
-    if claude_result.timed_out:
-        return TaskResult(
-            task_id=task_id, condition=condition, success=False,
-            test_output="", wall_clock_seconds=time.time() - start,
-            input_tokens=claude_result.input_tokens,
-            output_tokens=claude_result.output_tokens,
-            tool_calls=claude_result.tool_calls,
-            lines_changed=0, files_touched=[], rep=rep,
-            error="[timeout] claude timed out",
-            skill_generation=skill_metrics,
-            exit_code=claude_result.exit_code,
-            is_timeout=True,
+        # Step 9: Pre-validate — instance tests should fail at base_sha
+        _progress("pre-validate", "checking instance tests fail at base")
+        test_cmd = _build_activate_cmd(instance.setup_commands, instance.test_commands)
+        exec_in_container(ctr, test_cmd, timeout=PRE_VALIDATION_TIMEOUT, remote_host=remote_host)
+
+        pr_json = exec_in_container(
+            ctr, "cat /testbed/pr_test_results.json",
+            timeout=10, remote_host=remote_host,
         )
+        try:
+            pre_pr_results = json.loads(pr_json.stdout) if pr_json.exit_code == 0 else None
+            if not isinstance(pre_pr_results, dict):
+                pre_pr_results = None
+        except (json.JSONDecodeError, ValueError):
+            pre_pr_results = None
+
+        if pre_pr_results is not None and pre_pr_results and all(pre_pr_results.values()):
+            return _fail(
+                "[pre-validation] instance tests already pass at base_sha",
+                skill_generation=skill_metrics,
+            )
 
-    # Check for empty run (Claude did nothing)
-    if claude_result.tool_calls == 0:
-        return TaskResult(
-            task_id=task_id, condition=condition, success=False,
-            test_output="", wall_clock_seconds=time.time() - start,
-            input_tokens=claude_result.input_tokens,
-            output_tokens=claude_result.output_tokens,
-            tool_calls=0,
-            lines_changed=0, files_touched=[], rep=rep,
-            error="[empty-run] claude made no tool calls",
-            skill_generation=skill_metrics,
-            exit_code=claude_result.exit_code,
+        # --- Step 10: Baseline commit (local) ---
+        _progress("baseline", "creating baseline commit")
+        create_baseline_commit(str(workspace))
+
+        # --- Step 11: Run Claude (operates on local workspace) ---
+        _progress("claude", "running claude")
+        prompt = build_prompt(instance.problem_description, condition)
+        log_dir = workspaces_dir.parent / "logs"
+        log_dir.mkdir(parents=True, exist_ok=True)
+        stderr_log = log_dir / f"{task_id}-{condition.value}-r{rep}.log"
+
+        claude_result = run_claude(
+            str(workspace), prompt, timeout=claude_timeout, model=model,
+            stderr_log=str(stderr_log),
         )
 
-    # --- Step 8: Evaluate ---
-    _progress("evaluate", "running tests")
-    success, test_output = evaluate_instance(
-        workspace, instance, docker_image, timeout=DOCKER_STEP_TIMEOUT
-    )
+        if claude_result.timed_out:
+            return _fail(
+                "[timeout] claude timed out",
+                skill_generation=skill_metrics,
+                input_tokens=claude_result.input_tokens,
+                output_tokens=claude_result.output_tokens,
+                tool_calls=claude_result.tool_calls,
+                exit_code=claude_result.exit_code,
+                is_timeout=True,
+            )
+
+        if claude_result.tool_calls == 0:
+            return _fail(
+                "[empty-run] claude made no tool calls",
+                skill_generation=skill_metrics,
+                input_tokens=claude_result.input_tokens,
+                output_tokens=claude_result.output_tokens,
+                exit_code=claude_result.exit_code,
+            )
+
+        # --- Step 12: Copy Claude's changes into container ---
+        _progress("sync", "copying changed files into container")
+        _copy_diffs_into_container(workspace, ctr, remote_host)
+
+        # --- Step 13: Evaluate ---
+        _progress("evaluate", "running tests")
+        success, test_output = evaluate_instance(
+            ctr, instance, remote_host=remote_host, timeout=DOCKER_STEP_TIMEOUT,
+        )
 
-    # --- Step 9: Collect diff stats ---
+    # --- Step 14: Collect diff stats ---
     diff = get_diff_stats(str(workspace))
 
     elapsed = time.time() - start
diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 3feada2..1f36b22 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -1037,7 +1037,8 @@ def run_agentbench(parallel, output, timeout, verbose, condition, model, repetit
                f"({len(work_queue)} total) with {parallel} workers")
 
     # --- Pre-pull Docker images ---
-    from lib.docker_runner import REMOTE_DOCKER_HOST
+    from lib.docker_runner import REMOTE_DOCKER_HOST, cleanup_stale_containers
+    cleanup_stale_containers(remote_host=REMOTE_DOCKER_HOST)
     unique_images = sorted(set(i.docker_image for i in instances))
     remote_host = REMOTE_DOCKER_HOST
     pull_target = f" on {remote_host}" if remote_host else ""
@@ -1131,6 +1132,16 @@ def _run_one(item: tuple[AgentbenchInstance, Condition, int]) -> TaskResult:
             progress_callback=progress_callback,
         )
 
+        # Clean up local workspace after evaluation to avoid filling disk.
+        # Docker execution happens on chronos; local clone is only needed
+        # for setup + sync and can be removed once results are collected.
+        task_hash = format(hash(inst.instance_id) % 0xFFFF, '04x')
+        ws_name = f"{inst.repo}-{inst.base_sha[:8]}-{task_hash}-{cond.value}-r{rep}"
+        ws_path = workspaces_dir / ws_name
+        if ws_path.exists():
+            import shutil
+            shutil.rmtree(ws_path, ignore_errors=True)
+
         if result.error:
             newly_tripped = _cb_record(inst.instance_id, cond.value, result.error)
             if newly_tripped:
diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index b1ef5bf..248f492 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -5,9 +5,11 @@
 import subprocess
 import threading
 import time
+from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable
+from uuid import uuid4
 
 logger = logging.getLogger(__name__)
 
@@ -24,6 +26,9 @@
     "-o", "ConnectTimeout=10",
     "-o", "ServerAliveInterval=30",
     "-o", "ServerAliveCountMax=3",
+    "-o", "ControlMaster=auto",
+    "-o", "ControlPath=/tmp/ssh-eval-%r@%h:%p",
+    "-o", "ControlPersist=300",
 ]
 
 # Timeout (seconds) for sync operations (mkdir, tar upload/download).
@@ -386,3 +391,127 @@ def _drain(stream, target: list[str], key: str):
     finally:
         if log_file:
             log_file.close()
+
+
+# ---------------------------------------------------------------------------
+# Persistent container API (for AGENTbench: start once, exec many)
+# ---------------------------------------------------------------------------
+
+def start_container(
+    image: str,
+    workdir: str = "/testbed",
+    memory: str = "4g",
+    cpus: str = "1",
+    network: str = "bridge",
+    remote_host: str | None = None,
+) -> str:
+    """Start a persistent container. Returns container name."""
+    name = f"eval-{uuid4().hex[:12]}"
+    cmd = [
+        "docker", "run", "-d", "--name", name,
+        "--memory", memory, "--cpus", cpus,
+        "--network", network, "-w", workdir,
+        image, "sleep", "infinity",
+    ]
+    if remote_host:
+        escaped = " ".join(_sh_quote(p) for p in cmd)
+        cmd = ["ssh", *_SSH_OPTS, remote_host, escaped]
+    subprocess.run(cmd, check=True, capture_output=True, timeout=60)
+    return name
+
+
+def exec_in_container(
+    name: str,
+    command: str,
+    timeout: int = 120,
+    remote_host: str | None = None,
+    stream_log: str | Path | None = None,
+    heartbeat_interval: int = 20,
+    heartbeat_callback: Callable[[float, int, int], None] | None = None,
+) -> DockerResult:
+    """Run command inside persistent container via docker exec."""
+    cmd = ["docker", "exec", name, "bash", "-lc", command]
+    if remote_host:
+        escaped = " ".join(_sh_quote(p) for p in cmd)
+        cmd = ["ssh", *_SSH_OPTS, remote_host, escaped]
+    return _exec_cmd(
+        cmd, timeout=timeout, stream_log=stream_log,
+        heartbeat_interval=heartbeat_interval,
+        heartbeat_callback=heartbeat_callback,
+    )
+
+
+def stop_container(name: str, remote_host: str | None = None) -> None:
+    """Stop and remove container. Safe to call if container doesn't exist.
+
+    Best-effort: ordinary errors are logged at debug level, not raised.
+    `docker stop` and `docker rm -f` are attempted independently so a failed
+    or hung stop cannot skip the forced removal.
+    """
+    for c in (["docker", "stop", "-t", "1", name], ["docker", "rm", "-f", name]):
+        if remote_host:
+            c = ["ssh", *_SSH_OPTS, remote_host,
+                 " ".join(_sh_quote(p) for p in c)]
+        try:
+            subprocess.run(c, capture_output=True, timeout=30)
+        except Exception:
+            # Swallow so the next step still runs; this is called from a
+            # `finally`, where raising would mask the caller's own error.
+            logger.debug("teardown step failed for %s", name, exc_info=True)
+
+
+@contextmanager
+def persistent_container(image: str, **kwargs):
+    """Context manager for persistent container lifecycle."""
+    name = start_container(image, **kwargs)
+    try:
+        yield name
+    finally:
+        stop_container(name, remote_host=kwargs.get("remote_host"))
+
+
+def copy_into_container(
+    name: str,
+    local_dir: str,
+    container_dir: str,
+    remote_host: str | None = None,
+    excludes: list[str] | None = None,
+) -> None:
+    """Overlay local directory onto container path via tar pipe.
+
+    Excludes .git by default (200-400 MB for large repos).
+    """
+    exclude_args = ["--exclude=.git"]
+    for e in (excludes or []):
+        exclude_args.append(f"--exclude={e}")
+    exclude_str = " ".join(exclude_args)
+
+    if remote_host:
+        ssh_opts_str = " ".join(_SSH_OPTS)
+        subprocess.run(
+            f"tar -cf - {exclude_str} -C {_sh_quote(local_dir)} . | "
+            f"ssh {ssh_opts_str} {remote_host} "
+            f"'docker exec -i {_sh_quote(name)} tar -xf - -C {_sh_quote(container_dir)}'",
+            shell=True, check=True, capture_output=True, timeout=_SYNC_TIMEOUT,
+        )
+    else:
+        subprocess.run(
+            f"tar -cf - {exclude_str} -C {_sh_quote(local_dir)} . | "
+            f"docker exec -i {_sh_quote(name)} tar -xf - -C {_sh_quote(container_dir)}",
+            shell=True, check=True, capture_output=True, timeout=_SYNC_TIMEOUT,
+        )
+
+
+def cleanup_stale_containers(remote_host: str | None = None) -> None:
+    """Force-remove every container whose name starts with eval- (best-effort)."""
+    # Anchored + quoted: a bare name=eval- is a substring match ("retrieval-1").
+    cmd = "docker ps -a --filter 'name=^eval-' -q | xargs -r docker rm -f"
+    try:
+        if remote_host:
+            argv = ["ssh", *_SSH_OPTS, remote_host, cmd]
+            subprocess.run(argv, capture_output=True, timeout=30)
+        else:
+            subprocess.run(cmd, shell=True, capture_output=True, timeout=30)
+    except Exception:
+        # Cleanup must not block a run; leave a trace instead of a bare pass.
+        logger.debug("stale container cleanup failed", exc_info=True)
diff --git a/eval-harness/tests/test_agentbench.py b/eval-harness/tests/test_agentbench.py
index fa65c01..add2254 100644
--- a/eval-harness/tests/test_agentbench.py
+++ b/eval-harness/tests/test_agentbench.py
@@ -15,6 +15,7 @@
     evaluate_instance,
     build_prompt,
 )
+from lib.docker_runner import DockerResult
 from lib.task_runner import Condition
 
 
@@ -275,89 +276,75 @@ def test_non_dict_json(self):
 
 class TestEvaluateInstance:
 
-    def _setup_results(self, ws: Path, pr_results: dict | None, repo_results: dict | None):
-        """Write test result files that the evaluator will parse."""
-        if pr_results is not None:
-            (ws / "pr_test_results.json").write_text(json.dumps(pr_results))
-        if repo_results is not None:
-            (ws / "test_results.json").write_text(json.dumps(repo_results))
-
-    @patch("lib.agentbench_runner.run_in_docker")
-    def test_all_pass(self, mock_docker):
+    def _mock_exec_fn(self, pr_results=None, repo_results=None):
+        """Build a side_effect for exec_in_container that returns test results."""
+        pr_json = json.dumps(pr_results) if pr_results is not None else ""
+        repo_json = json.dumps(repo_results) if repo_results is not None else ""
+
+        def _side_effect(name, command, **kwargs):
+            if command.startswith("rm -f"):
+                return DockerResult(exit_code=0, stdout="", stderr="")
+            if "pr_test_results.json" in command and command.startswith("cat"):
+                if pr_results is None:
+                    return DockerResult(exit_code=1, stdout="", stderr="No such file")
+                return DockerResult(exit_code=0, stdout=pr_json, stderr="")
+            if "test_results.json" in command and command.startswith("cat"):
+                if repo_results is None:
+                    return DockerResult(exit_code=1, stdout="", stderr="No such file")
+                return DockerResult(exit_code=0, stdout=repo_json, stderr="")
+            return DockerResult(exit_code=0, stdout="", stderr="")
+
+        return _side_effect
+
+    @patch("lib.agentbench_runner.exec_in_container")
+    def test_all_pass(self, mock_exec_ctr):
         inst = _make_instance(
             repo_test_after_pr_patch={"test_a": True, "test_b": True},
         )
-        mock_docker.return_value = MagicMock(exit_code=0)
-
-        with tempfile.TemporaryDirectory() as d:
-            ws = Path(d)
-            # Pre-write both result files (evaluator reads after Docker returns)
-            self._setup_results(ws, {"t1": True, "t2": True}, {"test_a": True, "test_b": True})
-
-            success, output = evaluate_instance(ws, inst, "test-image")
-            assert success is True
-            assert "INSTANCE: 2/2 passed" in output
-            assert "REGRESSION: 2/2 passed" in output
+        mock_exec_ctr.side_effect = self._mock_exec_fn(
+            {"t1": True, "t2": True}, {"test_a": True, "test_b": True},
+        )
+        success, output = evaluate_instance("eval-abc123", inst)
+        assert success is True
+        assert "INSTANCE: 2/2 passed" in output
+        assert "REGRESSION: 2/2 passed" in output
 
-    @patch("lib.agentbench_runner.run_in_docker")
-    def test_instance_test_fails(self, mock_docker):
+    @patch("lib.agentbench_runner.exec_in_container")
+    def test_instance_test_fails(self, mock_exec_ctr):
         inst = _make_instance()
-        mock_docker.return_value = MagicMock(exit_code=1)
+        mock_exec_ctr.side_effect = self._mock_exec_fn({"t1": True, "t2": False})
+        success, output = evaluate_instance("eval-abc123", inst)
+        assert success is False
+        assert "INSTANCE: 1/2 passed" in output
 
-        with tempfile.TemporaryDirectory() as d:
-            ws = Path(d)
-            self._setup_results(ws, {"t1": True, "t2": False}, None)
-
-            success, output = evaluate_instance(ws, inst, "test-image")
-            assert success is False
-            assert "INSTANCE: 1/2 passed" in output
-
-    @patch("lib.agentbench_runner.run_in_docker")
-    def test_missing_pr_results(self, mock_docker):
+    @patch("lib.agentbench_runner.exec_in_container")
+    def test_missing_pr_results(self, mock_exec_ctr):
         inst = _make_instance()
-        mock_docker.return_value = MagicMock(exit_code=1)
+        mock_exec_ctr.side_effect = self._mock_exec_fn()
+        success, output = evaluate_instance("eval-abc123", inst)
+        assert success is False
+        assert "missing or corrupt" in output
 
-        with tempfile.TemporaryDirectory() as d:
-            ws = Path(d)
-            # No result files written
-            success, output = evaluate_instance(ws, inst, "test-image")
-            assert success is False
-            assert "missing or corrupt" in output
-
-    @patch("lib.agentbench_runner.run_in_docker")
-    def test_empty_pr_results_not_treated_as_all_pass(self, mock_docker):
+    @patch("lib.agentbench_runner.exec_in_container")
+    def test_empty_pr_results_not_treated_as_all_pass(self, mock_exec_ctr):
         """Empty dict should not be treated as 'all tests passed'."""
         inst = _make_instance()
-        mock_docker.return_value = MagicMock(exit_code=0)
-
-        with tempfile.TemporaryDirectory() as d:
-            ws = Path(d)
-            # Empty results dict — all({}.values()) is True, but should not count as pass
-            self._setup_results(ws, {}, None)
+        mock_exec_ctr.side_effect = self._mock_exec_fn({})
+        success, output = evaluate_instance("eval-abc123", inst)
+        assert success is False
+        assert "0/0" in output
 
-            success, output = evaluate_instance(ws, inst, "test-image")
-            assert success is False
-            assert "0/0" in output
-
-    @patch("lib.agentbench_runner.run_in_docker")
-    def test_regression_flipped_tests(self, mock_docker):
+    @patch("lib.agentbench_runner.exec_in_container")
+    def test_regression_flipped_tests(self, mock_exec_ctr):
         inst = _make_instance(
             repo_test_after_pr_patch={"test_a": True, "test_b": True},
         )
-        mock_docker.return_value = MagicMock(exit_code=0)
-
-        with tempfile.TemporaryDirectory() as d:
-            ws = Path(d)
-            # Instance tests pass, but regression test_b flipped from True -> False
-            self._setup_results(
-                ws,
-                {"t1": True},
-                {"test_a": True, "test_b": False},
-            )
-
-            success, output = evaluate_instance(ws, inst, "test-image")
-            assert success is False
-            assert "regressions" in output
+        mock_exec_ctr.side_effect = self._mock_exec_fn(
+            {"t1": True}, {"test_a": True, "test_b": False},
+        )
+        success, output = evaluate_instance("eval-abc123", inst)
+        assert success is False
+        assert "regressions" in output
 
 
 # ── Runner: build_prompt ───────────────────────────────────────────────

From a30b7aa65881922b91ec112441a3daab4d0dd53c Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Mon, 20 Apr 2026 16:18:17 -0700
Subject: [PATCH 18/21] add throughput instrumentation, LPT scheduling,
 idle-timeout kill

Diagnostic findings (see moirai/scripts/analyze_harness_throughput.py):
  * Docker daemon is NOT the serializer (queue_wait p50<1s at parallel=8)
  * LLM generation owns 98% of Claude wall-clock; tool execution is 2%
  * Trial duration heterogeneity (17s-335s) kills effective concurrency

Changes:
  * task_runner/reporter: persist started_at, finished_at, cost_usd,
    docker_invocations, and stream_events on TaskResult so wall-clock
    concurrency and per-Claude-event timing can be reconstructed.
  * docker_runner: capture invoked_at / first_byte_at / finished_at
    per DockerResult. Gap between invoked_at and first_byte_at
    approximates the Docker daemon queue wait (plus image pull and
    container create); gap to finished_at is exec time.
  * claude_runner: add stream_events field on ClaudeResult. The
    streaming path records (t, type, tools, token deltas) per
    NDJSON line. Extract _wait_for_proc helper that enforces both
    hard timeout and the new idle_timeout (kills after N seconds
    with no new stream events, so stalled trials stop pinning the
    wall-clock at the hard ceiling).
  * cli: --schedule {fifo,lpt} with --prior-results-dir loads
    median wall_clock_seconds per task_id from prior trial JSONs
    and sorts the work queue longest-first. --idle-timeout wires
    through TaskRunner to claude_runner.
---
 eval-harness/lib/claude_runner.py             | 193 ++++++++++++-----
 eval-harness/lib/cli.py                       |  77 ++++++-
 eval-harness/lib/docker_runner.py             |  31 +++
 eval-harness/lib/reporter.py                  |   5 +
 eval-harness/lib/task_runner.py               |  86 +++++++-
 eval-harness/tests/test_claude_runner.py      | 184 +++++++++++++++-
 eval-harness/tests/test_cli.py                |  54 ++++-
 eval-harness/tests/test_task_runner.py        |  12 +-
 .../tests/test_trial_instrumentation.py       | 205 ++++++++++++++++++
 9 files changed, 779 insertions(+), 68 deletions(-)
 create mode 100644 eval-harness/tests/test_trial_instrumentation.py

diff --git a/eval-harness/lib/claude_runner.py b/eval-harness/lib/claude_runner.py
index ace268b..6fb7abe 100644
--- a/eval-harness/lib/claude_runner.py
+++ b/eval-harness/lib/claude_runner.py
@@ -5,7 +5,7 @@
 import time
 import json
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 
 
@@ -21,6 +21,10 @@ class ClaudeResult:
     timed_out: bool = False
     cost_usd: float = 0.0
     num_turns: int = 0
+    # Per-event arrival records from the stream-json path; one dict per
+    # NDJSON line we read. Empty when the fast (non-streaming) path runs.
+    # See _extract_stream_event for the dict shape.
+    stream_events: list[dict] = field(default_factory=list)
 
 
 def parse_claude_output(stdout: str) -> dict:
@@ -87,57 +91,92 @@ def parse_claude_output(stdout: str) -> dict:
         return {"input_tokens": 0, "output_tokens": 0, "tool_calls": 0}
 
 
-def _summarize_stream_event(line: str) -> str | None:
-    """Extract a human-readable summary from a stream-json NDJSON line.
+def _extract_stream_event(line: str, t: float) -> tuple[dict | None, str | None]:
+    """Parse a stream-json NDJSON line into (record, summary).
+
+    `record`  — minimal structured dict suitable for per-trial timing analysis,
+                with keys:
+                  t            relative seconds since claude_runner start
+                  type         "assistant" | "user" | "result" | "system" | ...
+                  tools        list[str] of tool names (assistant events only)
+                  out          output_tokens delta (when usage block present)
+                  cache_read   cache_read_input_tokens delta
+                  cache_create cache_creation_input_tokens delta
+                  inp          raw input_tokens delta (rare; usually all cached)
+                  num_turns    final turn count (result events only)
+                  cost_usd     cumulative cost (result events only)
+
+    `summary` — human-readable line for the live progress log, matching the
+                pre-existing format. None when the event isn't worth logging.
 
-    Returns a short string for interesting events (tool calls, results),
-    or None for events we don't care about logging.
+    Returns (None, None) on parse failure or non-dict payloads. The caller
+    should still record an arrival timestamp for unparseable lines if it
+    wants idle-gap visibility, but most care only about structured events.
     """
     try:
         event = json.loads(line)
     except (json.JSONDecodeError, TypeError):
-        return None
-
+        return None, None
     if not isinstance(event, dict):
-        return None
+        return None, None
 
     etype = event.get("type", "")
+    record: dict = {"t": round(t, 3), "type": etype}
 
-    # Assistant messages — look for tool_use blocks
     if etype == "assistant":
         msg = event.get("message", {})
         content = msg.get("content", []) if isinstance(msg, dict) else []
-        parts = []
+        usage = msg.get("usage", {}) if isinstance(msg, dict) else {}
+        if isinstance(usage, dict):
+            for short, long in (
+                ("inp", "input_tokens"),
+                ("out", "output_tokens"),
+                ("cache_read", "cache_read_input_tokens"),
+                ("cache_create", "cache_creation_input_tokens"),
+            ):
+                v = usage.get(long, 0)
+                if v:
+                    record[short] = v
+        tools: list[str] = []
+        parts: list[str] = []
         for block in (content if isinstance(content, list) else []):
             if not isinstance(block, dict):
                 continue
             if block.get("type") == "tool_use":
                 name = block.get("name", "?")
-                inp = block.get("input", {})
-                if not isinstance(inp, dict):
-                    inp = {}
+                tools.append(name)
+                inp = block.get("input", {}) if isinstance(block.get("input"), dict) else {}
                 if name in ("Read", "Edit", "Write"):
                     parts.append(f"{name} {inp.get('file_path', '?')}")
                 elif name == "Bash":
-                    cmd_str = inp.get("command", "?")
-                    parts.append(f"Bash: {cmd_str[:80]}")
+                    parts.append(f"Bash: {inp.get('command', '?')[:80]}")
                 elif name == "Grep":
                     parts.append(f"Grep: {inp.get('pattern', '?')}")
                 elif name == "Glob":
                     parts.append(f"Glob: {inp.get('pattern', '?')}")
                 else:
                     parts.append(name)
-        if parts:
-            return "  ".join(f"[tool] {p}" for p in parts)
-        return None
+        if tools:
+            record["tools"] = tools
+        summary = "  ".join(f"[tool] {p}" for p in parts) if parts else None
+        return record, summary
 
-    # Result event — final summary
     if etype == "result":
         cost = event.get("total_cost_usd", 0)
         turns = event.get("num_turns", "?")
-        return f"[result] {turns} turns, ${cost:.4f}"
+        if isinstance(cost, (int, float)):
+            record["cost_usd"] = cost
+        if isinstance(turns, int):
+            record["num_turns"] = turns
+        return record, f"[result] {turns} turns, ${cost:.4f}"
+
+    return record, None
 
-    return None
+
+def _summarize_stream_event(line: str) -> str | None:
+    """Back-compat wrapper. Prefer _extract_stream_event for new callers."""
+    _, summary = _extract_stream_event(line, 0.0)
+    return summary
 
 
 def parse_stream_json_output(lines: list[str]) -> dict:
@@ -187,6 +226,46 @@ def parse_stream_json_output(lines: list[str]) -> dict:
     }
 
 
+def _wait_for_proc(
+    proc: subprocess.Popen,
+    start: float,
+    timeout: float,
+    idle_timeout: float | None,
+    stream_events: list[dict],
+    poll_interval: float = 2.0,
+) -> str | None:
+    """Poll proc until it exits, is hard-killed at `timeout`, or is killed for
+    going `idle_timeout` seconds without a new stream event.
+
+    Returns None when the process finished on its own, `"timeout"` when we
+    killed it for the hard wall-clock ceiling, or `"idle"` when we killed it
+    because no new stream-json events arrived for `idle_timeout` seconds.
+
+    `stream_events` is read concurrently — it is appended from the stdout
+    reader thread. We only read the tail timestamp, which is safe because
+    list append and index access are atomic in CPython and the list is
+    monotonic (events are never removed).
+    """
+    deadline = start + timeout
+    while True:
+        if proc.poll() is not None:
+            return None
+        now = time.time()
+        if now >= deadline:
+            proc.kill()
+            proc.wait()
+            return "timeout"
+        if idle_timeout and idle_timeout > 0:
+            last_event_wall = (
+                start + stream_events[-1]["t"] if stream_events else start
+            )
+            if now - last_event_wall >= idle_timeout:
+                proc.kill()
+                proc.wait()
+                return "idle"
+        time.sleep(poll_interval)
+
+
 def run_claude(
     workspace: str,
     prompt: str,
@@ -195,6 +274,7 @@ def run_claude(
     model: str | None = None,
     extra_env: dict[str, str] | None = None,
     stderr_log: str | Path | None = None,
+    idle_timeout: float | None = None,
 ) -> ClaudeResult:
     """Run Claude Code CLI and capture metrics.
 
@@ -203,6 +283,12 @@ def run_claude(
             uses ``--output-format stream-json`` and writes human-readable
             event summaries (tool calls, results) to this file so callers
             can ``tail -f`` it for live monitoring.
+        idle_timeout: When set and > 0, kills the Claude subprocess if no new
+            stream-json events arrive for this many seconds. Only active on
+            the streaming path (stderr_log set). The diagnostic finding that
+            prompted this: failed trials sit silently on an API stall for
+            minutes before the hard timeout fires; idle_timeout reclaims
+            that tail cheaply.
     """
     output_format = "stream-json" if stderr_log else "json"
     cmd = [
@@ -285,12 +371,20 @@ def run_claude(
     log_path.parent.mkdir(parents=True, exist_ok=True)
     stdout_lines: list[str] = []
     stderr_lines: list[str] = []
+    stream_events: list[dict] = []
 
     def _drain_stdout(stream, log_file):
-        """Read stream-json stdout, write summaries to log, collect lines."""
+        """Read stream-json stdout, write summaries to log, collect lines.
+
+        Also records per-event arrival metadata (relative seconds since
+        `start`) for downstream throughput analysis. The list is appended
+        from this thread only, so no lock is needed.
+        """
         for line in stream:
             stdout_lines.append(line)
-            summary = _summarize_stream_event(line)
+            record, summary = _extract_stream_event(line, time.time() - start)
+            if record is not None:
+                stream_events.append(record)
             if summary:
                 log_file.write(summary + "\n")
                 log_file.flush()
@@ -335,46 +429,46 @@ def _drain_stderr(stream, log_file):
             out_reader.start()
             err_reader.start()
 
-            # Wait for process, enforcing timeout
-            try:
-                proc.wait(timeout=timeout)
-            except subprocess.TimeoutExpired:
-                proc.kill()
-                proc.wait()
-                out_reader.join(timeout=5)
-                err_reader.join(timeout=5)
-                elapsed = time.time() - start
-                # Parse whatever we got before timeout
-                metrics = parse_stream_json_output(stdout_lines)
-                return ClaudeResult(
-                    exit_code=-1,
-                    wall_clock_seconds=elapsed,
-                    input_tokens=metrics["input_tokens"],
-                    output_tokens=metrics["output_tokens"],
-                    tool_calls=metrics["tool_calls"],
-                    stdout="".join(stdout_lines),
-                    stderr="".join(stderr_lines) or "Command timed out",
-                    timed_out=True,
-                    cost_usd=metrics.get("cost_usd", 0),
-                    num_turns=metrics.get("num_turns", 0),
-                )
+            # Wait for process. The helper enforces both the hard `timeout`
+            # ceiling and the optional `idle_timeout` (no new stream events).
+            kill_reason = _wait_for_proc(
+                proc, start, timeout, idle_timeout, stream_events
+            )
 
             out_reader.join(timeout=5)
             err_reader.join(timeout=5)
             elapsed = time.time() - start
             metrics = parse_stream_json_output(stdout_lines)
 
+            if kill_reason == "idle":
+                # Annotate stderr so operators can distinguish idle-kills
+                # from hard timeouts in post-hoc analysis.
+                note = (
+                    f"Killed: no stream events for "
+                    f"{idle_timeout:.0f}s (total {elapsed:.0f}s)"
+                )
+                stderr_text = (
+                    "".join(stderr_lines)
+                    + ("\n" if stderr_lines else "")
+                    + note
+                )
+            elif kill_reason == "timeout":
+                stderr_text = "".join(stderr_lines) or "Command timed out"
+            else:
+                stderr_text = "".join(stderr_lines)
+
             return ClaudeResult(
-                exit_code=proc.returncode,
+                exit_code=-1 if kill_reason else proc.returncode,
                 wall_clock_seconds=elapsed,
                 input_tokens=metrics["input_tokens"],
                 output_tokens=metrics["output_tokens"],
                 tool_calls=metrics["tool_calls"],
                 stdout="".join(stdout_lines),
-                stderr="".join(stderr_lines),
-                timed_out=False,
+                stderr=stderr_text,
+                timed_out=bool(kill_reason),
                 cost_usd=metrics.get("cost_usd", 0),
                 num_turns=metrics.get("num_turns", 0),
+                stream_events=stream_events,
             )
     except OSError as e:
         elapsed = time.time() - start
@@ -387,4 +481,5 @@ def _drain_stderr(stream, log_file):
             stdout="",
             stderr=f"Failed to start process: {e}",
             timed_out=False,
+            stream_events=stream_events,
         )
diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index 1f36b22..ac79326 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -94,6 +94,49 @@ def _load_prior_results(json_path: str) -> tuple[set[tuple[str, str]], set[tuple
     return passed, genuine_failures, data
 
 
+def _load_durations_from_dir(results_dir: Path) -> dict[str, float]:
+    """Scan <results_dir>/trials/*.json and return median wall_clock_seconds per
+    task_id across all conditions and reps (upper middle for even counts). Used
+    by LPT scheduling to predict how long each task will take on its next run.
+
+    Returns an empty dict when the directory or trials are absent. Unreadable
+    files and non-object JSON are skipped; so are trials with non-positive or
+    missing wall_clock_seconds (typically pre-validation or infra failures).
+    """
+    trials_dir = results_dir / "trials"
+    if not trials_dir.is_dir():
+        return {}
+    by_task: dict[str, list[float]] = {}
+    for f in trials_dir.glob("*.json"):
+        try:
+            d = json.loads(f.read_text())
+        except (ValueError, OSError):  # ValueError: bad JSON or bad UTF-8
+            continue
+        tid = d.get("task_id") if isinstance(d, dict) else None
+        wall = d.get("wall_clock_seconds") if isinstance(d, dict) else None
+        if tid and isinstance(wall, (int, float)) and wall > 0:
+            by_task.setdefault(tid, []).append(float(wall))
+    return {
+        tid: sorted(walls)[len(walls) // 2]
+        for tid, walls in by_task.items()
+    }
+
+
+def _sort_lpt(
+    work_queue: list, durations: dict[str, float], default: float
+) -> list:
+    """Sort work queue by predicted duration descending. Items whose task_id is
+    not in `durations` get `default` (the across-task median), so they land in
+    the middle of the queue rather than first-or-last by accident. Stable sort
+    preserves original order within same-predicted-duration groups.
+    """
+    return sorted(
+        work_queue,
+        key=lambda item: durations.get(item[1].id, default),
+        reverse=True,
+    )
+
+
 def _is_infra_error_dict(cond_data: dict) -> bool:
     """Check if a condition dict represents an infrastructure error."""
     error = cond_data.get("error")
@@ -389,7 +432,18 @@ def scan(repo, output, since, limit, docker_image, setup, test_command, branch):
               help="Prior results JSON — skip passed pairs, re-run infra errors")
 @click.option("--retry-all", is_flag=True,
               help="With --resume: also retry genuine failures, not just infra errors")
-def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, verbose, clear_cache, no_cache, cache_dir, condition, model, repetitions, resume, retry_all):
+@click.option("--schedule", type=click.Choice(["fifo", "lpt"]), default="fifo",
+              help="Task ordering: fifo (default) or lpt (longest predicted "
+                   "duration first, to shorten makespan under parallelism)")
+@click.option("--prior-results-dir", default=None, type=click.Path(exists=True),
+              help="Output dir with prior trial JSONs to use for LPT duration "
+                   "estimates. Defaults to --output if it already has trials.")
+@click.option("--idle-timeout", default=0.0, type=float,
+              help="Kill a Claude subprocess if no new stream events arrive "
+                   "for this many seconds (0 disables). Useful for reclaiming "
+                   "time from stalled trials that would otherwise pin at the "
+                   "hard --timeout ceiling.")
+def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, verbose, clear_cache, no_cache, cache_dir, condition, model, repetitions, resume, retry_all, schedule, prior_results_dir, idle_timeout):
     """Run eval on task files."""
     # Validate task files exist
     for task_path in tasks:
@@ -473,6 +527,26 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve
         if pre_validated_tasks:
             click.echo(f"Resume: {len(pre_validated_tasks)} task(s) will skip pre-validation")
 
+    if schedule == "lpt":
+        src_dir = Path(prior_results_dir) if prior_results_dir else Path(output)
+        durations = _load_durations_from_dir(src_dir)
+        if not durations:
+            click.echo(
+                f"\u26a0 --schedule lpt requested but no prior trial JSONs in "
+                f"{src_dir}/trials/; falling back to fifo. Pass "
+                f"--prior-results-dir to point at an earlier output dir."
+            )
+        else:
+            sorted_durs = sorted(durations.values())
+            default_dur = sorted_durs[len(sorted_durs) // 2]
+            work_queue = _sort_lpt(work_queue, durations, default_dur)
+            covered = sum(1 for item in work_queue if item[1].id in durations)
+            click.echo(
+                f"Schedule: LPT — {covered}/{len(work_queue)} items have prior "
+                f"durations (median fallback {default_dur:.0f}s). Longest "
+                f"predicted task runs first."
+            )
+
     if dry_run:
         click.echo("\nDry run - would execute:")
         for _repo, task, cond, rep in work_queue:
@@ -618,6 +692,7 @@ def run_single(item):
             pre_val_cache=pre_val_cache,
             claude_timeout=timeout,
             skip_pre_validation_for=pre_validated_tasks,
+            claude_idle_timeout=idle_timeout if idle_timeout > 0 else None,
         )
         result = runner.run(task, condition, model=model, rep=rep)
 
diff --git a/eval-harness/lib/docker_runner.py b/eval-harness/lib/docker_runner.py
index 248f492..c9268ce 100644
--- a/eval-harness/lib/docker_runner.py
+++ b/eval-harness/lib/docker_runner.py
@@ -7,6 +7,7 @@
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Callable
 from uuid import uuid4
@@ -43,6 +44,15 @@ class DockerResult:
     stdout: str
     stderr: str
     timed_out: bool = False
+    # Wall-clock instrumentation. invoked_at is when Popen was called on
+    # `docker run`; first_byte_at is when the container emitted its first
+    # stdout/stderr line (proxy for "container is running"); finished_at is
+    # when the subprocess returned. The gap between invoked_at and
+    # first_byte_at ≈ Docker daemon queue + image pull + container create.
+    # first_byte_at is None on the fast (non-streaming) path.
+    invoked_at: str | None = None
+    first_byte_at: str | None = None
+    finished_at: str | None = None
 
 
 # ---------------------------------------------------------------------------
@@ -289,6 +299,7 @@ def _exec_cmd(
 
     # Fast path: no streaming/heartbeat needed
     if stream_log is None and heartbeat_callback is None:
+        invoked_at = datetime.now(timezone.utc).isoformat()
         try:
             result = subprocess.run(
                 cmd, capture_output=True, text=True, timeout=timeout,
@@ -297,11 +308,15 @@ def _exec_cmd(
                 exit_code=result.returncode,
                 stdout=result.stdout,
                 stderr=result.stderr,
+                invoked_at=invoked_at,
+                finished_at=datetime.now(timezone.utc).isoformat(),
             )
         except subprocess.TimeoutExpired:
             return DockerResult(
                 exit_code=-1, stdout="",
                 stderr="Command timed out", timed_out=True,
+                invoked_at=invoked_at,
+                finished_at=datetime.now(timezone.utc).isoformat(),
             )
 
     # Streaming path with heartbeat support
@@ -314,6 +329,9 @@ def _exec_cmd(
     stdout_lines: list[str] = []
     stderr_lines: list[str] = []
     line_counts = {"stdout": 0, "stderr": 0}
+    # Single-element holder so _drain can write first_byte_at without
+    # needing nonlocal; protected by the same lock that guards line_counts.
+    first_byte_holder: list[str | None] = [None]
     lock = threading.Lock()
 
     def _drain(stream, target: list[str], key: str):
@@ -321,10 +339,13 @@ def _drain(stream, target: list[str], key: str):
             target.append(line)
             with lock:
                 line_counts[key] += 1
+                if first_byte_holder[0] is None:
+                    first_byte_holder[0] = datetime.now(timezone.utc).isoformat()
             if log_file:
                 log_file.write(f"[{key}] {line}")
                 log_file.flush()
 
+    invoked_at = datetime.now(timezone.utc).isoformat()
     try:
         proc = subprocess.Popen(
             cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
@@ -370,23 +391,33 @@ def _drain(stream, target: list[str], key: str):
         out_thread.join(timeout=5)
         err_thread.join(timeout=5)
 
+        finished_at = datetime.now(timezone.utc).isoformat()
+
         if timed_out:
             return DockerResult(
                 exit_code=-1,
                 stdout="".join(stdout_lines),
                 stderr="".join(stderr_lines) or "Command timed out",
                 timed_out=True,
+                invoked_at=invoked_at,
+                first_byte_at=first_byte_holder[0],
+                finished_at=finished_at,
             )
 
         return DockerResult(
             exit_code=proc.returncode,
             stdout="".join(stdout_lines),
             stderr="".join(stderr_lines),
+            invoked_at=invoked_at,
+            first_byte_at=first_byte_holder[0],
+            finished_at=finished_at,
         )
     except OSError as e:
         return DockerResult(
             exit_code=-1, stdout="",
             stderr=f"Failed to start process: {e}",
+            invoked_at=invoked_at,
+            finished_at=datetime.now(timezone.utc).isoformat(),
         )
     finally:
         if log_file:
diff --git a/eval-harness/lib/reporter.py b/eval-harness/lib/reporter.py
index c7352e4..6cc054a 100644
--- a/eval-harness/lib/reporter.py
+++ b/eval-harness/lib/reporter.py
@@ -618,10 +618,15 @@ def write_trial(self, result: 'TaskResult') -> str:
             "condition": result.condition.value,
             "rep": result.rep,
             "success": result.success,
+            "started_at": result.started_at,
+            "finished_at": result.finished_at,
             "wall_clock_seconds": result.wall_clock_seconds,
             "input_tokens": result.input_tokens,
             "output_tokens": result.output_tokens,
             "tool_calls": result.tool_calls,
+            "cost_usd": result.cost_usd,
+            "docker_invocations": result.docker_invocations,
+            "stream_events": result.stream_events,
             "lines_changed": result.lines_changed,
         }
         if result.error:
diff --git a/eval-harness/lib/task_runner.py b/eval-harness/lib/task_runner.py
index e386a60..2dd90db 100644
--- a/eval-harness/lib/task_runner.py
+++ b/eval-harness/lib/task_runner.py
@@ -9,6 +9,7 @@
 import threading
 import time
 from dataclasses import dataclass, field
+from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
 from typing import Callable
@@ -141,6 +142,41 @@ class TaskResult:
     error: str | None = None
     exit_code: int | None = None
     is_timeout: bool = False
+    # Wall-clock instrumentation. started_at/finished_at are the trial-wrapper
+    # boundary (set in run()), needed to reconstruct concurrency-by-second.
+    # cost_usd comes from ClaudeResult on paths that actually invoked Claude;
+    # zero for pre-validation/skill-gen/infra failures.
+    started_at: str | None = None
+    finished_at: str | None = None
+    cost_usd: float = 0.0
+    # Per-Docker-invocation timings from DockerResult. Each dict has
+    # phase, invoked_at, first_byte_at, finished_at, exit_code, timed_out.
+    # Gap between invoked_at and first_byte_at ≈ Docker daemon queue wait.
+    docker_invocations: list[dict] = field(default_factory=list)
+    # Per-stream-event arrival records from the Claude fix call. Each dict has
+    # t (relative seconds since claude start), type, and optionally tools and
+    # token deltas. Empty for trials that never reached the fix call.
+    stream_events: list[dict] = field(default_factory=list)
+
+
+def _record_docker(invocations: list[dict] | None, phase: str, result) -> None:
+    """Append a Docker invocation record to the per-trial accumulator.
+
+    No-op when invocations is None (i.e. the caller passed no accumulator).
+    Uses getattr with defaults so that test mocks producing a partial result
+    object (no instrumentation fields) still work — the timing fields just
+    come through as None.
+    """
+    if invocations is None:
+        return
+    invocations.append({
+        "phase": phase,
+        "invoked_at": getattr(result, "invoked_at", None),
+        "first_byte_at": getattr(result, "first_byte_at", None),
+        "finished_at": getattr(result, "finished_at", None),
+        "exit_code": getattr(result, "exit_code", None),
+        "timed_out": getattr(result, "timed_out", False),
+    })
 
 
 class PreValidationError(Exception):
@@ -166,6 +202,7 @@ def __init__(
         claude_timeout: int = 300,
         skip_pre_validation_for: frozenset[str] = frozenset(),
         pre_validation_timeout: int = PRE_VALIDATION_TIMEOUT,
+        claude_idle_timeout: float | None = None,
     ):
         self.repo = repo
         self.workspaces_dir = Path(workspaces_dir)
@@ -177,6 +214,7 @@ def __init__(
         self.claude_timeout = claude_timeout
         self._skip_pre_validation_for = skip_pre_validation_for
         self._pre_validation_timeout = pre_validation_timeout
+        self.claude_idle_timeout = claude_idle_timeout
 
     def _progress(self, task_id: str, condition: str, step: str, message: str = ""):
         """Report progress if callback is set."""
@@ -220,6 +258,7 @@ def _pre_validate(
         task_id: str = "pre_validate",
         condition: str = "none",
         stream_log: str | Path | None = None,
+        docker_invocations: list[dict] | None = None,
     ) -> str | None:
         """Validate that a task is runnable before spending API tokens.
 
@@ -265,6 +304,7 @@ def _pre_validate(
                     task_id, condition, "pre_validate_live"
                 ),
             )
+            _record_docker(docker_invocations, "pre_validate_setup", result)
             if result.timed_out:
                 raise PreValidationError(
                     "Docker setup timed out during pre-validation."
@@ -296,6 +336,7 @@ def _pre_validate(
                     task_id, condition, "pre_validate_live"
                 ),
             )
+            _record_docker(docker_invocations, "pre_validate_test", result)
 
             # 2. The test MUST fail at pre_fix_commit (that's the whole point)
             if task.prompt_source == "failing_test" and result.exit_code == 0:
@@ -625,8 +666,24 @@ def warm_cache(
                 shutil.rmtree(workspace)
 
     def run(self, task: Task, condition: Condition, model: str | None = None, rep: int = 0) -> TaskResult:
+        """Execute a single task and stamp wall-clock boundaries on the result.
+
+        started_at / finished_at let downstream tooling reconstruct
+        concurrency-by-second across trials, which a relative
+        wall_clock_seconds alone cannot answer.
+        """
+        started_at = datetime.now(timezone.utc).isoformat()
+        result = self._run_inner(task, condition, model=model, rep=rep)
+        result.started_at = started_at
+        result.finished_at = datetime.now(timezone.utc).isoformat()
+        return result
+
+    def _run_inner(self, task: Task, condition: Condition, model: str | None = None, rep: int = 0) -> TaskResult:
         """Execute a single task under the given condition."""
         cond_str = condition.value
+        # Per-trial accumulator for Docker timings. Threaded into _pre_validate
+        # and _build_prompt so each docker call records its queue/exec split.
+        docker_invocations: list[dict] = []
         self._progress(task.id, cond_str, "setup", "creating workspace")
         workspace = self.setup_workspace(task, condition, rep=rep)
 
@@ -680,6 +737,7 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                         task_id=task.id,
                         condition=cond_str,
                         stream_log=precheck_log,
+                        docker_invocations=docker_invocations,
                     ),
                 )
                 self._progress(task.id, cond_str, "pre_validate_done", "pre-validation passed")
@@ -697,6 +755,7 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                     task_id=task.id,
                     condition=cond_str,
                     stream_log=precheck_log,
+                    docker_invocations=docker_invocations,
                 )
                 self._progress(task.id, cond_str, "pre_validate_done", "pre-validation passed")
 
@@ -789,7 +848,7 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
 
             # Build prompt with condition-appropriate preamble
             self._progress(task.id, cond_str, "prompt", "building prompt")
-            prompt = self._build_prompt(task, workspace, condition, cached_test_output=pre_validate_output)
+            prompt = self._build_prompt(task, workspace, condition, cached_test_output=pre_validate_output, docker_invocations=docker_invocations)
 
             # Run Claude on the task
             fix_log = self._build_run_log_path(task, cond_str, "fix", rep)
@@ -799,7 +858,8 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                 fix_extra_env = {"CLAUDE_PLUGIN_ROOT": PLUGIN_ROOT}
             claude_result = run_claude(workspace, prompt, timeout=self.claude_timeout,
                                        model=model, stderr_log=str(fix_log),
-                                       extra_env=fix_extra_env)
+                                       extra_env=fix_extra_env,
+                                       idle_timeout=self.claude_idle_timeout)
             self._progress(task.id, cond_str, "claude_done", f"completed in {claude_result.wall_clock_seconds:.1f}s, {claude_result.tool_calls} tool calls")
 
             # Detect empty runs: Claude returned without doing any work
@@ -830,6 +890,9 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                         f"prompt_bytes={prompt_size}{stderr_info})"
                     ),
                     exit_code=claude_result.exit_code,
+                    cost_usd=claude_result.cost_usd,
+                    docker_invocations=docker_invocations,
+                    stream_events=claude_result.stream_events,
                 )
 
             # Detect timeout: Claude ran out of time
@@ -852,6 +915,9 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                     ),
                     exit_code=claude_result.exit_code,
                     is_timeout=True,
+                    cost_usd=claude_result.cost_usd,
+                    docker_invocations=docker_invocations,
+                    stream_events=claude_result.stream_events,
                 )
 
             # Run tests — use targeted test file when available (~150s → ~15s)
@@ -881,6 +947,7 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                     task.id, cond_str, "test_live"
                 ),
             )
+            _record_docker(docker_invocations, "post_test", test_result)
             test_status = "PASSED" if test_result.exit_code == 0 else "FAILED"
             self._progress(task.id, cond_str, "test_done", f"tests {test_status}")
 
@@ -908,6 +975,9 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                 skill_generation=skill_metrics,
                 agents_files_read=agents_files_read,
                 exit_code=claude_result.exit_code,
+                cost_usd=claude_result.cost_usd,
+                docker_invocations=docker_invocations,
+                stream_events=claude_result.stream_events,
             )
         except PreValidationError as e:
             logger.warning("Pre-validation failed for %s (%s): %s", task.id, cond_str, e)
@@ -923,7 +993,8 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                 lines_changed=0,
                 files_touched=[],
                 rep=rep,
-                error=f"[pre-validation] {e}"
+                error=f"[pre-validation] {e}",
+                docker_invocations=docker_invocations,
             )
         except SkillGenerationError as e:
             logger.warning("Skill generation failed for %s (%s): %s", task.id, cond_str, e)
@@ -939,7 +1010,8 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                 lines_changed=0,
                 files_touched=[],
                 rep=rep,
-                error=f"[skill-generation] {e}"
+                error=f"[skill-generation] {e}",
+                docker_invocations=docker_invocations,
             )
         except Exception as e:
             logger.error("Infrastructure error in task %s (%s): %s", task.id, cond_str, e, exc_info=True)
@@ -955,7 +1027,8 @@ def run(self, task: Task, condition: Condition, model: str | None = None, rep: i
                 lines_changed=0,
                 files_touched=[],
                 rep=rep,
-                error=f"[infrastructure] {e}"
+                error=f"[infrastructure] {e}",
+                docker_invocations=docker_invocations,
             )
 
     def setup_workspace(self, task: Task, condition: Condition, rep: int = 0) -> str:
@@ -976,7 +1049,7 @@ def setup_workspace(self, task: Task, condition: Condition, rep: int = 0) -> str
 
         return str(workspace)
 
-    def _build_prompt(self, task: Task, workspace: str, condition: Condition, cached_test_output: str | None = None) -> str:
+    def _build_prompt(self, task: Task, workspace: str, condition: Condition, cached_test_output: str | None = None, docker_invocations: list[dict] | None = None) -> str:
         """Build the appropriate prompt based on task config.
 
         Args:
@@ -1018,6 +1091,7 @@ def _build_prompt(self, task: Task, workspace: str, condition: Condition, cached
                 test_cmd,
                 timeout=PRE_VALIDATION_TIMEOUT
             )
+            _record_docker(docker_invocations, "prompt_build_test", result)
             return build_prompt_from_failing_test(
                 result.stdout + result.stderr, preamble=preamble
             )
diff --git a/eval-harness/tests/test_claude_runner.py b/eval-harness/tests/test_claude_runner.py
index add6066..d97ce7e 100644
--- a/eval-harness/tests/test_claude_runner.py
+++ b/eval-harness/tests/test_claude_runner.py
@@ -8,6 +8,8 @@
     parse_claude_output,
     parse_stream_json_output,
     _summarize_stream_event,
+    _extract_stream_event,
+    _wait_for_proc,
 )
 
 
@@ -169,9 +171,8 @@ def test_run_claude_stream_json_logs_tool_calls(tmp_path):
     assert "[result]" in log_content
 
 
-def test_run_claude_stream_json_timeout_preserves_partial(tmp_path):
-    """Test that stream-json path preserves partial metrics on timeout."""
-    import subprocess
+def test_run_claude_stream_json_timeout_preserves_partial(tmp_path, monkeypatch):
+    """stream-json path preserves partial metrics when hard timeout fires."""
     log_file = tmp_path / "timeout.log"
 
     # One assistant event before timeout
@@ -182,15 +183,21 @@ def test_run_claude_stream_json_timeout_preserves_partial(tmp_path):
         ]}
     })
 
+    # Fake time progression: start at 0, jump past the 5s deadline on the next
+    # check. The _wait_for_proc loop reads time.time() once per poll, so we
+    # need enough ticks to cover: start stamp, the loop check, the elapsed
+    # calc after return, plus some slack.
+    times = iter([0.0, 0.0] + [6.0] * 20)
+    monkeypatch.setattr("lib.claude_runner.time.time", lambda: next(times))
+    monkeypatch.setattr("lib.claude_runner.time.sleep", lambda _s: None)
+
     with patch("lib.claude_runner.subprocess.Popen") as mock_popen:
         mock_proc = MagicMock()
         mock_proc.stdout.__iter__ = lambda self: iter([assistant_event + "\n"])
         mock_proc.stderr.__iter__ = lambda self: iter([])
-        mock_proc.wait.side_effect = [
-            subprocess.TimeoutExpired(cmd="claude", timeout=5),
-            None,
-        ]
+        mock_proc.poll.return_value = None  # still running — forces timeout branch
         mock_proc.kill.return_value = None
+        mock_proc.wait.return_value = 0
         mock_popen.return_value = mock_proc
 
         result = run_claude("/tmp/workspace", "Fix the bug", timeout=5, stderr_log=str(log_file))
@@ -261,6 +268,73 @@ def test_summarize_stream_event_ignores_system():
     assert _summarize_stream_event(line) is None
 
 
+def test_extract_stream_event_assistant_with_tool_and_usage():
+    """Assistant event records arrival time, tool names, and token deltas."""
+    line = json.dumps({
+        "type": "assistant",
+        "message": {
+            "content": [
+                {"type": "text", "text": "thinking..."},
+                {"type": "tool_use", "name": "Bash", "input": {"command": "ls -la"}},
+            ],
+            "usage": {
+                "input_tokens": 4,
+                "output_tokens": 120,
+                "cache_read_input_tokens": 18000,
+                "cache_creation_input_tokens": 250,
+            },
+        },
+    })
+    record, summary = _extract_stream_event(line, t=12.345)
+    assert record is not None
+    assert record["t"] == 12.345
+    assert record["type"] == "assistant"
+    assert record["tools"] == ["Bash"]
+    assert record["inp"] == 4
+    assert record["out"] == 120
+    assert record["cache_read"] == 18000
+    assert record["cache_create"] == 250
+    assert summary is not None and "Bash" in summary
+
+
+def test_extract_stream_event_assistant_text_only():
+    """Assistant text-only event records timestamp + type but no tools, no summary."""
+    line = json.dumps({
+        "type": "assistant",
+        "message": {"content": [{"type": "text", "text": "ok"}], "usage": {"output_tokens": 3}},
+    })
+    record, summary = _extract_stream_event(line, t=1.0)
+    assert record is not None
+    assert record["type"] == "assistant"
+    assert "tools" not in record
+    assert record["out"] == 3
+    assert summary is None
+
+
+def test_extract_stream_event_result_captures_cost_and_turns():
+    line = json.dumps({"type": "result", "num_turns": 7, "total_cost_usd": 0.42})
+    record, summary = _extract_stream_event(line, t=99.0)
+    assert record is not None
+    assert record["type"] == "result"
+    assert record["num_turns"] == 7
+    assert record["cost_usd"] == 0.42
+    assert summary is not None and "[result]" in summary and "7 turns" in summary
+
+
+def test_extract_stream_event_user_event_records_timestamp():
+    """User events (tool_result) get a timestamp record but no summary line."""
+    line = json.dumps({"type": "user", "message": {"content": []}})
+    record, summary = _extract_stream_event(line, t=5.5)
+    assert record == {"t": 5.5, "type": "user"}
+    assert summary is None
+
+
+def test_extract_stream_event_unparseable_returns_none_pair():
+    record, summary = _extract_stream_event("not json", t=0.0)
+    assert record is None
+    assert summary is None
+
+
 def test_parse_stream_json_output_with_result():
     """Test parsing NDJSON with a result event."""
     lines = [
@@ -313,3 +387,99 @@ def test_claude_result_dataclass():
     )
     assert result.exit_code == 0
     assert result.wall_clock_seconds == 45.2
+
+
+# --- _wait_for_proc watchdog ---
+
+
+class _FakeProc:
+    """Mimics subprocess.Popen surface needed by _wait_for_proc.
+
+    `poll_sequence` is a list of values returned by successive poll() calls.
+    None means still-running; an int means the process exited with that code.
+    kill() appends -9 so poll() reports an exit once the scripted entries run out.
+    """
+
+    def __init__(self, poll_sequence: list):
+        self._seq = list(poll_sequence)
+        self._calls = 0
+        self.kill_called = False
+        self.wait_called = False
+
+    def poll(self):
+        if self._calls < len(self._seq):
+            v = self._seq[self._calls]
+            self._calls += 1
+            return v
+        return self._seq[-1]
+
+    def kill(self):
+        self.kill_called = True
+        # After kill, the next poll should show exit. Append a "killed" code
+        # at the end so poll() returns non-None on subsequent calls.
+        self._seq.append(-9)
+
+    def wait(self):
+        self.wait_called = True
+
+
+def test_wait_for_proc_returns_none_when_proc_exits_on_own(monkeypatch):
+    """Process exits normally → returns None, no kill."""
+    # Bypass real sleep and pin wall-clock before the hard deadline.
+    monkeypatch.setattr("lib.claude_runner.time.sleep", lambda _s: None)
+    monkeypatch.setattr("lib.claude_runner.time.time", lambda: 1.0)
+    proc = _FakeProc(poll_sequence=[None, 0])  # running, then exited
+    reason = _wait_for_proc(proc, start=0.0, timeout=60, idle_timeout=None,
+                            stream_events=[], poll_interval=0.01)
+    assert reason is None
+    assert not proc.kill_called
+
+
+def test_wait_for_proc_kills_on_hard_timeout(monkeypatch):
+    """now >= deadline → kill with reason 'timeout'."""
+    # Fake time.time progression: 0.0 (start), 61.0 (past deadline).
+    ticks = iter([0.0, 61.0, 61.0, 61.0])
+    monkeypatch.setattr("lib.claude_runner.time.time", lambda: next(ticks))
+    monkeypatch.setattr("lib.claude_runner.time.sleep", lambda _s: None)
+    proc = _FakeProc(poll_sequence=[None])
+    reason = _wait_for_proc(proc, start=0.0, timeout=60, idle_timeout=None,
+                            stream_events=[], poll_interval=0.01)
+    assert reason == "timeout"
+    assert proc.kill_called and proc.wait_called
+
+
+def test_wait_for_proc_kills_on_idle(monkeypatch):
+    """No stream events for idle_timeout → kill with reason 'idle'."""
+    ticks = iter([30.0, 30.0, 30.0])  # 30s since start, no events
+    monkeypatch.setattr("lib.claude_runner.time.time", lambda: next(ticks))
+    monkeypatch.setattr("lib.claude_runner.time.sleep", lambda _s: None)
+    proc = _FakeProc(poll_sequence=[None])
+    reason = _wait_for_proc(proc, start=0.0, timeout=600, idle_timeout=20,
+                            stream_events=[], poll_interval=0.01)
+    assert reason == "idle"
+    assert proc.kill_called
+
+
+def test_wait_for_proc_does_not_kill_when_events_are_fresh(monkeypatch):
+    """Recent stream event → not idle, proc continues until it exits on own."""
+    monkeypatch.setattr("lib.claude_runner.time.time", lambda: 30.0)
+    monkeypatch.setattr("lib.claude_runner.time.sleep", lambda _s: None)
+    proc = _FakeProc(poll_sequence=[None, 0])  # running, then exits clean
+    # Event arrived at t=29.0 relative — only 1s before our check at wall=30.0
+    events = [{"t": 29.0, "type": "assistant"}]
+    reason = _wait_for_proc(proc, start=0.0, timeout=600, idle_timeout=20,
+                            stream_events=events, poll_interval=0.01)
+    assert reason is None
+    assert not proc.kill_called
+
+
+def test_wait_for_proc_idle_timeout_disabled_when_zero_or_none(monkeypatch):
+    """idle_timeout=0 → no idle check, only hard timeout applies (None is not exercised here)."""
+    monkeypatch.setattr("lib.claude_runner.time.time", lambda: 50.0)
+    monkeypatch.setattr("lib.claude_runner.time.sleep", lambda _s: None)
+    proc = _FakeProc(poll_sequence=[None, 0])
+    # 50s into a 60s timeout, no events — would trip any positive idle threshold
+    reason = _wait_for_proc(proc, start=0.0, timeout=60, idle_timeout=0,
+                            stream_events=[], poll_interval=0.01)
+    assert reason is None
+    assert not proc.kill_called
diff --git a/eval-harness/tests/test_cli.py b/eval-harness/tests/test_cli.py
index d182281..d6ed2cb 100644
--- a/eval-harness/tests/test_cli.py
+++ b/eval-harness/tests/test_cli.py
@@ -1,7 +1,11 @@
 # tests/test_cli.py
+import json
+from pathlib import Path
+from types import SimpleNamespace
+
 import pytest
 from click.testing import CliRunner
-from lib.cli import main, scan, run
+from lib.cli import main, scan, run, _load_durations_from_dir, _sort_lpt
 
 
 @pytest.fixture
@@ -135,3 +139,51 @@ def test_run_clear_cache_integration(runner, tmp_path):
     # Should only have manifest, and manifest should be empty
     assert len(cache_contents) == 1
     assert cache_contents[0].name == "cache-manifest.json"
+
+
+# --- LPT scheduling helpers ---
+
+
+def _write_trial(trials_dir: Path, name: str, task_id: str, wall: float) -> None:
+    trials_dir.mkdir(parents=True, exist_ok=True)
+    (trials_dir / f"{name}.json").write_text(
+        json.dumps({"task_id": task_id, "wall_clock_seconds": wall})
+    )
+
+
+def test_load_durations_from_dir_returns_median_per_task(tmp_path):
+    """Multiple trials per task → median wall_clock_seconds; bad data ignored."""
+    trials = tmp_path / "trials"
+    _write_trial(trials, "alpha-r0", "alpha", 10.0)
+    _write_trial(trials, "alpha-r1", "alpha", 30.0)  # median pick
+    _write_trial(trials, "alpha-r2", "alpha", 50.0)
+    _write_trial(trials, "beta-r0", "beta", 100.0)
+    # Skipped: pre-validation failures are recorded with wall=0
+    _write_trial(trials, "gamma-r0", "gamma", 0.0)
+    # Skipped: corrupt JSON
+    (trials / "broken.json").write_text("{not json")
+
+    durations = _load_durations_from_dir(tmp_path)
+    assert durations == {"alpha": 30.0, "beta": 100.0}
+
+
+def test_load_durations_returns_empty_when_no_trials_dir(tmp_path):
+    assert _load_durations_from_dir(tmp_path) == {}
+    (tmp_path / "trials").mkdir()
+    assert _load_durations_from_dir(tmp_path) == {}
+
+
+def test_sort_lpt_orders_by_predicted_duration_desc():
+    """Known durations sort by value desc; unknown task uses default (middle)."""
+    items = [
+        ("repo", SimpleNamespace(id="short"), "none", 0),
+        ("repo", SimpleNamespace(id="long"), "none", 0),
+        ("repo", SimpleNamespace(id="unknown"), "none", 0),
+        ("repo", SimpleNamespace(id="medium"), "none", 0),
+    ]
+    durations = {"short": 5.0, "long": 200.0, "medium": 50.0}
+    sorted_items = _sort_lpt(items, durations, default=50.0)
+    ids = [item[1].id for item in sorted_items]
+    assert ids[0] == "long"          # 200s — first
+    assert ids[-1] == "short"        # 5s — last
+    assert set(ids[1:3]) == {"medium", "unknown"}  # both 50s — a tie, so their relative order is not asserted
diff --git a/eval-harness/tests/test_task_runner.py b/eval-harness/tests/test_task_runner.py
index 8cf1c6d..ac23e13 100644
--- a/eval-harness/tests/test_task_runner.py
+++ b/eval-harness/tests/test_task_runner.py
@@ -889,7 +889,8 @@ def fake_get_commit_message(workspace, commit):
         return "fix: something"
 
     def fake_run_claude(workspace, prompt, timeout=300, model=None,
-                        extra_env=None, stderr_log=None, max_turns=50):
+                        extra_env=None, stderr_log=None, max_turns=50,
+                        idle_timeout=None):
         captured_calls.append({
             "workspace": workspace,
             "extra_env": extra_env,
@@ -976,7 +977,8 @@ def fake_get_commit_message(workspace, commit):
         return "fix: something"
 
     def fake_run_claude(workspace, prompt, timeout=300, model=None,
-                        extra_env=None, stderr_log=None, max_turns=50):
+                        extra_env=None, stderr_log=None, max_turns=50,
+                        idle_timeout=None):
         captured_calls.append({"extra_env": extra_env})
         return type("ClaudeResult", (), {
             "exit_code": 0,
@@ -1047,7 +1049,8 @@ def fake_get_commit_message(workspace, commit):
         return "fix: something"
 
     def fake_run_claude(workspace, prompt, timeout=300, model=None,
-                        extra_env=None, stderr_log=None, max_turns=50):
+                        extra_env=None, stderr_log=None, max_turns=50,
+                        idle_timeout=None):
         # Check if .claude/settings.local.json was written
         settings_path = os.path.join(workspace, ".claude", "settings.local.json")
         if os.path.exists(settings_path):
@@ -1155,7 +1158,8 @@ def fake_check_or_generate_index(self, workspace, repo_url, commit,
         )
 
     def fake_run_claude(workspace, prompt, timeout=300, model=None,
-                        extra_env=None, stderr_log=None, max_turns=50):
+                        extra_env=None, stderr_log=None, max_turns=50,
+                        idle_timeout=None):
         # Capture the settings file at the time Claude runs
         settings_path = os.path.join(workspace, ".claude", "settings.local.json")
         if os.path.exists(settings_path):
diff --git a/eval-harness/tests/test_trial_instrumentation.py b/eval-harness/tests/test_trial_instrumentation.py
new file mode 100644
index 0000000..705c188
--- /dev/null
+++ b/eval-harness/tests/test_trial_instrumentation.py
@@ -0,0 +1,205 @@
+# tests/test_trial_instrumentation.py
+"""End-to-end coverage for wall-clock + cost fields added to TaskResult.
+
+The wrapper run() stamps started_at/finished_at; cost_usd is threaded in from
+ClaudeResult on paths that actually invoke Claude. These fields need to
+survive serialization through reporter.write_trial so downstream tooling
+can reconstruct concurrency-by-second.
+"""
+import json
+import tempfile
+
+from lib.reporter import Reporter
+from lib.task_runner import TaskResult, Condition
+
+
+def _read_trial(reporter: Reporter, result: TaskResult) -> dict:
+    path = reporter.write_trial(result)
+    with open(path) as f:
+        return json.load(f)
+
+
+def test_write_trial_preserves_instrumentation_fields():
+    """Populated started_at / finished_at / cost_usd round-trip through JSON."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        reporter = Reporter(tmpdir)
+        result = TaskResult(
+            task_id="instr-1",
+            condition=Condition.INTENT_LAYER,
+            success=True,
+            test_output="ok",
+            wall_clock_seconds=42.5,
+            input_tokens=300,
+            output_tokens=150,
+            tool_calls=8,
+            lines_changed=3,
+            files_touched=["a.py"],
+            rep=0,
+            started_at="2026-04-19T12:00:00+00:00",
+            finished_at="2026-04-19T12:00:42+00:00",
+            cost_usd=0.1234,
+        )
+        data = _read_trial(reporter, result)
+
+        assert data["started_at"] == "2026-04-19T12:00:00+00:00"
+        assert data["finished_at"] == "2026-04-19T12:00:42+00:00"
+        assert data["cost_usd"] == 0.1234
+
+
+def test_write_trial_serializes_defaults_for_new_fields():
+    """Defaulted fields serialize as null (timestamps) and 0.0 (cost)."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        reporter = Reporter(tmpdir)
+        result = TaskResult(
+            task_id="instr-2",
+            condition=Condition.NONE,
+            success=False,
+            test_output="",
+            wall_clock_seconds=0,
+            input_tokens=0,
+            output_tokens=0,
+            tool_calls=0,
+            lines_changed=0,
+            files_touched=[],
+            rep=0,
+        )
+        data = _read_trial(reporter, result)
+
+        assert data["started_at"] is None
+        assert data["finished_at"] is None
+        assert data["cost_usd"] == 0.0
+
+
+def test_write_trial_includes_docker_invocations():
+    """Per-Docker timings round-trip through the trial JSON as a list."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        reporter = Reporter(tmpdir)
+        invocations = [
+            {
+                "phase": "pre_validate_test",
+                "invoked_at": "2026-04-19T12:00:00+00:00",
+                "first_byte_at": "2026-04-19T12:00:02+00:00",
+                "finished_at": "2026-04-19T12:00:25+00:00",
+                "exit_code": 1,
+                "timed_out": False,
+            },
+            {
+                "phase": "post_test",
+                "invoked_at": "2026-04-19T12:01:30+00:00",
+                "first_byte_at": "2026-04-19T12:01:31+00:00",
+                "finished_at": "2026-04-19T12:01:50+00:00",
+                "exit_code": 0,
+                "timed_out": False,
+            },
+        ]
+        result = TaskResult(
+            task_id="instr-3",
+            condition=Condition.NONE,
+            success=True,
+            test_output="ok",
+            wall_clock_seconds=10.0,
+            input_tokens=0,
+            output_tokens=0,
+            tool_calls=0,
+            lines_changed=0,
+            files_touched=[],
+            rep=0,
+            docker_invocations=invocations,
+        )
+        data = _read_trial(reporter, result)
+
+        assert data["docker_invocations"] == invocations
+        assert data["docker_invocations"][0]["phase"] == "pre_validate_test"
+        assert data["docker_invocations"][1]["first_byte_at"] == "2026-04-19T12:01:31+00:00"
+
+
+def test_write_trial_includes_stream_events():
+    """Per-stream-event arrival records round-trip through the trial JSON."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        reporter = Reporter(tmpdir)
+        events = [
+            {"t": 0.05, "type": "system"},
+            {"t": 1.20, "type": "assistant", "tools": ["Bash"], "out": 80, "cache_read": 12000},
+            {"t": 8.40, "type": "user"},
+            {"t": 12.10, "type": "result", "num_turns": 3, "cost_usd": 0.01},
+        ]
+        result = TaskResult(
+            task_id="instr-stream-1",
+            condition=Condition.NONE,
+            success=True,
+            test_output="ok",
+            wall_clock_seconds=12.5,
+            input_tokens=12000,
+            output_tokens=80,
+            tool_calls=1,
+            lines_changed=0,
+            files_touched=[],
+            rep=0,
+            stream_events=events,
+        )
+        data = _read_trial(reporter, result)
+
+        assert data["stream_events"] == events
+        assert data["stream_events"][1]["tools"] == ["Bash"]
+        assert data["stream_events"][3]["num_turns"] == 3
+
+
+def test_record_docker_handles_partial_result_objects():
+    """_record_docker tolerates result objects without instrumentation fields.
+
+    Test mocks for run_in_docker often return a stub result with only
+    exit_code/stdout/stderr. The recorder should fill timing fields with None.
+    """
+    from lib.task_runner import _record_docker
+
+    class StubResult:
+        def __init__(self):
+            self.exit_code = 0
+            self.stdout = ""
+            self.stderr = ""
+
+    invocations: list[dict] = []
+    _record_docker(invocations, "test_phase", StubResult())
+
+    assert len(invocations) == 1
+    rec = invocations[0]
+    assert rec["phase"] == "test_phase"
+    assert rec["invoked_at"] is None
+    assert rec["first_byte_at"] is None
+    assert rec["finished_at"] is None
+    assert rec["exit_code"] == 0
+
+
+def test_run_wrapper_stamps_boundaries(monkeypatch):
+    """TaskRunner.run() sets started_at and finished_at around _run_inner."""
+    from lib import task_runner as tr
+
+    runner = tr.TaskRunner.__new__(tr.TaskRunner)
+
+    def fake_inner(self, task, condition, model=None, rep=0):
+        return TaskResult(
+            task_id="t",
+            condition=condition,
+            success=True,
+            test_output="",
+            wall_clock_seconds=1.0,
+            input_tokens=0,
+            output_tokens=0,
+            tool_calls=0,
+            lines_changed=0,
+            files_touched=[],
+            rep=rep,
+            cost_usd=0.05,
+        )
+
+    monkeypatch.setattr(tr.TaskRunner, "_run_inner", fake_inner)
+
+    class DummyTask:
+        id = "t"
+
+    result = runner.run(DummyTask(), Condition.NONE)
+
+    assert result.started_at is not None
+    assert result.finished_at is not None
+    assert result.started_at <= result.finished_at
+    assert result.cost_usd == 0.05

From e7071b6dbeaf5a9dee26e7632fa9ca863e5dd240 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Mon, 20 Apr 2026 16:25:49 -0700
Subject: [PATCH 19/21] fix kill races, LPT median bias, and silent
 reader-thread drops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review findings on a30b7aa:

  * claude_runner: extract _kill_and_reap helper that swallows OSError
    from races where the child exits between poll() and kill(). The
    bare proc.kill()/proc.wait() in _wait_for_proc could leak an
    exception that skipped the reader-thread joins below, leaving
    drains stuck on closed pipes.
  * claude_runner: warn when out_reader/err_reader join times out.
    A 5s join with no is_alive() check silently dropped partial
    stdout/stderr exactly at the diagnostic moment (idle-kill or
    hard-timeout) when the buffer is most useful.
  * cli: switch _load_durations_from_dir to statistics.median.
    Previous sorted(walls)[len(walls)//2] returned the upper-middle
    for even N — biased LPT predictions upward (e.g. 87s vs 52s
    on the four-task ansible smoke). New test exercises both odd
    and even N to lock this in.
  * cli: surface skipped trial JSONs from _load_durations_from_dir
    via stderr instead of swallowing per-file decode/IO errors.
    Partial corruption no longer silently biases LPT onto an
    unrepresentative subset.
  * cli: replace the U+26A0 unicode warning sign with plain "warn:"
    prefix per CLAUDE.md (no emoji unless requested).
  * claude_runner: tighten _wait_for_proc docstring — the safety
    property is "append-only of immutable-after-append dicts," not
    a general thread-safety guarantee.
---
 eval-harness/lib/claude_runner.py | 54 ++++++++++++++++++++++++++-----
 eval-harness/lib/cli.py           | 28 +++++++++++-----
 eval-harness/tests/test_cli.py    | 18 ++++++++---
 3 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/eval-harness/lib/claude_runner.py b/eval-harness/lib/claude_runner.py
index 6fb7abe..ee3dec7 100644
--- a/eval-harness/lib/claude_runner.py
+++ b/eval-harness/lib/claude_runner.py
@@ -1,5 +1,6 @@
 # lib/claude_runner.py
 from __future__ import annotations
+import logging
 import subprocess
 import threading
 import time
@@ -8,6 +9,8 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ClaudeResult:
@@ -226,6 +229,23 @@ def parse_stream_json_output(lines: list[str]) -> dict:
     }
 
 
+def _kill_and_reap(proc: subprocess.Popen) -> None:
+    """Kill `proc` and wait for it to exit, swallowing OSError from races
+    with a child that already terminated between our last poll and the kill.
+    Without this swallow, _wait_for_proc could leak an exception into
+    `run_claude` and skip the reader-thread joins below it, leaving stdout
+    and stderr drains stuck on closed pipes.
+    """
+    try:
+        proc.kill()
+    except OSError:
+        pass
+    try:
+        proc.wait()
+    except OSError:
+        pass
+
+
 def _wait_for_proc(
     proc: subprocess.Popen,
     start: float,
@@ -241,10 +261,14 @@ def _wait_for_proc(
     killed it for the hard wall-clock ceiling, or `"idle"` when we killed it
     because no new stream-json events arrived for `idle_timeout` seconds.
 
-    `stream_events` is read concurrently — it is appended from the stdout
-    reader thread. We only read the tail timestamp, which is safe because
-    list append and index access are atomic in CPython and the list is
-    monotonic (events are never removed).
+    Concurrency contract: `stream_events` is the same list that the stdout
+    reader thread appends to. We only read its length and the tail dict.
+    This is safe because (a) the list is append-only — nothing pops or
+    reassigns entries, and (b) each appended dict is constructed in full
+    by `_extract_stream_event` and never mutated afterwards. The safety
+    comes from the append-only-of-immutable-after-append discipline, not
+    from any general thread-safety guarantee about Python lists; if you
+    introduce a `pop()` or in-place mutation here, this contract breaks.
     """
     deadline = start + timeout
     while True:
@@ -252,16 +276,14 @@ def _wait_for_proc(
             return None
         now = time.time()
         if now >= deadline:
-            proc.kill()
-            proc.wait()
+            _kill_and_reap(proc)
             return "timeout"
         if idle_timeout and idle_timeout > 0:
             last_event_wall = (
                 start + stream_events[-1]["t"] if stream_events else start
             )
             if now - last_event_wall >= idle_timeout:
-                proc.kill()
-                proc.wait()
+                _kill_and_reap(proc)
                 return "idle"
         time.sleep(poll_interval)
 
@@ -435,8 +457,24 @@ def _drain_stderr(stream, log_file):
                 proc, start, timeout, idle_timeout, stream_events
             )
 
+            # If a reader thread hasn't drained within 5s after the process
+            # exits or is killed, warn loudly. Silent loss of stdout/stderr
+            # at exactly the diagnostic moment we'd most want it (an idle-
+            # kill or hard-timeout) is the worst kind of debug regression.
             out_reader.join(timeout=5)
+            if out_reader.is_alive():
+                logger.warning(
+                    "stdout reader did not drain within 5s "
+                    "(kill_reason=%s); partial stream_events may be missing",
+                    kill_reason,
+                )
             err_reader.join(timeout=5)
+            if err_reader.is_alive():
+                logger.warning(
+                    "stderr reader did not drain within 5s "
+                    "(kill_reason=%s); partial stderr may be missing",
+                    kill_reason,
+                )
             elapsed = time.time() - start
             metrics = parse_stream_json_output(stdout_lines)
 
diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
index ac79326..64048d2 100644
--- a/eval-harness/lib/cli.py
+++ b/eval-harness/lib/cli.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 import json
 import shutil
+import statistics
 import sys
 import tempfile
 import threading
@@ -107,19 +108,31 @@ def _load_durations_from_dir(results_dir: Path) -> dict[str, float]:
     if not trials_dir.is_dir():
         return {}
     by_task: dict[str, list[float]] = {}
+    skipped: list[tuple[str, str]] = []
     for f in trials_dir.glob("*.json"):
         try:
             d = json.loads(f.read_text())
-        except (json.JSONDecodeError, OSError):
+        except (json.JSONDecodeError, OSError) as e:
+            # Don't drop corrupt files silently — a partially corrupt prior
+            # results dir would bias LPT estimates onto an unrepresentative
+            # subset with no operator-visible trace.
+            skipped.append((f.name, str(e)))
             continue
         tid = d.get("task_id")
         wall = d.get("wall_clock_seconds")
         if tid and isinstance(wall, (int, float)) and wall > 0:
             by_task.setdefault(tid, []).append(float(wall))
-    return {
-        tid: sorted(walls)[len(walls) // 2]
-        for tid, walls in by_task.items()
-    }
+    if skipped:
+        click.echo(
+            f"warn: skipped {len(skipped)} unreadable trial JSON(s) in "
+            f"{trials_dir}: {', '.join(name for name, _ in skipped[:5])}"
+            f"{'…' if len(skipped) > 5 else ''}",
+            err=True,
+        )
+    # statistics.median is correct for both odd and even N; the previous
+    # `sorted(walls)[len(walls)//2]` returned the upper-middle for even N
+    # and biased predictions upward.
+    return {tid: statistics.median(walls) for tid, walls in by_task.items()}
 
 
 def _sort_lpt(
@@ -532,13 +545,12 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve
         durations = _load_durations_from_dir(src_dir)
         if not durations:
             click.echo(
-                f"\u26a0 --schedule lpt requested but no prior trial JSONs in "
+                f"warn: --schedule lpt requested but no prior trial JSONs in "
                 f"{src_dir}/trials/; falling back to fifo. Pass "
                 f"--prior-results-dir to point at an earlier output dir."
             )
         else:
-            sorted_durs = sorted(durations.values())
-            default_dur = sorted_durs[len(sorted_durs) // 2]
+            default_dur = statistics.median(durations.values())
             work_queue = _sort_lpt(work_queue, durations, default_dur)
             covered = sum(1 for item in work_queue if item[1].id in durations)
             click.echo(
diff --git a/eval-harness/tests/test_cli.py b/eval-harness/tests/test_cli.py
index d6ed2cb..a180799 100644
--- a/eval-harness/tests/test_cli.py
+++ b/eval-harness/tests/test_cli.py
@@ -152,19 +152,29 @@ def _write_trial(trials_dir: Path, name: str, task_id: str, wall: float) -> None
 
 
 def test_load_durations_from_dir_returns_median_per_task(tmp_path):
-    """Multiple trials per task → median wall_clock_seconds; bad data ignored."""
+    """Multiple trials per task → median wall_clock_seconds; bad data ignored.
+
+    Also exercises even-length inputs to lock in correct median behavior —
+    a previous implementation used `sorted(walls)[len(walls)//2]`, which
+    returns the upper-middle for even N and biased predictions upward.
+    """
     trials = tmp_path / "trials"
+    # Odd N=3 — true median = middle element
     _write_trial(trials, "alpha-r0", "alpha", 10.0)
-    _write_trial(trials, "alpha-r1", "alpha", 30.0)  # median pick
+    _write_trial(trials, "alpha-r1", "alpha", 30.0)
     _write_trial(trials, "alpha-r2", "alpha", 50.0)
-    _write_trial(trials, "beta-r0", "beta", 100.0)
+    # Even N=4 — true median = mean of middle two = (20 + 40) / 2 = 30.0
+    _write_trial(trials, "beta-r0", "beta", 10.0)
+    _write_trial(trials, "beta-r1", "beta", 20.0)
+    _write_trial(trials, "beta-r2", "beta", 40.0)
+    _write_trial(trials, "beta-r3", "beta", 100.0)
     # Skipped: pre-validation fails recorded as wall=0
     _write_trial(trials, "gamma-r0", "gamma", 0.0)
     # Skipped: corrupt JSON
     (trials / "broken.json").write_text("{not json")
 
     durations = _load_durations_from_dir(tmp_path)
-    assert durations == {"alpha": 30.0, "beta": 100.0}
+    assert durations == {"alpha": 30.0, "beta": 30.0}
 
 
 def test_load_durations_returns_empty_when_no_trials_dir(tmp_path):

From 250a20c10b2aca199ac445f1dae11d515844e1ed Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Sat, 23 May 2026 02:12:57 -0700
Subject: [PATCH 20/21] add knowledge-intake ingestion run report

Captures per-source fetched-item counts from the 2026-05-23 ingestion run:
pinboard 20, papers 9, github 100, email 0 (gog keyring timeout). Total 129.

Nightshift-Task: knowledge-ingest
Nightshift-Ref: https://github.com/marcus/nightshift
---
 ...26-05-23-knowledge-intake-ingestion-run.md | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 docs/reports/2026-05-23-knowledge-intake-ingestion-run.md

diff --git a/docs/reports/2026-05-23-knowledge-intake-ingestion-run.md b/docs/reports/2026-05-23-knowledge-intake-ingestion-run.md
new file mode 100644
index 0000000..fc453eb
--- /dev/null
+++ b/docs/reports/2026-05-23-knowledge-intake-ingestion-run.md
@@ -0,0 +1,57 @@
+# Knowledge Intake Ingestion Run — 2026-05-23
+
+Operational run of the `knowledge-intake` ingestion pipeline, capturing how many items each source returned.
+
+## Command
+
+```bash
+cd ~/dev/knowledge-intake && source .env && bun run src/cli.ts ingest
+```
+
+- **Run date**: 2026-05-23
+- **Repo**: `~/dev/knowledge-intake` (sibling to `intent-layer`)
+- **Runtime**: `bun`
+- **Exit code**: `0` (overall run succeeded; one source errored — see below)
+
+## Per-source results
+
+| Source     | Items fetched | Status |
+|------------|--------------:|--------|
+| pinboard   | 20            | OK |
+| papers     | 9             | OK |
+| github     | 100           | OK |
+| email      | 0             | Error — see below |
+| **Total**  | **129**       | 3 of 4 sources succeeded |
+
+The reported total of **129 new items** equals pinboard (20) + papers (9) + github (100). The `email` source contributed 0 items because it failed before fetching.
+
+## Source-level error: email
+
+The `email` source failed during a Gmail search via the `gog` CLI:
+
+```text
+[email] error: gog gmail search failed (exit 1): gmail options: token source: get token for ryan.orban@gmail.com: read token: keyring connection timed out after 10s while reading keyring item (macOS Keychain may be waiting for a permission prompt; run `gog auth list` from a terminal and click "Always Allow" when prompted); set GOG_KEYRING_BACKEND=file and GOG_KEYRING_PASSWORD=<password> to use encrypted file storage instead
+```
+
+**Cause**: The macOS Keychain read for the Gmail OAuth token timed out after 10s. This happens when the Keychain is waiting on an interactive "Always Allow" permission prompt that never gets answered in a non-interactive run.
+
+**Remediation** (either option):
+
+1. Run `gog auth list` from an interactive terminal and click **Always Allow** when macOS prompts for Keychain access, then re-run the ingestion.
+2. Switch `gog` to encrypted file-based token storage so no Keychain prompt is needed: set `GOG_KEYRING_BACKEND=file` and `GOG_KEYRING_PASSWORD=<password>` in the environment before running.
+
+This is an environment/auth issue local to the run host, not a code defect in the ingestion pipeline. The other three sources fetched normally.
+
+## Raw log
+
+```text
+$ cd ~/dev/knowledge-intake && source .env && bun run src/cli.ts ingest
+
+[pinboard] fetched 20 new items
+[papers] fetched 9 new items
+[github] fetched 100 new items
+[email] error: gog gmail search failed (exit 1): gmail options: token source: get token for ryan.orban@gmail.com: read token: keyring connection timed out after 10s while reading keyring item (macOS Keychain may be waiting for a permission prompt; run `gog auth list` from a terminal and click "Always Allow" when prompted); set GOG_KEYRING_BACKEND=file and GOG_KEYRING_PASSWORD=<password> to use encrypted file storage instead
+
+Ingestion complete: 129 new items
+EXIT_CODE=0
+```

From ea7e99d830b13551ac9ab776017c7af02c62e952 Mon Sep 17 00:00:00 2001
From: Ryan Orban <ryan@orban.dev>
Date: Mon, 25 May 2026 02:11:03 -0700
Subject: [PATCH 21/21] Add 2026-05-25 knowledge ingest report

Nightshift-Task: knowledge-ingest
Nightshift-Ref: https://github.com/marcus/nightshift
---
 ...26-05-25-knowledge-intake-ingestion-run.md | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 docs/reports/2026-05-25-knowledge-intake-ingestion-run.md

diff --git a/docs/reports/2026-05-25-knowledge-intake-ingestion-run.md b/docs/reports/2026-05-25-knowledge-intake-ingestion-run.md
new file mode 100644
index 0000000..19c3e0d
--- /dev/null
+++ b/docs/reports/2026-05-25-knowledge-intake-ingestion-run.md
@@ -0,0 +1,58 @@
+# Knowledge Intake Ingestion Run — 2026-05-25
+
+Operational run of the `knowledge-intake` ingestion pipeline, capturing how many items each source returned.
+
+## Command
+
+```bash
+cd ~/dev/knowledge-intake && source .env && bun run src/cli.ts ingest
+```
+
+- **Run date**: 2026-05-25
+- **Repo**: `~/dev/knowledge-intake` (sibling to `intent-layer`)
+- **Runtime**: `bun`
+- **Exit code**: `0` (overall run succeeded; one source errored — see below)
+
+## Per-source results
+
+| Source     | Items fetched | Status |
+|------------|--------------:|--------|
+| pinboard   | 0             | OK |
+| papers     | 0             | OK |
+| github     | 100           | OK |
+| email      | 0             | Error — see below |
+| **Total**  | **100**       | 3 of 4 sources succeeded |
+
+The reported total of **100 new items** equals pinboard (0) + papers (0) + github (100). The `email` source contributed 0 items because it failed before fetching.
+
+## Source-level error: email
+
+The `email` source failed during a Gmail search via the `gog` CLI:
+
+```text
+[email] error: gog gmail search failed (exit 1): gmail options: token source: get token for ryan.orban@gmail.com: read token: keyring connection timed out after 10s while reading keyring item (macOS Keychain may be waiting for a permission prompt; run `gog auth list` from a terminal and click "Always Allow" when prompted); set GOG_KEYRING_BACKEND=file and GOG_KEYRING_PASSWORD=<password> to use encrypted file storage instead
+```
+
+**Cause**: The macOS Keychain read for the Gmail OAuth token timed out after 10s. This happens when the Keychain is waiting on an interactive "Always Allow" permission prompt that never gets answered in a non-interactive run.
+
+**Remediation** (either option):
+
+1. Run `gog auth list` from an interactive terminal and click **Always Allow** when macOS prompts for Keychain access, then re-run the ingestion.
+2. Switch `gog` to encrypted file-based token storage so no Keychain prompt is needed: set `GOG_KEYRING_BACKEND=file` and `GOG_KEYRING_PASSWORD=<password>` in the environment before running.
+
+This is an environment/auth issue local to the run host, not a code defect in the ingestion pipeline. The other three sources fetched normally.
+
+## Raw log
+
+```text
+$ cd ~/dev/knowledge-intake && source .env && bun run src/cli.ts ingest
+
+[pinboard] fetched 0 new items
+[papers] fetched 0 new items
+[github] fetched 100 new items
+[email] error: gog gmail search failed (exit 1): gmail options: token source: get token for ryan.orban@gmail.com: read token: keyring connection timed out after 10s while reading keyring item (macOS Keychain may be waiting for a permission prompt; run `gog auth list` from a terminal and click "Always Allow" when prompted); set GOG_KEYRING_BACKEND=file and GOG_KEYRING_PASSWORD=<password> to use encrypted file storage instead
+
+Ingestion complete: 100 new items
+
+EXIT_CODE=0
+```