From 740d1c72952299777681e75856f0dd005c701089 Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Wed, 15 Apr 2026 13:27:50 -0700 Subject: [PATCH] add test-after-edit eval condition adds a fifth experimental condition that constrains the agent to run tests after every source-file edit before making further changes. isolates the effect of tight test-driven feedback loops from context-injection effects (none/flat_llm/intent_layer). wired through Condition enum, prompt builder, reporter display labels, and the run() CLI YAML_CONDITIONS list. test_condition_enum updated to cover the new variant. --- eval-harness/lib/cli.py | 2 +- eval-harness/lib/prompt_builder.py | 4 ++++ eval-harness/lib/reporter.py | 1 + eval-harness/lib/task_runner.py | 3 +++ eval-harness/tests/test_task_runner.py | 3 ++- 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py index accf6e0..6a7d7fe 100644 --- a/eval-harness/lib/cli.py +++ b/eval-harness/lib/cli.py @@ -493,7 +493,7 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve click.echo(f"Cleared index cache at {cache_dir}") # Determine conditions to run (HUMAN is AGENTbench-only, not used here) - YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER] + YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER, Condition.TEST_AFTER_EDIT] if condition: conditions = [Condition(c) for c in condition] else: diff --git a/eval-harness/lib/prompt_builder.py b/eval-harness/lib/prompt_builder.py index f6b4959..7aea700 100644 --- a/eval-harness/lib/prompt_builder.py +++ b/eval-harness/lib/prompt_builder.py @@ -16,6 +16,10 @@ """ +TEST_AFTER_EDIT_PREAMBLE = """CONSTRAINT: Every time you edit a source file, you must immediately run the relevant tests before making any further edits. Read the full test output and use it to decide your next action. Do not batch multiple edits before testing. + +""" + def build_prompt_from_commit_message(message: str, preamble: str | None = None) -> str: """Build a prompt from a git commit message.""" diff --git a/eval-harness/lib/reporter.py b/eval-harness/lib/reporter.py index d59a1bc..c71c911 100644 --- a/eval-harness/lib/reporter.py +++ b/eval-harness/lib/reporter.py @@ -29,6 +29,7 @@ class Reporter: "flat_llm": "Flat LLM", "intent_layer": "Intent Layer", "human": "Human", + "test_after_edit": "Test After Edit", } def __init__(self, output_dir: str): diff --git a/eval-harness/lib/task_runner.py b/eval-harness/lib/task_runner.py index 500f761..2cddd5f 100644 --- a/eval-harness/lib/task_runner.py +++ b/eval-harness/lib/task_runner.py @@ -25,6 +25,7 @@ build_prompt_from_issue, FLAT_PREAMBLE, INTENT_LAYER_PREAMBLE, + TEST_AFTER_EDIT_PREAMBLE, ) from lib.index_cache import IndexCache @@ -142,6 +143,7 @@ class Condition(Enum): FLAT_LLM = "flat_llm" INTENT_LAYER = "intent_layer" HUMAN = "human" + TEST_AFTER_EDIT = "test_after_edit" @dataclass @@ -1165,6 +1167,7 @@ def _build_prompt(self, task: Task, workspace: str, condition: Condition, cached Condition.NONE: None, Condition.FLAT_LLM: FLAT_PREAMBLE, Condition.INTENT_LAYER: INTENT_LAYER_PREAMBLE, + Condition.TEST_AFTER_EDIT: TEST_AFTER_EDIT_PREAMBLE, }[condition] if task.prompt_source == "commit_message": diff --git a/eval-harness/tests/test_task_runner.py b/eval-harness/tests/test_task_runner.py index 0f569b2..dd4484d 100644 --- a/eval-harness/tests/test_task_runner.py +++ b/eval-harness/tests/test_task_runner.py @@ -145,7 +145,8 @@ def test_condition_enum(): assert Condition.FLAT_LLM.value == "flat_llm" assert Condition.INTENT_LAYER.value == "intent_layer" assert Condition.HUMAN.value == "human" - assert len(Condition) == 4 + assert Condition.TEST_AFTER_EDIT.value == "test_after_edit" + assert len(Condition) == 5 def test_find_agents_files(sample_repo):