diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py index accf6e0..6a7d7fe 100644 --- a/eval-harness/lib/cli.py +++ b/eval-harness/lib/cli.py @@ -493,7 +493,7 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve click.echo(f"Cleared index cache at {cache_dir}") # Determine conditions to run (HUMAN is AGENTbench-only, not used here) - YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER] + YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER, Condition.TEST_AFTER_EDIT] if condition: conditions = [Condition(c) for c in condition] else: diff --git a/eval-harness/lib/prompt_builder.py b/eval-harness/lib/prompt_builder.py index f6b4959..7aea700 100644 --- a/eval-harness/lib/prompt_builder.py +++ b/eval-harness/lib/prompt_builder.py @@ -16,6 +16,10 @@ """ +TEST_AFTER_EDIT_PREAMBLE = """CONSTRAINT: Every time you edit a source file, you must immediately run the relevant tests before making any further edits. Read the full test output and use it to decide your next action. Do not batch multiple edits before testing. + +""" + def build_prompt_from_commit_message(message: str, preamble: str | None = None) -> str: """Build a prompt from a git commit message.""" diff --git a/eval-harness/lib/reporter.py b/eval-harness/lib/reporter.py index d59a1bc..c71c911 100644 --- a/eval-harness/lib/reporter.py +++ b/eval-harness/lib/reporter.py @@ -29,6 +29,7 @@ class Reporter: "flat_llm": "Flat LLM", "intent_layer": "Intent Layer", "human": "Human", + "test_after_edit": "Test After Edit", } def __init__(self, output_dir: str): diff --git a/eval-harness/lib/task_runner.py b/eval-harness/lib/task_runner.py index 500f761..2cddd5f 100644 --- a/eval-harness/lib/task_runner.py +++ b/eval-harness/lib/task_runner.py @@ -25,6 +25,7 @@ build_prompt_from_issue, FLAT_PREAMBLE, INTENT_LAYER_PREAMBLE, + TEST_AFTER_EDIT_PREAMBLE, ) from lib.index_cache import IndexCache @@ -142,6 +143,7 @@ class Condition(Enum): FLAT_LLM = "flat_llm" INTENT_LAYER = "intent_layer" HUMAN = "human" + TEST_AFTER_EDIT = "test_after_edit" @dataclass @@ -1165,6 +1167,7 @@ def _build_prompt(self, task: Task, workspace: str, condition: Condition, cached Condition.NONE: None, Condition.FLAT_LLM: FLAT_PREAMBLE, Condition.INTENT_LAYER: INTENT_LAYER_PREAMBLE, + Condition.TEST_AFTER_EDIT: TEST_AFTER_EDIT_PREAMBLE, }[condition] if task.prompt_source == "commit_message": diff --git a/eval-harness/tests/test_task_runner.py b/eval-harness/tests/test_task_runner.py index 0f569b2..dd4484d 100644 --- a/eval-harness/tests/test_task_runner.py +++ b/eval-harness/tests/test_task_runner.py @@ -145,7 +145,8 @@ def test_condition_enum(): assert Condition.FLAT_LLM.value == "flat_llm" assert Condition.INTENT_LAYER.value == "intent_layer" assert Condition.HUMAN.value == "human" - assert len(Condition) == 4 + assert Condition.TEST_AFTER_EDIT.value == "test_after_edit" + assert len(Condition) == 5 def test_find_agents_files(sample_repo):