Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eval-harness/lib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ def run(tasks, parallel, category, output, keep_workspaces, dry_run, timeout, ve
click.echo(f"Cleared index cache at {cache_dir}")

# Determine conditions to run (HUMAN is AGENTbench-only, not used here)
YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER]
YAML_CONDITIONS = [Condition.NONE, Condition.FLAT_LLM, Condition.INTENT_LAYER, Condition.TEST_AFTER_EDIT]
if condition:
conditions = [Condition(c) for c in condition]
else:
Expand Down
4 changes: 4 additions & 0 deletions eval-harness/lib/prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@

"""

# Prompt preamble selected for the `test_after_edit` condition: it instructs
# the agent to run the relevant tests after every single source-file edit and
# to read the output before editing again (no batching of edits).
# The literal deliberately ends with a blank line; presumably this separates
# the preamble from the task text that follows it -- confirm in the
# build_prompt_* helpers.
TEST_AFTER_EDIT_PREAMBLE = """CONSTRAINT: Every time you edit a source file, you must immediately run the relevant tests before making any further edits. Read the full test output and use it to decide your next action. Do not batch multiple edits before testing.

"""


def build_prompt_from_commit_message(message: str, preamble: str | None = None) -> str:
"""Build a prompt from a git commit message."""
Expand Down
1 change: 1 addition & 0 deletions eval-harness/lib/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class Reporter:
"flat_llm": "Flat LLM",
"intent_layer": "Intent Layer",
"human": "Human",
"test_after_edit": "Test After Edit",
}

def __init__(self, output_dir: str):
Expand Down
3 changes: 3 additions & 0 deletions eval-harness/lib/task_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
build_prompt_from_issue,
FLAT_PREAMBLE,
INTENT_LAYER_PREAMBLE,
TEST_AFTER_EDIT_PREAMBLE,
)
from lib.index_cache import IndexCache

Expand Down Expand Up @@ -142,6 +143,7 @@ class Condition(Enum):
FLAT_LLM = "flat_llm"
INTENT_LAYER = "intent_layer"
HUMAN = "human"
TEST_AFTER_EDIT = "test_after_edit"


@dataclass
Expand Down Expand Up @@ -1165,6 +1167,7 @@ def _build_prompt(self, task: Task, workspace: str, condition: Condition, cached
Condition.NONE: None,
Condition.FLAT_LLM: FLAT_PREAMBLE,
Condition.INTENT_LAYER: INTENT_LAYER_PREAMBLE,
Condition.TEST_AFTER_EDIT: TEST_AFTER_EDIT_PREAMBLE,
}[condition]

if task.prompt_source == "commit_message":
Expand Down
3 changes: 2 additions & 1 deletion eval-harness/tests/test_task_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ def test_condition_enum():
assert Condition.FLAT_LLM.value == "flat_llm"
assert Condition.INTENT_LAYER.value == "intent_layer"
assert Condition.HUMAN.value == "human"
assert len(Condition) == 4
assert Condition.TEST_AFTER_EDIT.value == "test_after_edit"
assert len(Condition) == 5


def test_find_agents_files(sample_repo):
Expand Down