Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions src/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import json
import logging
import re
from pathlib import Path

from . import scorers as scorer_registry
Expand All @@ -27,12 +28,7 @@


class Evaluator:
"""Run a batch of scenarios against their saved trajectories.

``default_scorer`` names the registered scorer to use when a
scenario does not set ``scoring_method``. Per-scenario overrides
take precedence.
"""
"""Run a batch of scenarios against their saved trajectories."""

def __init__(
self,
Expand Down Expand Up @@ -64,7 +60,8 @@ def _score_one(
scorer = self._resolve(name)
self._validate_judge_model(name, traj)
trajectory_text = _trajectory_to_text(traj)
score = scorer(scenario, traj.answer, trajectory_text)
answer = _strip_think_blocks(traj.answer)
score = scorer(scenario, answer, trajectory_text)

return ScenarioResult(
scenario_id=scenario.id,
Expand All @@ -73,7 +70,7 @@ def _score_one(
runner=traj.runner,
model=traj.model,
question=traj.question,
answer=traj.answer,
answer=answer,
score=score,
ops=metrics_from_trajectory(traj),
)
Expand Down Expand Up @@ -108,10 +105,20 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str:
return str(traj.trajectory)


def _strip_think_blocks(answer: str) -> str:
"""Remove model-private <think>...</think> blocks before scoring."""
return re.sub(
r"<think\b[^>]*>.*?</think>",
"",
answer,
flags=re.IGNORECASE | re.DOTALL,
).strip()


def _normalize_model_id(model_id: str | None) -> str:
if not model_id:
return ""
normalized = model_id.strip()
if normalized.startswith("litellm_proxy/"):
normalized = normalized[len("litellm_proxy/") :]
return normalized
return normalized
43 changes: 40 additions & 3 deletions src/evaluation/tests/test_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,50 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor
assert report.results[0].score.scorer == "stub-evaluator"


def test_evaluator_strips_think_blocks_before_scoring(
    tmp_path: Path, make_persisted_record
):
    """The scorer and the reported result both get the answer without think blocks."""
    captured: dict[str, str] = {}

    def capture_scorer(
        scenario: Scenario, answer: str, trajectory_text: str
    ) -> ScorerResult:
        # Record the answer exactly as the evaluator passes it to the scorer.
        captured["answer"] = answer
        return ScorerResult(scorer="capture-evaluator", passed=True, score=1.0)

    # Saved answer: two think blocks followed by the actual answer "0".
    raw_answer = (
        "<think>I should inspect the work orders.</think>\n\n"
        "<think>There are no kit entries.</think>\n\n"
        "0"
    )
    record = make_persisted_record(
        run_id="run-1",
        scenario_id=1,
        answer=raw_answer,
    )
    trajectory_file = tmp_path / "run-1.json"
    trajectory_file.write_text(json.dumps(record), encoding="utf-8")

    scenarios_path = tmp_path / "scenarios.json"
    scenario_rows = [{"id": 1, "text": "Q", "type": "wo"}]
    scenarios_path.write_text(json.dumps(scenario_rows), encoding="utf-8")

    registry.register("capture-evaluator", capture_scorer)

    evaluator = Evaluator(default_scorer="capture-evaluator")
    report = evaluator.evaluate(
        trajectories_path=tmp_path,
        scenarios_paths=[scenarios_path],
    )

    assert captured["answer"] == "0"
    assert report.results[0].answer == "0"


def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
    """Stub scorer that ignores its arguments and always returns a failing result."""
    failing = ScorerResult(scorer="fail-default", passed=False, score=0.0)
    return failing


def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record):
# The scenario-level scoring_method must route around the default
# scorer, even when the default scorer would reject the answer.
rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text")
(tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")

Expand Down Expand Up @@ -166,4 +203,4 @@ def test_evaluator_allows_non_llm_judge_even_with_matching_model(
scenarios_paths=[scenarios_path],
)

assert report.totals["passed"] == 1
assert report.totals["passed"] == 1