diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index 27845f08..9beb1cc8 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -9,6 +9,7 @@ import json import logging +import re from pathlib import Path from . import scorers as scorer_registry @@ -27,12 +28,7 @@ class Evaluator: - """Run a batch of scenarios against their saved trajectories. - - ``default_scorer`` names the registered scorer to use when a - scenario does not set ``scoring_method``. Per-scenario overrides - take precedence. - """ + """Run a batch of scenarios against their saved trajectories.""" def __init__( self, @@ -64,7 +60,8 @@ def _score_one( scorer = self._resolve(name) self._validate_judge_model(name, traj) trajectory_text = _trajectory_to_text(traj) - score = scorer(scenario, traj.answer, trajectory_text) + answer = _strip_think_blocks(traj.answer) + score = scorer(scenario, answer, trajectory_text) return ScenarioResult( scenario_id=scenario.id, @@ -73,7 +70,7 @@ def _score_one( runner=traj.runner, model=traj.model, question=traj.question, - answer=traj.answer, + answer=answer, score=score, ops=metrics_from_trajectory(traj), ) @@ -108,10 +105,20 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str: return str(traj.trajectory) +def _strip_think_blocks(answer: str) -> str: + """Remove model-private <think>...</think>
blocks before scoring.""" + return re.sub( + r"<think\b[^>]*>.*?</think>", + "", + answer, + flags=re.IGNORECASE | re.DOTALL, + ).strip() + + def _normalize_model_id(model_id: str | None) -> str: if not model_id: return "" normalized = model_id.strip() if normalized.startswith("litellm_proxy/"): normalized = normalized[len("litellm_proxy/") :] - return normalized + return normalized \ No newline at end of file diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index 5cab8665..b1109213 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -35,13 +35,50 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor assert report.results[0].score.scorer == "stub-evaluator" +def test_evaluator_strips_think_blocks_before_scoring( + tmp_path: Path, make_persisted_record +): + seen: dict[str, str] = {} + + def capture_scorer( + scenario: Scenario, answer: str, trajectory_text: str + ) -> ScorerResult: + seen["answer"] = answer + return ScorerResult(scorer="capture-evaluator", passed=True, score=1.0) + + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + answer=( + "<think>I should inspect the work orders.</think>\n\n" + "<think>There are no kit entries.</think>\n\n" + "0" + ), + ) + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "wo"}]), + encoding="utf-8", + ) + + registry.register("capture-evaluator", capture_scorer) + + report = Evaluator(default_scorer="capture-evaluator").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert seen["answer"] == "0" + assert report.results[0].answer == "0" + + def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: return ScorerResult(scorer="fail-default", passed=False, score=0.0) def test_evaluator_per_scenario_override_wins(tmp_path: Path, 
make_persisted_record): - # The scenario-level scoring_method must route around the default - # scorer, even when the default scorer would reject the answer. rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") @@ -166,4 +203,4 @@ def test_evaluator_allows_non_llm_judge_even_with_matching_model( scenarios_paths=[scenarios_path], ) - assert report.totals["passed"] == 1 + assert report.totals["passed"] == 1 \ No newline at end of file