diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
index 27845f08..9beb1cc8 100644
--- a/src/evaluation/evaluator.py
+++ b/src/evaluation/evaluator.py
@@ -9,6 +9,7 @@
import json
import logging
+import re
from pathlib import Path
from . import scorers as scorer_registry
@@ -27,12 +28,7 @@
class Evaluator:
- """Run a batch of scenarios against their saved trajectories.
-
- ``default_scorer`` names the registered scorer to use when a
- scenario does not set ``scoring_method``. Per-scenario overrides
- take precedence.
- """
+ """Run a batch of scenarios against their saved trajectories."""
def __init__(
self,
@@ -64,7 +60,8 @@ def _score_one(
scorer = self._resolve(name)
self._validate_judge_model(name, traj)
trajectory_text = _trajectory_to_text(traj)
- score = scorer(scenario, traj.answer, trajectory_text)
+ answer = _strip_think_blocks(traj.answer)
+ score = scorer(scenario, answer, trajectory_text)
return ScenarioResult(
scenario_id=scenario.id,
@@ -73,7 +70,7 @@ def _score_one(
runner=traj.runner,
model=traj.model,
question=traj.question,
- answer=traj.answer,
+ answer=answer,
score=score,
ops=metrics_from_trajectory(traj),
)
@@ -108,10 +105,20 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str:
return str(traj.trajectory)
+def _strip_think_blocks(answer: str) -> str:
+    """Remove model-private <think>...</think> blocks before scoring."""
+    return re.sub(
+        r"<think\b[^>]*>.*?</think>",  # lazy: each block is removed on its own
+        "",
+        answer,
+        flags=re.IGNORECASE | re.DOTALL,  # any tag case; blocks may span lines
+    ).strip()
+
+
def _normalize_model_id(model_id: str | None) -> str:
if not model_id:
return ""
normalized = model_id.strip()
if normalized.startswith("litellm_proxy/"):
normalized = normalized[len("litellm_proxy/") :]
- return normalized
+ return normalized
\ No newline at end of file
diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py
index 5cab8665..b1109213 100644
--- a/src/evaluation/tests/test_evaluator.py
+++ b/src/evaluation/tests/test_evaluator.py
@@ -35,13 +35,50 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor
assert report.results[0].score.scorer == "stub-evaluator"
+def test_evaluator_strips_think_blocks_before_scoring(
+    tmp_path: Path, make_persisted_record
+):
+    seen: dict[str, str] = {}  # records the answer handed to the scorer
+
+    def capture_scorer(
+        scenario: Scenario, answer: str, trajectory_text: str
+    ) -> ScorerResult:
+        seen["answer"] = answer
+        return ScorerResult(scorer="capture-evaluator", passed=True, score=1.0)
+
+    rec = make_persisted_record(
+        run_id="run-1",
+        scenario_id=1,
+        answer=(  # two think blocks followed by the actual answer
+            "<think>I should inspect the work orders.</think>\n\n"
+            "<think>There are no kit entries.</think>\n\n"
+            "0"
+        ),
+    )
+    (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
+
+    scenarios_path = tmp_path / "scenarios.json"
+    scenarios_path.write_text(
+        json.dumps([{"id": 1, "text": "Q", "type": "wo"}]),
+        encoding="utf-8",
+    )
+
+    registry.register("capture-evaluator", capture_scorer)
+
+    report = Evaluator(default_scorer="capture-evaluator").evaluate(
+        trajectories_path=tmp_path,
+        scenarios_paths=[scenarios_path],
+    )
+
+    assert seen["answer"] == "0"  # scorer input has the blocks removed
+    assert report.results[0].answer == "0"  # so does the reported answer
+
+
def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
return ScorerResult(scorer="fail-default", passed=False, score=0.0)
def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record):
- # The scenario-level scoring_method must route around the default
- # scorer, even when the default scorer would reject the answer.
rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text")
(tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
@@ -166,4 +203,4 @@ def test_evaluator_allows_non_llm_judge_even_with_matching_model(
scenarios_paths=[scenarios_path],
)
- assert report.totals["passed"] == 1
+ assert report.totals["passed"] == 1
\ No newline at end of file