IBM · DhavalRepo18 · Jun 28, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
@@ -9,6 +9,7 @@
 
 import json
 import logging
+import re
 from pathlib import Path
 
 from . import scorers as scorer_registry
@@ -27,12 +28,7 @@
 
 
 class Evaluator:
-    """Run a batch of scenarios against their saved trajectories.
-
-    ``default_scorer`` names the registered scorer to use when a
-    scenario does not set ``scoring_method``.  Per-scenario overrides
-    take precedence.
-    """
+    """Run a batch of scenarios against their saved trajectories."""
 
     def __init__(
         self,
@@ -64,7 +60,8 @@ def _score_one(
         scorer = self._resolve(name)
         self._validate_judge_model(name, traj)
         trajectory_text = _trajectory_to_text(traj)
-        score = scorer(scenario, traj.answer, trajectory_text)
+        answer = _strip_think_blocks(traj.answer)
+        score = scorer(scenario, answer, trajectory_text)
 
         return ScenarioResult(
             scenario_id=scenario.id,
@@ -73,7 +70,7 @@ def _score_one(
             runner=traj.runner,
             model=traj.model,
             question=traj.question,
-            answer=traj.answer,
+            answer=answer,
             score=score,
             ops=metrics_from_trajectory(traj),
         )
@@ -108,10 +105,20 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str:
         return str(traj.trajectory)
 
 
+# Well-formed reasoning block: an opening tag (attributes allowed), the
+# shortest possible body, and its closing tag.
+_PAIRED_THINK_RE = re.compile(
+    r"<think\b[^>]*>.*?</think>", re.IGNORECASE | re.DOTALL
+)
+# Closing tag with no opening tag left in front of it.  This happens when the
+# chat template pre-fills "<think>" so the model only emits the closing tag,
+# and as the residue of nested blocks once the inner pair has been removed.
+_ORPHAN_CLOSE_RE = re.compile(r"\A.*</think>", re.IGNORECASE | re.DOTALL)
+# Opening tag that is never closed, e.g. generation was cut off mid-reasoning.
+_UNCLOSED_OPEN_RE = re.compile(
+    r"<think\b[^>]*>.*\Z", re.IGNORECASE | re.DOTALL
+)
+
+
+def _strip_think_blocks(answer: str | None) -> str:
+    """Remove model-private ``<think>`` reasoning from an answer before scoring.
+
+    Tag matching is case-insensitive and spans newlines.  Three shapes of
+    reasoning text are dropped, in this order:
+
+    1. complete ``<think ...>...</think>`` blocks;
+    2. everything up to and including a remaining ``</think>`` that has no
+       opening tag;
+    3. everything from a remaining, never-closed ``<think ...>`` to the end.
+
+    Args:
+        answer: Raw answer text.  ``None`` or ``""`` is accepted.
+
+    Returns:
+        The answer with reasoning removed and surrounding whitespace
+        stripped; ``""`` when ``answer`` is ``None`` or empty.
+    """
+    if not answer:
+        # Avoid a TypeError from the regex engine on a missing answer.
+        return ""
+    visible = _PAIRED_THINK_RE.sub("", answer)
+    visible = _ORPHAN_CLOSE_RE.sub("", visible, count=1)
+    visible = _UNCLOSED_OPEN_RE.sub("", visible, count=1)
+    return visible.strip()
+
+
 def _normalize_model_id(model_id: str | None) -> str:
+    """Return a normalized form of ``model_id``.
+
+    A falsy ``model_id`` (``None`` or ``""``) yields ``""``.  Otherwise
+    surrounding whitespace is stripped and a leading ``litellm_proxy/``
+    prefix, if present, is removed.
+    """
     if not model_id:
         return ""
     normalized = model_id.strip()
     if normalized.startswith("litellm_proxy/"):
         normalized = normalized[len("litellm_proxy/") :]
-    return normalized
+    return normalized
diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py
@@ -35,13 +35,50 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor
     assert report.results[0].score.scorer == "stub-evaluator"
 
 
+def test_evaluator_strips_think_blocks_before_scoring(
+    tmp_path: Path, make_persisted_record
+):
+    """Scorer input and stored result both exclude ``<think>`` blocks."""
+    captured: dict[str, str] = {}
+
+    def capture_scorer(
+        scenario: Scenario, answer: str, trajectory_text: str
+    ) -> ScorerResult:
+        # Record exactly what the evaluator hands to the scorer.
+        captured["answer"] = answer
+        return ScorerResult(scorer="capture-evaluator", passed=True, score=1.0)
+
+    registry.register("capture-evaluator", capture_scorer)
+
+    # One scenario whose id matches the persisted record written below.
+    scenario_file = tmp_path / "scenarios.json"
+    scenario_payload = json.dumps([{"id": 1, "text": "Q", "type": "wo"}])
+    scenario_file.write_text(scenario_payload, encoding="utf-8")
+
+    # Two reasoning blocks followed by the real answer.
+    raw_answer = (
+        "<think>I should inspect the work orders.</think>\n\n"
+        "<think>There are no kit entries.</think>\n\n"
+        "0"
+    )
+    record = make_persisted_record(
+        run_id="run-1", scenario_id=1, answer=raw_answer
+    )
+    record_file = tmp_path / "run-1.json"
+    record_file.write_text(json.dumps(record), encoding="utf-8")
+
+    evaluator = Evaluator(default_scorer="capture-evaluator")
+    report = evaluator.evaluate(
+        trajectories_path=tmp_path,
+        scenarios_paths=[scenario_file],
+    )
+
+    assert captured["answer"] == "0"
+    assert report.results[0].answer == "0"
+
+
 def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
+    """Scorer stub that ignores its inputs and always returns a failing result."""
     return ScorerResult(scorer="fail-default", passed=False, score=0.0)
 
 
 def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record):
-    # The scenario-level scoring_method must route around the default
-    # scorer, even when the default scorer would reject the answer.
     rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text")
     (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
 
@@ -166,4 +203,4 @@ def test_evaluator_allows_non_llm_judge_even_with_matching_model(
         scenarios_paths=[scenarios_path],
     )
 
-    assert report.totals["passed"] == 1
+    assert report.totals["passed"] == 1