From 285c43937ef38e990c86c081aab3c0c058faa1fa Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Sat, 27 Jun 2026 18:20:42 -0400
Subject: [PATCH 1/2] Strip <think> blocks before evaluation

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 src/evaluation/evaluator.py            |  16 ++-
 src/evaluation/tests/test_evaluator.py | 147 +++++++------------------
 2 files changed, 53 insertions(+), 110 deletions(-)
diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
index 27845f08..b35870c0 100644
--- a/src/evaluation/evaluator.py
+++ b/src/evaluation/evaluator.py
@@ -9,6 +9,7 @@
 
 import json
 import logging
+import re
 from pathlib import Path
 
 from . import scorers as scorer_registry
@@ -64,7 +65,8 @@ def _score_one(
         scorer = self._resolve(name)
         self._validate_judge_model(name, traj)
         trajectory_text = _trajectory_to_text(traj)
-        score = scorer(scenario, traj.answer, trajectory_text)
+        answer = _strip_think_blocks(traj.answer)
+        score = scorer(scenario, answer, trajectory_text)
 
         return ScenarioResult(
             scenario_id=scenario.id,
@@ -73,7 +75,7 @@ def _score_one(
             runner=traj.runner,
             model=traj.model,
             question=traj.question,
-            answer=traj.answer,
+            answer=answer,
             score=score,
             ops=metrics_from_trajectory(traj),
         )
@@ -108,6 +110,16 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str:
         return str(traj.trajectory)
 
 
+def _strip_think_blocks(answer: str) -> str:
+    """Return *answer* with every <think>...</think> block removed and trimmed."""
+    if not answer:
+        return ""  # None/empty answer: nothing to strip, avoid TypeError in re.sub
+    # Tags match case-insensitively, may carry attributes, and may span lines.
+    pattern = r"<think\b[^>]*>.*?</think>"
+    cleaned = re.sub(pattern, "", answer, flags=re.IGNORECASE | re.DOTALL)
+    return cleaned.strip()
+
+
 def _normalize_model_id(model_id: str | None) -> str:
     if not model_id:
         return ""
diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py
index 5cab8665..f101361b 100644
--- a/src/evaluation/tests/test_evaluator.py
+++ b/src/evaluation/tests/test_evaluator.py
@@ -35,135 +35,66 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor
     assert report.results[0].score.scorer == "stub-evaluator"
 
 
-def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
-    return ScorerResult(scorer="fail-default", passed=False, score=0.0)
-
-
-def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record):
-    # The scenario-level scoring_method must route around the default
-    # scorer, even when the default scorer would reject the answer.
-    rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text")
-    (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
-
-    scenarios_path = tmp_path / "scenarios.json"
-    scenarios_path.write_text(
-        json.dumps(
-            [
-                {
-                    "id": 1,
-                    "text": "Q",
-                    "type": "tsfm",
-                    "scoring_method": "stub-evaluator",
-                }
-            ]
-        ),
-        encoding="utf-8",
-    )
-
-    registry.register("stub-evaluator", _stub_scorer)
-    registry.register("fail-default", _fail_scorer)
-
-    report = Evaluator(default_scorer="fail-default").evaluate(
-        trajectories_path=tmp_path,
-        scenarios_paths=[scenarios_path],
-    )
-
-    assert report.totals["passed"] == 1
-    assert report.results[0].score.scorer == "stub-evaluator"
-
+def test_evaluator_strips_think_blocks_before_scoring(
+    tmp_path: Path, make_persisted_record
+):
+    seen: dict[str, str] = {}
 
-def test_evaluator_rejects_self_judging_model(tmp_path: Path, make_persisted_record):
-    trajectories_dir = tmp_path / "trajectories"
-    trajectories_dir.mkdir()
+    def capture_scorer(
+        scenario: Scenario, answer: str, trajectory_text: str
+    ) -> ScorerResult:
+        seen["answer"] = answer
+        return ScorerResult(scorer="capture-evaluator", passed=True, score=1.0)
 
     rec = make_persisted_record(
         run_id="run-1",
         scenario_id=1,
-        model="litellm_proxy/aws/claude-opus-4-6",
+        answer=(
+            "<think>I should inspect the work orders.</think>\n\n"
+            "<think>There are no kit entries.</think>\n\n"
+            "0"
+        ),
     )
-    (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
+    (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
 
     scenarios_path = tmp_path / "scenarios.json"
     scenarios_path.write_text(
-        json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
+        json.dumps([{"id": 1, "text": "Q", "type": "wo"}]),
         encoding="utf-8",
     )
 
-    registry.register("llm_judge", _stub_scorer)
-
-    try:
-        Evaluator(
-            default_scorer="llm_judge",
-            judge_model="litellm_proxy/aws/claude-opus-4-6",
-        ).evaluate(
-            trajectories_path=trajectories_dir,
-            scenarios_paths=[scenarios_path],
-        )
-    except ValueError as exc:
-        assert "self-judging is not allowed" in str(exc)
-    else:
-        raise AssertionError("expected ValueError for self-judging")
-
-
-def test_evaluator_rejects_self_judging_with_normalized_model_ids(
-    tmp_path: Path, make_persisted_record
-):
-    trajectories_dir = tmp_path / "trajectories"
-    trajectories_dir.mkdir()
+    registry.register("capture-evaluator", capture_scorer)
 
-    rec = make_persisted_record(
-        run_id="run-1",
-        scenario_id=1,
-        model="litellm_proxy/aws/claude-opus-4-6",
+    report = Evaluator(default_scorer="capture-evaluator").evaluate(
+        trajectories_path=tmp_path,
+        scenarios_paths=[scenarios_path],
     )
-    (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
 
-    scenarios_path = tmp_path / "scenarios.json"
-    scenarios_path.write_text(
-        json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
-        encoding="utf-8",
-    )
+    assert seen["answer"] == "0"
+    assert report.results[0].answer == "0"
 
-    registry.register("llm_judge", _stub_scorer)
 
-    try:
-        Evaluator(
-            default_scorer="llm_judge",
-            judge_model="aws/claude-opus-4-6",
-        ).evaluate(
-            trajectories_path=trajectories_dir,
-            scenarios_paths=[scenarios_path],
-        )
-    except ValueError as exc:
-        assert "self-judging is not allowed" in str(exc)
-    else:
-        raise AssertionError("expected ValueError for self-judging")
+def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
+    return ScorerResult(scorer="fail-default", passed=False, score=0.0)
 
 
-def test_evaluator_allows_non_llm_judge_even_with_matching_model(
-    tmp_path: Path, make_persisted_record
-):
-    rec = make_persisted_record(
-        run_id="run-1",
-        scenario_id=1,
-        model="litellm_proxy/aws/claude-opus-4-6",
-    )
+def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record):
+    # The scenario-level scoring_method must route around the default
+    # scorer, even when the default scorer would reject the answer.
+    rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text")
     (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
 
     scenarios_path = tmp_path / "scenarios.json"
     scenarios_path.write_text(
-        json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
+        json.dumps(
+            [
+                {
+                    "id": 1,
+                    "text": "Q",
+                    "type": "tsfm",
+                    "scoring_method": "stub-evaluator",
+                }
+            ]
+        ),
         encoding="utf-8",
-    )
-
-    registry.register("stub-evaluator", _stub_scorer)
-
-    report = Evaluator(
-        default_scorer="stub-evaluator",
-        judge_model="aws/claude-opus-4-6",
-    ).evaluate(
-        trajectories_path=tmp_path,
-        scenarios_paths=[scenarios_path],
-    )
-
-    assert report.totals["passed"] == 1
+    )
\ No newline at end of file

From fee20175ecd06867004509d3e5940efdb8aa007e Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Sat, 27 Jun 2026 18:36:37 -0400
Subject: [PATCH 2/2] Restore evaluator tests dropped by the <think>-stripping patch

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 src/evaluation/evaluator.py            |   9 +-
 src/evaluation/tests/test_evaluator.py | 112 ++++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
index b35870c0..9beb1cc8 100644
--- a/src/evaluation/evaluator.py
+++ b/src/evaluation/evaluator.py
@@ -28,12 +28,7 @@
 
 
 class Evaluator:
-    """Run a batch of scenarios against their saved trajectories.
-
-    ``default_scorer`` names the registered scorer to use when a
-    scenario does not set ``scoring_method``.  Per-scenario overrides
-    take precedence.
-    """
+    """Run a batch of scenarios against their saved trajectories."""
 
     def __init__(
         self,
@@ -126,4 +121,4 @@ def _normalize_model_id(model_id: str | None) -> str:
     normalized = model_id.strip()
     if normalized.startswith("litellm_proxy/"):
         normalized = normalized[len("litellm_proxy/") :]
-    return normalized
+    return normalized
diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py
index f101361b..b1109213 100644
--- a/src/evaluation/tests/test_evaluator.py
+++ b/src/evaluation/tests/test_evaluator.py
@@ -79,8 +79,6 @@ def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> Score
 
 
 def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record):
-    # The scenario-level scoring_method must route around the default
-    # scorer, even when the default scorer would reject the answer.
     rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text")
     (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
 
@@ -97,4 +95,112 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec
             ]
         ),
         encoding="utf-8",
-    )
\ No newline at end of file
+    )
+
+    registry.register("stub-evaluator", _stub_scorer)
+    registry.register("fail-default", _fail_scorer)
+
+    report = Evaluator(default_scorer="fail-default").evaluate(
+        trajectories_path=tmp_path,
+        scenarios_paths=[scenarios_path],
+    )
+
+    assert report.totals["passed"] == 1
+    assert report.results[0].score.scorer == "stub-evaluator"
+
+
+def test_evaluator_rejects_self_judging_model(tmp_path: Path, make_persisted_record):
+    trajectories_dir = tmp_path / "trajectories"
+    trajectories_dir.mkdir()
+
+    rec = make_persisted_record(
+        run_id="run-1",
+        scenario_id=1,
+        model="litellm_proxy/aws/claude-opus-4-6",
+    )
+    (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
+
+    scenarios_path = tmp_path / "scenarios.json"
+    scenarios_path.write_text(
+        json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
+        encoding="utf-8",
+    )
+
+    registry.register("llm_judge", _stub_scorer)
+
+    try:
+        Evaluator(
+            default_scorer="llm_judge",
+            judge_model="litellm_proxy/aws/claude-opus-4-6",
+        ).evaluate(
+            trajectories_path=trajectories_dir,
+            scenarios_paths=[scenarios_path],
+        )
+    except ValueError as exc:
+        assert "self-judging is not allowed" in str(exc)
+    else:
+        raise AssertionError("expected ValueError for self-judging")
+
+
+def test_evaluator_rejects_self_judging_with_normalized_model_ids(
+    tmp_path: Path, make_persisted_record
+):
+    trajectories_dir = tmp_path / "trajectories"
+    trajectories_dir.mkdir()
+
+    rec = make_persisted_record(
+        run_id="run-1",
+        scenario_id=1,
+        model="litellm_proxy/aws/claude-opus-4-6",
+    )
+    (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
+
+    scenarios_path = tmp_path / "scenarios.json"
+    scenarios_path.write_text(
+        json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
+        encoding="utf-8",
+    )
+
+    registry.register("llm_judge", _stub_scorer)
+
+    try:
+        Evaluator(
+            default_scorer="llm_judge",
+            judge_model="aws/claude-opus-4-6",
+        ).evaluate(
+            trajectories_path=trajectories_dir,
+            scenarios_paths=[scenarios_path],
+        )
+    except ValueError as exc:
+        assert "self-judging is not allowed" in str(exc)
+    else:
+        raise AssertionError("expected ValueError for self-judging")
+
+
+def test_evaluator_allows_non_llm_judge_even_with_matching_model(
+    tmp_path: Path, make_persisted_record
+):
+    rec = make_persisted_record(
+        run_id="run-1",
+        scenario_id=1,
+        model="litellm_proxy/aws/claude-opus-4-6",
+    )
+    (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8")
+
+    scenarios_path = tmp_path / "scenarios.json"
+    scenarios_path.write_text(
+        json.dumps([{"id": 1, "text": "Q", "type": "iot"}]),
+        encoding="utf-8",
+    )
+
+    registry.register("stub-evaluator", _stub_scorer)
+
+    report = Evaluator(
+        default_scorer="stub-evaluator",
+        judge_model="aws/claude-opus-4-6",
+    ).evaluate(
+        trajectories_path=tmp_path,
+        scenarios_paths=[scenarios_path],
+    )
+
+    assert report.totals["passed"] == 1