From 285c43937ef38e990c86c081aab3c0c058faa1fa Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Sat, 27 Jun 2026 18:20:42 -0400 Subject: [PATCH 1/2] Strip blocks before evaluation Signed-off-by: Chathurangi Shyalika --- src/evaluation/evaluator.py | 16 ++- src/evaluation/tests/test_evaluator.py | 147 +++++++------------------ 2 files changed, 53 insertions(+), 110 deletions(-) diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index 27845f08..b35870c0 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -9,6 +9,7 @@ import json import logging +import re from pathlib import Path from . import scorers as scorer_registry @@ -64,7 +65,8 @@ def _score_one( scorer = self._resolve(name) self._validate_judge_model(name, traj) trajectory_text = _trajectory_to_text(traj) - score = scorer(scenario, traj.answer, trajectory_text) + answer = _strip_think_blocks(traj.answer) + score = scorer(scenario, answer, trajectory_text) return ScenarioResult( scenario_id=scenario.id, @@ -73,7 +75,7 @@ def _score_one( runner=traj.runner, model=traj.model, question=traj.question, - answer=traj.answer, + answer=answer, score=score, ops=metrics_from_trajectory(traj), ) @@ -108,6 +110,16 @@ def _trajectory_to_text(traj: PersistedTrajectory) -> str: return str(traj.trajectory) +def _strip_think_blocks(answer: str) -> str: + """Remove model-private ... blocks before scoring.""" + return re.sub( + r"]*>.*?", + "", + answer, + flags=re.IGNORECASE | re.DOTALL, + ).strip() + + def _normalize_model_id(model_id: str | None) -> str: if not model_id: return "" diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index 5cab8665..f101361b 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -35,135 +35,66 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor assert report.results[0].score.scorer == "stub-evaluator" -def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: - return ScorerResult(scorer="fail-default", passed=False, score=0.0) - - -def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): - # The scenario-level scoring_method must route around the default - # scorer, even when the default scorer would reject the answer. - rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") - (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") - - scenarios_path = tmp_path / "scenarios.json" - scenarios_path.write_text( - json.dumps( - [ - { - "id": 1, - "text": "Q", - "type": "tsfm", - "scoring_method": "stub-evaluator", - } - ] - ), - encoding="utf-8", - ) - - registry.register("stub-evaluator", _stub_scorer) - registry.register("fail-default", _fail_scorer) - - report = Evaluator(default_scorer="fail-default").evaluate( - trajectories_path=tmp_path, - scenarios_paths=[scenarios_path], - ) - - assert report.totals["passed"] == 1 - assert report.results[0].score.scorer == "stub-evaluator" - +def test_evaluator_strips_think_blocks_before_scoring( + tmp_path: Path, make_persisted_record +): + seen: dict[str, str] = {} -def test_evaluator_rejects_self_judging_model(tmp_path: Path, make_persisted_record): - trajectories_dir = tmp_path / "trajectories" - trajectories_dir.mkdir() + def capture_scorer( + scenario: Scenario, answer: str, trajectory_text: str + ) -> ScorerResult: + seen["answer"] = answer + return ScorerResult(scorer="capture-evaluator", passed=True, score=1.0) rec = make_persisted_record( run_id="run-1", scenario_id=1, - model="litellm_proxy/aws/claude-opus-4-6", + answer=( + "I should inspect the work orders.\n\n" + "There are no kit entries.\n\n" + "0" + ), ) - (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") scenarios_path = tmp_path / "scenarios.json" scenarios_path.write_text( - json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + json.dumps([{"id": 1, "text": "Q", "type": "wo"}]), encoding="utf-8", ) - registry.register("llm_judge", _stub_scorer) - - try: - Evaluator( - default_scorer="llm_judge", - judge_model="litellm_proxy/aws/claude-opus-4-6", - ).evaluate( - trajectories_path=trajectories_dir, - scenarios_paths=[scenarios_path], - ) - except ValueError as exc: - assert "self-judging is not allowed" in str(exc) - else: - raise AssertionError("expected ValueError for self-judging") - - -def test_evaluator_rejects_self_judging_with_normalized_model_ids( - tmp_path: Path, make_persisted_record -): - trajectories_dir = tmp_path / "trajectories" - trajectories_dir.mkdir() + registry.register("capture-evaluator", capture_scorer) - rec = make_persisted_record( - run_id="run-1", - scenario_id=1, - model="litellm_proxy/aws/claude-opus-4-6", + report = Evaluator(default_scorer="capture-evaluator").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], ) - (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") - scenarios_path = tmp_path / "scenarios.json" - scenarios_path.write_text( - json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), - encoding="utf-8", - ) + assert seen["answer"] == "0" + assert report.results[0].answer == "0" - registry.register("llm_judge", _stub_scorer) - try: - Evaluator( - default_scorer="llm_judge", - judge_model="aws/claude-opus-4-6", - ).evaluate( - trajectories_path=trajectories_dir, - scenarios_paths=[scenarios_path], - ) - except ValueError as exc: - assert "self-judging is not allowed" in str(exc) - else: - raise AssertionError("expected ValueError for self-judging") +def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="fail-default", passed=False, score=0.0) -def test_evaluator_allows_non_llm_judge_even_with_matching_model( - tmp_path: Path, make_persisted_record -): - rec = make_persisted_record( - run_id="run-1", - scenario_id=1, - model="litellm_proxy/aws/claude-opus-4-6", - ) +def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): + # The scenario-level scoring_method must route around the default + # scorer, even when the default scorer would reject the answer. + rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") scenarios_path = tmp_path / "scenarios.json" scenarios_path.write_text( - json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + json.dumps( + [ + { + "id": 1, + "text": "Q", + "type": "tsfm", + "scoring_method": "stub-evaluator", + } + ] + ), encoding="utf-8", - ) - - registry.register("stub-evaluator", _stub_scorer) - - report = Evaluator( - default_scorer="stub-evaluator", - judge_model="aws/claude-opus-4-6", - ).evaluate( - trajectories_path=tmp_path, - scenarios_paths=[scenarios_path], - ) - - assert report.totals["passed"] == 1 + ) \ No newline at end of file From fee20175ecd06867004509d3e5940efdb8aa007e Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Sat, 27 Jun 2026 18:36:37 -0400 Subject: [PATCH 2/2] [Updated] Strip blocks before evaluation Signed-off-by: Chathurangi Shyalika --- src/evaluation/evaluator.py | 9 +- src/evaluation/tests/test_evaluator.py | 112 ++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 10 deletions(-) diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index b35870c0..9beb1cc8 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -28,12 +28,7 @@ class Evaluator: - """Run a batch of scenarios against their saved trajectories. - - ``default_scorer`` names the registered scorer to use when a - scenario does not set ``scoring_method``. Per-scenario overrides - take precedence. - """ + """Run a batch of scenarios against their saved trajectories.""" def __init__( self, @@ -126,4 +121,4 @@ def _normalize_model_id(model_id: str | None) -> str: normalized = model_id.strip() if normalized.startswith("litellm_proxy/"): normalized = normalized[len("litellm_proxy/") :] - return normalized + return normalized \ No newline at end of file diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index f101361b..b1109213 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -79,8 +79,6 @@ def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> Score def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): - # The scenario-level scoring_method must route around the default - # scorer, even when the default scorer would reject the answer. rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") @@ -97,4 +95,112 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec ] ), encoding="utf-8", - ) \ No newline at end of file + ) + + registry.register("stub-evaluator", _stub_scorer) + registry.register("fail-default", _fail_scorer) + + report = Evaluator(default_scorer="fail-default").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1 + assert report.results[0].score.scorer == "stub-evaluator" + + +def test_evaluator_rejects_self_judging_model(tmp_path: Path, make_persisted_record): + trajectories_dir = tmp_path / "trajectories" + trajectories_dir.mkdir() + + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + model="litellm_proxy/aws/claude-opus-4-6", + ) + (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("llm_judge", _stub_scorer) + + try: + Evaluator( + default_scorer="llm_judge", + judge_model="litellm_proxy/aws/claude-opus-4-6", + ).evaluate( + trajectories_path=trajectories_dir, + scenarios_paths=[scenarios_path], + ) + except ValueError as exc: + assert "self-judging is not allowed" in str(exc) + else: + raise AssertionError("expected ValueError for self-judging") + + +def test_evaluator_rejects_self_judging_with_normalized_model_ids( + tmp_path: Path, make_persisted_record +): + trajectories_dir = tmp_path / "trajectories" + trajectories_dir.mkdir() + + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + model="litellm_proxy/aws/claude-opus-4-6", + ) + (trajectories_dir / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("llm_judge", _stub_scorer) + + try: + Evaluator( + default_scorer="llm_judge", + judge_model="aws/claude-opus-4-6", + ).evaluate( + trajectories_path=trajectories_dir, + scenarios_paths=[scenarios_path], + ) + except ValueError as exc: + assert "self-judging is not allowed" in str(exc) + else: + raise AssertionError("expected ValueError for self-judging") + + +def test_evaluator_allows_non_llm_judge_even_with_matching_model( + tmp_path: Path, make_persisted_record +): + rec = make_persisted_record( + run_id="run-1", + scenario_id=1, + model="litellm_proxy/aws/claude-opus-4-6", + ) + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("stub-evaluator", _stub_scorer) + + report = Evaluator( + default_scorer="stub-evaluator", + judge_model="aws/claude-opus-4-6", + ).evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1 \ No newline at end of file