From 99376c1ba22dffe98fa41ca463fedcffb61fa19d Mon Sep 17 00:00:00 2001 From: Goutham Annem Date: Mon, 15 Jun 2026 11:08:40 -0700 Subject: [PATCH] fix(evaluation): handle None inferences in LocalEvalService When inference fails (e.g. an MCP session drop, a timeout, or an API error), InferenceResult.inferences is left as None, and _evaluate_single_inference_result() then calls len(inference_result.inferences) without a None guard, raising a TypeError. Return early with an EvalCaseResult whose final_eval_status is EvalStatus.NOT_EVALUATED and whose metric result lists are empty when inferences is None. Closes #6071 --- .../adk/evaluation/local_eval_service.py | 11 +++++++ .../evaluation/test_local_eval_service.py | 31 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 1a032bad64..e9a50ecb8d 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -260,6 +260,17 @@ async def _evaluate_single_inference_result( f' {inference_result.eval_set_id}.' ) + if inference_result.inferences is None: + return inference_result, EvalCaseResult( + eval_set_file=inference_result.eval_set_id, + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + final_eval_status=EvalStatus.NOT_EVALUATED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id=inference_result.session_id or "", + ) + # Metric results for each invocation eval_metric_result_per_invocation = [] diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py index 3bbfafc5be..44bee2bb47 100644 --- a/tests/unittests/evaluation/test_local_eval_service.py +++ b/tests/unittests/evaluation/test_local_eval_service.py @@ -465,6 +465,37 @@ async def test_evaluate_single_inference_result( assert metric_result.eval_status == EvalStatus.PASSED +@pytest.mark.asyncio +async def test_evaluate_single_inference_result_inferences_none( + eval_service, mock_eval_sets_manager, mocker +): +
inference_result = InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case1", + inferences=None, + session_id="session1", + ) + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [] + mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + _, result = await eval_service._evaluate_single_inference_result( + inference_result=inference_result, evaluate_config=evaluate_config + ) + + assert isinstance(result, EvalCaseResult) + assert result.eval_id == "case1" + assert result.final_eval_status == EvalStatus.NOT_EVALUATED + assert result.overall_eval_metric_results == [] + assert result.eval_metric_result_per_invocation == [] + + @pytest.mark.asyncio async def test_evaluate_single_inference_result_for_conversation_scenario( eval_service, mock_eval_sets_manager, mocker