From 7c36d57a243a3a557ffe2b5427c1a847c5899f20 Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Mon, 25 May 2026 04:01:57 +0800 Subject: [PATCH 1/2] fix: include grounding metadata in rubric judge prompt --- src/google/adk/evaluation/eval_case.py | 3 ++ .../adk/evaluation/evaluation_generator.py | 38 ++++++++++++---- .../adk/evaluation/llm_as_judge_utils.py | 45 +++++++++++++++++++ .../rubric_based_final_response_quality_v1.py | 13 +++++- .../evaluation/test_evaluation_generator.py | 25 +++++++++++ .../evaluation/test_llm_as_judge_utils.py | 34 ++++++++++++++ ..._rubric_based_final_response_quality_v1.py | 31 +++++++++++++ 7 files changed, 178 insertions(+), 11 deletions(-) diff --git a/src/google/adk/evaluation/eval_case.py b/src/google/adk/evaluation/eval_case.py index 300b489d04b..ab85038fcf3 100644 --- a/src/google/adk/evaluation/eval_case.py +++ b/src/google/adk/evaluation/eval_case.py @@ -67,6 +67,9 @@ class InvocationEvent(EvalBaseModel): content: Optional[genai_types.Content] """The content of the event.""" + grounding_metadata: Optional[genai_types.GroundingMetadata] = None + """Grounding metadata emitted with the event.""" + class InvocationEvents(EvalBaseModel): """A container for events that occur during the course of an invocation.""" diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index e0e61fe758a..4c1635d87fb 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -662,6 +662,7 @@ def convert_events_to_eval_invocations( final_response = event.content final_event = event + should_add_event = event.grounding_metadata is not None for p in event.content.parts: if ( p.function_call @@ -669,16 +670,35 @@ def convert_events_to_eval_invocations( or p.text or p.inline_data ): - events_to_add.append(event) + should_add_event = True break - - invocation_events = [ - 
InvocationEvent(author=e.author, content=e.content) - for e in events_to_add - if final_event is None - or e is not final_event - or e.get_function_calls() - ] + if should_add_event: + events_to_add.append(event) + elif event.grounding_metadata is not None: + events_to_add.append(event) + + invocation_events = [] + for e in events_to_add: + # Keep the final event only when it carries tool calls (so the judge + # still sees the function call) or grounding metadata; every other + # event is always included. + if ( + e is final_event + and not e.get_function_calls() + and not e.grounding_metadata + ): + continue + invocation_events.append( + InvocationEvent( + author=e.author, + content=( + e.content + if e is not final_event or e.get_function_calls() + else None + ), + grounding_metadata=e.grounding_metadata, + ) + ) invocations.append( Invocation( invocation_id=invocation_id, diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index edc057be7cc..81b90a59812 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -153,6 +153,20 @@ class _ToolCallsAndResponses(EvalBaseModel): tool_calls_and_response: list[_ToolCallAndResponse] +class _GroundingMetadataEntry(EvalBaseModel): + """Internal data model to capture grounding metadata from an invocation.""" + + step: int + author: str + grounding_metadata: genai_types.GroundingMetadata + + +class _GroundingMetadataEntries(EvalBaseModel): + """Internal data model used for serializing grounding metadata.""" + + grounding_metadata: list[_GroundingMetadataEntry] + + def get_tool_calls_and_responses_as_json_str( intermediate_data: Optional[IntermediateDataType], ) -> str: @@ -187,3 +201,34 @@ def get_tool_calls_and_responses_as_json_str( exclude_defaults=True, exclude_none=True, ) + + +def get_grounding_metadata_as_json_str( + intermediate_data: Optional[IntermediateDataType], +) -> str: + """Returns a JSON 
string representation of grounding metadata.""" + if not isinstance(intermediate_data, InvocationEvents): + return "No grounding metadata was provided." + + grounding_metadata = [] + for idx, invocation_event in enumerate(intermediate_data.invocation_events): + if invocation_event.grounding_metadata: + grounding_metadata.append( + _GroundingMetadataEntry( + step=idx, + author=invocation_event.author, + grounding_metadata=invocation_event.grounding_metadata, + ) + ) + + if not grounding_metadata: + return "No grounding metadata was provided." + + return _GroundingMetadataEntries( + grounding_metadata=grounding_metadata + ).model_dump_json( + indent=2, + exclude_unset=True, + exclude_defaults=True, + exclude_none=True, + ) diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index 0229113175a..54ee97855fe 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -25,6 +25,7 @@ from .eval_case import InvocationEvents from .eval_metrics import EvalMetric from .eval_metrics import RubricsBasedCriterion +from .llm_as_judge_utils import get_grounding_metadata_as_json_str from .llm_as_judge_utils import get_text_from_content from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str from .llm_as_judge_utils import get_tool_declarations_as_json_str @@ -45,8 +46,9 @@ # Key Evaluation Principles Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it. -1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt. 
- * Your ONLY sources of truth are the <user_prompt> and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>. Examples of procedural flaws include: +1. **Establish Trusted Evidence from Tool Calls and Grounding**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt. + * Your ONLY sources of truth are the <user_prompt>, the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>, and model-supplied grounding metadata found in <grounding_metadata>. + * Grounding metadata is trusted evidence for model-internal tools such as google_search whose raw search results may not appear as function tool responses. Examples of procedural flaws include: * The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so. * The agent called the tool with incorrect or missing parameters. * The agent called a tool that does not exist, or called a tool with a parameter that does not exist. 
@@ -214,6 +216,9 @@ {response_steps} + + {grounding_metadata} + {final_response} @@ -296,6 +301,9 @@ def format_auto_rater_prompt( response_steps = get_tool_calls_and_responses_as_json_str( actual_invocation.intermediate_data ) + grounding_metadata = get_grounding_metadata_as_json_str( + actual_invocation.intermediate_data + ) app_details = actual_invocation.app_details if app_details: @@ -315,6 +323,7 @@ def format_auto_rater_prompt( tool_declarations=tool_declarations, user_input=user_input, response_steps=response_steps, + grounding_metadata=grounding_metadata, final_response=final_response, rubrics=rubrics_text, ) diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py index ea6364cad3e..b239d07cdef 100644 --- a/tests/unittests/evaluation/test_evaluation_generator.py +++ b/tests/unittests/evaluation/test_evaluation_generator.py @@ -236,6 +236,31 @@ def test_convert_multi_agent_final_responses( assert intermediate_events[0].author == "agent1" assert intermediate_events[0].content.parts[0].text == "First response" + def test_convert_preserves_grounding_metadata_from_final_response( + self, + ): + """Tests final grounding metadata is available to evaluators.""" + grounding_metadata = types.GroundingMetadata( + web_search_queries=["recent AI news"] + ) + events = [ + _build_event("user", [types.Part(text="What's new in AI?")], "inv1"), + Event( + author="agent", + content=types.Content(parts=[types.Part(text="Here are sources.")]), + invocation_id="inv1", + grounding_metadata=grounding_metadata, + ), + ] + + invocations = EvaluationGenerator.convert_events_to_eval_invocations(events) + + assert len(invocations) == 1 + invocation_events = invocations[0].intermediate_data.invocation_events + assert len(invocation_events) == 1 + assert invocation_events[0].content is None + assert invocation_events[0].grounding_metadata == grounding_metadata + class TestGetAppDetailsByInvocationId: """Test cases 
for EvaluationGenerator._get_app_details_by_invocation_id method.""" diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py index c7cd5ff5695..6e6dd8772a4 100644 --- a/tests/unittests/evaluation/test_llm_as_judge_utils.py +++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py @@ -26,6 +26,7 @@ from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score from google.adk.evaluation.llm_as_judge_utils import get_eval_status +from google.adk.evaluation.llm_as_judge_utils import get_grounding_metadata_as_json_str from google.adk.evaluation.llm_as_judge_utils import get_text_from_content from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str @@ -362,3 +363,36 @@ def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multipl ] } assert json.loads(json_str) == expected_json + + +def test_get_grounding_metadata_as_json_str_with_invocation_events(): + """Tests grounding metadata is serialized for LLM-as-judge prompts.""" + grounding_metadata = genai_types.GroundingMetadata( + web_search_queries=["recent AI news"] + ) + intermediate_data = InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=None, + grounding_metadata=grounding_metadata, + ) + ] + ) + + json_str = get_grounding_metadata_as_json_str(intermediate_data) + parsed = json.loads(json_str) + + assert parsed["grounding_metadata"][0]["step"] == 0 + assert parsed["grounding_metadata"][0]["author"] == "agent" + assert parsed["grounding_metadata"][0]["grounding_metadata"][ + "web_search_queries" + ] == ["recent AI news"] + + +def test_get_grounding_metadata_as_json_str_without_metadata(): + """Tests empty grounding metadata serialization.""" + assert ( + 
get_grounding_metadata_as_json_str(InvocationEvents()) + == "No grounding metadata was provided." + ) diff --git a/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py index e100f9c06a2..8166b8b62ae 100644 --- a/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py +++ b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py @@ -182,6 +182,37 @@ def test_format_auto_rater_prompt_with_intermediate_data( assert '"result": "ok"' in prompt +def test_format_auto_rater_prompt_with_grounding_metadata( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests grounding metadata is included as trusted evidence.""" + grounding_metadata = genai_types.GroundingMetadata( + web_search_queries=["recent AI news"] + ) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="What's new in AI?")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Here are sources.")] + ), + intermediate_data=InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=None, + grounding_metadata=grounding_metadata, + ) + ] + ), + ) + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert "" in prompt + assert "recent AI news" in prompt + assert "model-supplied grounding metadata" in prompt + + def test_format_auto_rater_prompt_with_app_details_no_tools( evaluator: RubricBasedFinalResponseQualityV1Evaluator, ): From c33f9c683fdfb918321ab72092d0b67aa720ba1e Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Wed, 27 May 2026 17:06:37 +0800 Subject: [PATCH 2/2] fix: satisfy eval grounding typing --- .../adk/evaluation/evaluation_generator.py | 5 +++-- .../adk/evaluation/llm_as_judge_utils.py | 18 +++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git 
a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 4c1635d87fb..8dbef2d8739 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -645,7 +645,7 @@ def convert_events_to_eval_invocations( ): app_details = app_details_per_invocation[invocation_id] - events_to_add = [] + events_to_add: list[Event] = [] for event in events: current_author = (event.author or _DEFAULT_AUTHOR).lower() @@ -683,7 +683,8 @@ def convert_events_to_eval_invocations( # still sees the function call) or grounding metadata; every other # event is always included. if ( - e is final_event + final_event is not None + and e is final_event and not e.get_function_calls() and not e.grounding_metadata ): diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index 81b90a59812..15b3d0854cf 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -17,6 +17,7 @@ import enum import statistics from typing import Any +from typing import cast from typing import Optional from typing import Union @@ -224,11 +225,14 @@ def get_grounding_metadata_as_json_str( if not grounding_metadata: return "No grounding metadata was provided." - return _GroundingMetadataEntries( - grounding_metadata=grounding_metadata - ).model_dump_json( - indent=2, - exclude_unset=True, - exclude_defaults=True, - exclude_none=True, + return cast( + str, + _GroundingMetadataEntries( + grounding_metadata=grounding_metadata + ).model_dump_json( + indent=2, + exclude_unset=True, + exclude_defaults=True, + exclude_none=True, + ), )