diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 24b77da149..4c92c0cff8 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -14,6 +14,7 @@ from __future__ import annotations +import re from typing import Optional from google.genai import types as genai_types @@ -92,6 +93,29 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED +class _UnicodeTokenizer: + """Tokenizer that handles Unicode text with word-boundary awareness. + + The default RougeScorer tokenizer splits on whitespace, which works for + ASCII and Latin-script text but produces zero tokens for text in scripts + without word boundaries (Chinese, Japanese, Thai, etc.). + + For ASCII-majority text this tokenizer uses Unicode-aware word-character + matching (``\\w+`` in re). For non-ASCII text it falls back to whitespace + splitting, then character-level tokenization. + """ + + def tokenize(self, text: str) -> list[str]: + """Tokenizes text using Unicode-aware word boundaries.""" + ascii_chars = sum(1 for c in text if ord(c) < 128) + if ascii_chars > len(text) * 0.5: + return re.findall(r'\w+', text.lower()) + tokens = text.lower().split() + if tokens: + return tokens + return list(text.lower()) + + def _calculate_rouge_1_scores(candidate: str, reference: str): """Calculates the ROUGE-1 score between a candidate and reference text. @@ -110,7 +134,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): Returns: A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer( + ["rouge1"], + use_stemmer=True, + tokenizer=_UnicodeTokenizer(), + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure.