From c4f04ff91ef6bfb048ffc1ad0d6fea252e4ab01c Mon Sep 17 00:00:00 2001 From: perseus <51974392+tcconnally@users.noreply.github.com> Date: Mon, 15 Jun 2026 21:20:56 +0000 Subject: [PATCH 1/3] fix: ROUGE-1 eval fails for non-English languages (ASCII-only tokenizer) The default RougeScorer tokenizer lowercases the text and replaces every character outside ASCII [a-z0-9] with a space before splitting on whitespace. For non-Latin scripts (Thai, Chinese, Japanese, etc.), this returns zero tokens, causing ROUGE scores of 0.0 even when the response matches the expected output exactly. Added _unicode_tokenize function that uses re.UNICODE flag and falls back to character-level tokenization for non-ASCII scripts. Closes #3111 --- .../adk/evaluation/final_response_match_v1.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 24b77da149..999190095b 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -92,6 +92,30 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED +def _unicode_tokenize(text: str): + """Tokenizes text using Unicode-aware word boundaries. + + The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII + [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.), + this returns zero tokens, causing ROUGE scores of 0.0 on matching responses. + + This tokenizer uses re.UNICODE for ASCII-majority text and falls back to + character-level tokenization for non-ASCII text. 
+ """ + import re + # For primarily non-ASCII text, tokenize by Unicode-aware patterns + ascii_chars = sum(1 for c in text if ord(c) < 128) + if ascii_chars > len(text) * 0.5: + return re.findall(r'[\\w]+', text.lower(), re.UNICODE) + # For non-Latin scripts, use whitespace splitting with Unicode support + tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE) + tokens = [t.lower() for t in tokens if t] + if tokens: + return tokens + # Character-level fallback for scripts without word boundaries + return list(text.lower()) + + def _calculate_rouge_1_scores(candidate: str, reference: str): """Calculates the ROUGE-1 score between a candidate and reference text. @@ -110,7 +134,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): Returns: A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer( + ["rouge1"], + use_stemmer=True, + tokenizer=_unicode_tokenize, + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. 
From 5e7d5573c62032b29046188764e9406a92a762a6 Mon Sep 17 00:00:00 2001 From: perseus <51974392+tcconnally@users.noreply.github.com> Date: Mon, 15 Jun 2026 19:52:27 -0500 Subject: [PATCH 2/3] fix: use proper tokenizer class for Unicode RougeScorer - Replace function _unicode_tokenize with _UnicodeTokenizer class implementing the tokenize() method expected by RougeScorer - Move import re to module level - Fix double-escaped regex patterns (\\w -> \w, remove unsupported \p{P}) - Add return type annotation for tokenize() to satisfy mypy strict mode - Fix RougeScorer constructor indentation --- .../adk/evaluation/final_response_match_v1.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 999190095b..4c92c0cff8 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -14,6 +14,7 @@ from __future__ import annotations +import re from typing import Optional from google.genai import types as genai_types @@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED -def _unicode_tokenize(text: str): - """Tokenizes text using Unicode-aware word boundaries. +class _UnicodeTokenizer: + """Tokenizer that handles Unicode text with word-boundary awareness. - The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII - [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.), - this returns zero tokens, causing ROUGE scores of 0.0 on matching responses. + The default RougeScorer tokenizer splits on whitespace, which works for + ASCII and Latin-script text but produces zero tokens for text in scripts + without word boundaries (Chinese, Japanese, Thai, etc.). 
- This tokenizer uses re.UNICODE for ASCII-majority text and falls back to - character-level tokenization for non-ASCII text. + For ASCII-majority text this tokenizer uses Unicode-aware word-character + matching (``\\w+`` in re). For non-ASCII text it falls back to whitespace + splitting, then character-level tokenization. """ - import re - # For primarily non-ASCII text, tokenize by Unicode-aware patterns - ascii_chars = sum(1 for c in text if ord(c) < 128) - if ascii_chars > len(text) * 0.5: - return re.findall(r'[\\w]+', text.lower(), re.UNICODE) - # For non-Latin scripts, use whitespace splitting with Unicode support - tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE) - tokens = [t.lower() for t in tokens if t] - if tokens: - return tokens - # Character-level fallback for scripts without word boundaries - return list(text.lower()) + + def tokenize(self, text: str) -> list[str]: + """Tokenizes text using Unicode-aware word boundaries.""" + ascii_chars = sum(1 for c in text if ord(c) < 128) + if ascii_chars > len(text) * 0.5: + return re.findall(r'\w+', text.lower()) + tokens = text.lower().split() + if tokens: + return tokens + return list(text.lower()) def _calculate_rouge_1_scores(candidate: str, reference: str): @@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ scorer = rouge_scorer.RougeScorer( - ["rouge1"], - use_stemmer=True, - tokenizer=_unicode_tokenize, - ) + ["rouge1"], + use_stemmer=True, + tokenizer=_UnicodeTokenizer(), + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. 
From 98396a4de3e3ae59654aca92c10291ce19d09c77 Mon Sep 17 00:00:00 2001 From: perseus <51974392+tcconnally@users.noreply.github.com> Date: Wed, 17 Jun 2026 18:40:42 +0000 Subject: [PATCH 3/3] chore: apply pyink formatting --- src/google/adk/evaluation/final_response_match_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 4c92c0cff8..9131529b58 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -109,7 +109,7 @@ def tokenize(self, text: str) -> list[str]: """Tokenizes text using Unicode-aware word boundaries.""" ascii_chars = sum(1 for c in text if ord(c) < 128) if ascii_chars > len(text) * 0.5: - return re.findall(r'\w+', text.lower()) + return re.findall(r"\w+", text.lower()) tokens = text.lower().split() if tokens: return tokens