From c4f04ff91ef6bfb048ffc1ad0d6fea252e4ab01c Mon Sep 17 00:00:00 2001
From: perseus <51974392+tcconnally@users.noreply.github.com>
Date: Mon, 15 Jun 2026 21:20:56 +0000
Subject: [PATCH 1/3] fix: ROUGE-1 eval fails for non-English languages
 (ASCII-only tokenizer)

The default RougeScorer tokenizer lowercases the text and keeps only
ASCII alphanumeric tokens ([a-z0-9]+). For non-Latin scripts (Thai,
Chinese, Japanese, etc.), this returns zero tokens, causing ROUGE scores
of 0.0 even when the response matches the expected output exactly.

Added a _unicode_tokenize function that uses Unicode-aware word matching
for ASCII-majority text; other text is split on whitespace/punctuation,
with character-level tokenization as a last-resort fallback.

Closes #3111
---
 .../adk/evaluation/final_response_match_v1.py | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 24b77da149..999190095b 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -92,6 +92,30 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+def _unicode_tokenize(text: str):
+  """Tokenizes text using Unicode-aware word boundaries.
+
+  The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII
+  [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
+  this returns zero tokens, causing ROUGE scores of 0.0 on matching responses.
+
+  This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
+  character-level tokenization for non-ASCII text.
+  """
+  import re
+  # For primarily non-ASCII text, tokenize by Unicode-aware patterns
+  ascii_chars = sum(1 for c in text if ord(c) < 128)
+  if ascii_chars > len(text) * 0.5:
+    return re.findall(r'[\\w]+', text.lower(), re.UNICODE)
+  # For non-Latin scripts, use whitespace splitting with Unicode support
+  tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE)
+  tokens = [t.lower() for t in tokens if t]
+  if tokens:
+    return tokens
+  # Character-level fallback for scripts without word boundaries
+  return list(text.lower())
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -110,7 +134,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  scorer = rouge_scorer.RougeScorer(
+        ["rouge1"],
+        use_stemmer=True,
+        tokenizer=_unicode_tokenize,
+    )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

From 5e7d5573c62032b29046188764e9406a92a762a6 Mon Sep 17 00:00:00 2001
From: perseus <51974392+tcconnally@users.noreply.github.com>
Date: Mon, 15 Jun 2026 19:52:27 -0500
Subject: [PATCH 2/3] fix: use proper tokenizer class for Unicode RougeScorer

- Replace function _unicode_tokenize with _UnicodeTokenizer class
  implementing the tokenize() method expected by RougeScorer
- Move import re to module level
- Fix double-escaped regex patterns (\\w -> \w, remove unsupported \p{P})
- Add return type annotation for tokenize() to satisfy mypy strict mode
- Fix RougeScorer constructor indentation
---
 .../adk/evaluation/final_response_match_v1.py | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 999190095b..4c92c0cff8 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import re
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
-def _unicode_tokenize(text: str):
-  """Tokenizes text using Unicode-aware word boundaries.
+class _UnicodeTokenizer:
+  """Tokenizer that handles Unicode text with word-boundary awareness.
 
-  The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII
-  [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
-  this returns zero tokens, causing ROUGE scores of 0.0 on matching responses.
+  The default RougeScorer tokenizer keeps only ASCII alphanumeric tokens
+  (``[a-z0-9]+``), so it produces zero tokens for text written entirely in
+  non-Latin scripts (Chinese, Japanese, Thai, etc.).
 
-  This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
-  character-level tokenization for non-ASCII text.
+  For ASCII-majority text this tokenizer uses Unicode-aware word-character
+  matching (``\\w+`` in re). Otherwise it splits on whitespace and falls back
+  to character-level tokenization only when that split yields no tokens.
   """
-  import re
-  # For primarily non-ASCII text, tokenize by Unicode-aware patterns
-  ascii_chars = sum(1 for c in text if ord(c) < 128)
-  if ascii_chars > len(text) * 0.5:
-    return re.findall(r'[\\w]+', text.lower(), re.UNICODE)
-  # For non-Latin scripts, use whitespace splitting with Unicode support
-  tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE)
-  tokens = [t.lower() for t in tokens if t]
-  if tokens:
-    return tokens
-  # Character-level fallback for scripts without word boundaries
-  return list(text.lower())
+
+  def tokenize(self, text: str) -> list[str]:
+    """Tokenizes text using Unicode-aware word boundaries."""
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    if ascii_chars > len(text) * 0.5:
+      return re.findall(r'\w+', text.lower())
+    tokens = text.lower().split()
+    if tokens:
+      return tokens
+    return list(text.lower())
 
 
 def _calculate_rouge_1_scores(candidate: str, reference: str):
@@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
   scorer = rouge_scorer.RougeScorer(
-        ["rouge1"],
-        use_stemmer=True,
-        tokenizer=_unicode_tokenize,
-    )
+      ["rouge1"],
+      use_stemmer=True,
+      tokenizer=_UnicodeTokenizer(),
+  )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

From 98396a4de3e3ae59654aca92c10291ce19d09c77 Mon Sep 17 00:00:00 2001
From: perseus <51974392+tcconnally@users.noreply.github.com>
Date: Wed, 17 Jun 2026 18:40:42 +0000
Subject: [PATCH 3/3] chore: apply pyink formatting

---
 src/google/adk/evaluation/final_response_match_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 4c92c0cff8..9131529b58 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -109,7 +109,7 @@ def tokenize(self, text: str) -> list[str]:
     """Tokenizes text using Unicode-aware word boundaries."""
     ascii_chars = sum(1 for c in text if ord(c) < 128)
     if ascii_chars > len(text) * 0.5:
-      return re.findall(r'\w+', text.lower())
+      return re.findall(r"\w+", text.lower())
     tokens = text.lower().split()
     if tokens:
       return tokens