Skip to content
This repository was archived by the owner on Jun 3, 2026. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions src/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,51 @@

from __future__ import annotations

import logging
from typing import List

from src.config.constants import LLM_TAB_SEPARATOR
from src.schemas.classification import Classification
from src.utils.exceptions import ValidationError


logger = logging.getLogger(__name__)

# Maximum string length threshold: 10 * 1024 * 1024 characters (len() counts code points, not bytes)
MAX_STRING_LENGTH = 10 * 1024 * 1024
Comment on lines +20 to +21
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 The comment, log message, and error message all label this limit as "bytes", but Python's len() on a str counts Unicode code points (characters), not UTF-8 bytes. A string consisting entirely of 4-byte emoji characters would be ~40 MB in UTF-8 yet only count as ~10 M characters — sailing well past the intended 10 MB byte guard. Either use len(content.encode('utf-8')) if bytes truly matter, or update the constant name and all messages to say "characters."

Suggested change
# Maximum string length threshold (10 MB)
MAX_STRING_LENGTH = 10 * 1024 * 1024
# Maximum string length threshold (10 MB, measured in characters)
MAX_STRING_LENGTH = 10 * 1024 * 1024

Fix in Cursor Fix in Codex Fix in Claude Code



# ---------------------------------------------------------------------------
# Validation helpers
# ---------------------------------------------------------------------------


def validate_input_text(content: str, function_name: str) -> None:
    """
    Validate input text for null, empty, and size constraints.

    The size limit is compared against ``len(content)``, which for a ``str``
    is the number of Unicode code points, not the encoded byte size. All
    messages therefore report the limit in characters.

    Args:
        content: Text to validate.
        function_name: Name of the calling function; used as the prefix of
            every log and error message.

    Raises:
        ValidationError: If content is None, empty after stripping, or exceeds max length.
        TypeError: If content is not a string.
    """
    if content is None:
        raise ValidationError(f"{function_name}: Input text cannot be None")

    if not isinstance(content, str):
        raise TypeError(f"{function_name}: Input must be a string, got {type(content).__name__}")

    if not content.strip():
        raise ValidationError(f"{function_name}: Input text cannot be empty or whitespace-only")

    # Measure once; the same figure is used in the check, the log and the error.
    length = len(content)
    if length > MAX_STRING_LENGTH:
        logger.warning(
            "%s: Input text exceeds maximum length. "
            "Length: %d characters, Max: %d characters",
            function_name,
            length,
            MAX_STRING_LENGTH,
        )
        raise ValidationError(
            f"{function_name}: Input text exceeds maximum allowed length "
            f"({length} > {MAX_STRING_LENGTH} characters)"
        )
Comment on lines +46 to +54
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In Python, len(string) returns the number of characters (code points), not the number of bytes. If the input contains multi-byte characters (such as emojis or non-ASCII text), the actual byte size will be larger than len(content). To avoid misleading log and error messages, we should refer to this limit in terms of characters rather than bytes.

Suggested change
if len(content) > MAX_STRING_LENGTH:
logger.warning(
f"{function_name}: Input text exceeds maximum length. "
f"Length: {len(content)} bytes, Max: {MAX_STRING_LENGTH} bytes"
)
raise ValidationError(
f"{function_name}: Input text exceeds maximum allowed length "
f"({len(content)} > {MAX_STRING_LENGTH} bytes)"
)
if len(content) > MAX_STRING_LENGTH:
logger.warning(
f"{function_name}: Input text exceeds maximum length. "
f"Length: {len(content)} characters, Max: {MAX_STRING_LENGTH} characters"
)
raise ValidationError(
f"{function_name}: Input text exceeds maximum allowed length "
f"({len(content)} > {MAX_STRING_LENGTH} characters)"
)

Comment on lines +47 to +54
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 The log message says "bytes" but len(content) counts characters, not bytes. For multi-byte Unicode the reported number will be smaller than the actual byte size, which is misleading when debugging oversized payloads.

Suggested change
logger.warning(
f"{function_name}: Input text exceeds maximum length. "
f"Length: {len(content)} bytes, Max: {MAX_STRING_LENGTH} bytes"
)
raise ValidationError(
f"{function_name}: Input text exceeds maximum allowed length "
f"({len(content)} > {MAX_STRING_LENGTH} bytes)"
)
logger.warning(
f"{function_name}: Input text exceeds maximum length. "
f"Length: {len(content)} chars, Max: {MAX_STRING_LENGTH} chars"
)
raise ValidationError(
f"{function_name}: Input text exceeds maximum allowed length "
f"({len(content)} > {MAX_STRING_LENGTH} chars)"
)

Fix in Cursor Fix in Codex Fix in Claude Code



def attribute_unify(value: str) -> str:
Expand Down Expand Up @@ -45,7 +86,12 @@ def parse_raw_response_to_classifications(content: str) -> List[Classification]:

Expected line format::
SOURCE::QUERY

Raises:
ValidationError: If content is None, empty, or exceeds maximum length.
"""
validate_input_text(content, "parse_raw_response_to_classifications")

classifications: List[Classification] = []

for line in content.strip().splitlines():
Expand Down Expand Up @@ -99,7 +145,12 @@ def parse_raw_response_to_profiles(content: str) -> list[dict]:
TOPIC::SUB_TOPIC::MEMO

Lines before ``---`` are treated as the LLM's "thinking" and ignored.

Raises:
ValidationError: If content is None, empty, or exceeds maximum length.
"""
validate_input_text(content, "parse_raw_response_to_profiles")

facts: list[dict] = []

# Skip the thinking section (everything before '---')
Expand Down Expand Up @@ -150,7 +201,12 @@ def parse_raw_response_to_events(content: str) -> list[dict]:

Returns:
List of event dicts. Empty list if no events found.

Raises:
ValidationError: If content is None, empty, or exceeds maximum length.
"""
validate_input_text(content, "parse_raw_response_to_events")

content = content.strip()

if "NO_EVENT" in content.upper():
Expand Down Expand Up @@ -253,7 +309,12 @@ def parse_raw_response_to_event(content: str) -> dict | None:

Returns:
Dict with event data or *None* if no event was found.

Raises:
ValidationError: If content is None, empty, or exceeds maximum length.
"""
validate_input_text(content, "parse_raw_response_to_event")

events = parse_raw_response_to_events(content)
return events[0] if events else None
Comment on lines +316 to 319
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 parse_raw_response_to_event calls validate_input_text and then immediately delegates to parse_raw_response_to_events, which calls validate_input_text a second time on the same content. The validation is redundant — removing it here keeps a single authoritative check inside the function that does the actual parsing.

Suggested change
validate_input_text(content, "parse_raw_response_to_event")
events = parse_raw_response_to_events(content)
return events[0] if events else None
events = parse_raw_response_to_events(content)
return events[0] if events else None

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Fix in Cursor Fix in Codex Fix in Claude Code


Expand All @@ -277,7 +338,12 @@ def parse_raw_response_to_image(content: str) -> dict:
Returns:
Dict with keys ``description`` (str) and ``observations`` (list of dicts).
Each observation dict has ``category``, ``description``, and optional ``confidence``.

Raises:
ValidationError: If content is None, empty, or exceeds maximum length.
"""
validate_input_text(content, "parse_raw_response_to_image")

content = content.strip()

result: dict = {
Expand Down
205 changes: 205 additions & 0 deletions tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,208 @@ def test_xmem_error_serializes_context():
"operation": "write",
"details": {"id": "1"},
}


# ============================================================================
# VALIDATION TESTS - Empty String, Null, and Length Constraint Checks
# ============================================================================

class TestTextValidationEmptyAndNull:
    """Every text parser must refuse empty, blank, and ``None`` payloads."""

    # -- parse_raw_response_to_classifications ------------------------------

    def test_parse_classifications_rejects_empty_string(self):
        """A zero-length payload is reported as empty."""
        payload = ""
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_classifications(payload)

    def test_parse_classifications_rejects_whitespace_only(self):
        """A payload made only of whitespace is reported as empty."""
        payload = " \n\t "
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_classifications(payload)

    def test_parse_classifications_rejects_none(self):
        """``None`` is reported as a missing payload."""
        payload = None
        with pytest.raises(ValidationError, match="cannot be None"):
            parse_raw_response_to_classifications(payload)

    # -- parse_raw_response_to_profiles -------------------------------------

    def test_parse_profiles_rejects_empty_string(self):
        """A zero-length payload is reported as empty."""
        payload = ""
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_profiles(payload)

    def test_parse_profiles_rejects_whitespace_only(self):
        """A payload made only of whitespace is reported as empty."""
        payload = " \n\t "
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_profiles(payload)

    def test_parse_profiles_rejects_none(self):
        """``None`` is reported as a missing payload."""
        payload = None
        with pytest.raises(ValidationError, match="cannot be None"):
            parse_raw_response_to_profiles(payload)

    # -- parse_raw_response_to_events ---------------------------------------

    def test_parse_events_rejects_empty_string(self):
        """A zero-length payload is reported as empty."""
        payload = ""
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_events(payload)

    def test_parse_events_rejects_whitespace_only(self):
        """A payload made only of whitespace is reported as empty."""
        payload = " \n\t "
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_events(payload)

    def test_parse_events_rejects_none(self):
        """``None`` is reported as a missing payload."""
        payload = None
        with pytest.raises(ValidationError, match="cannot be None"):
            parse_raw_response_to_events(payload)

    # -- parse_raw_response_to_event ----------------------------------------

    def test_parse_event_rejects_empty_string(self):
        """A zero-length payload is reported as empty."""
        payload = ""
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_event(payload)

    def test_parse_event_rejects_whitespace_only(self):
        """A payload made only of whitespace is reported as empty."""
        payload = " \n\t "
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_event(payload)

    def test_parse_event_rejects_none(self):
        """``None`` is reported as a missing payload."""
        payload = None
        with pytest.raises(ValidationError, match="cannot be None"):
            parse_raw_response_to_event(payload)

    # -- parse_raw_response_to_image ----------------------------------------

    def test_parse_image_rejects_empty_string(self):
        """A zero-length payload is reported as empty."""
        payload = ""
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_image(payload)

    def test_parse_image_rejects_whitespace_only(self):
        """A payload made only of whitespace is reported as empty."""
        payload = " \n\t "
        with pytest.raises(ValidationError, match="cannot be empty"):
            parse_raw_response_to_image(payload)

    def test_parse_image_rejects_none(self):
        """``None`` is reported as a missing payload."""
        payload = None
        with pytest.raises(ValidationError, match="cannot be None"):
            parse_raw_response_to_image(payload)


class TestTextValidationTypeChecks:
    """Parsers must raise ``TypeError`` when handed anything other than a ``str``."""

    def test_parse_classifications_rejects_non_string_types(self):
        """An int, a list, and a dict are each rejected in turn."""
        # Checked in the same order as before; the first failure aborts the test.
        for bad_value in (123, ["profile::test"], {"text": "test"}):
            with pytest.raises(TypeError, match="Input must be a string"):
                parse_raw_response_to_classifications(bad_value)

    def test_parse_profiles_rejects_non_string_types(self):
        """An int payload is rejected."""
        bad_value = 456
        with pytest.raises(TypeError, match="Input must be a string"):
            parse_raw_response_to_profiles(bad_value)

    def test_parse_events_rejects_non_string_types(self):
        """An int payload is rejected."""
        bad_value = 789
        with pytest.raises(TypeError, match="Input must be a string"):
            parse_raw_response_to_events(bad_value)

    def test_parse_image_rejects_non_string_types(self):
        """An int payload is rejected."""
        bad_value = 999
        with pytest.raises(TypeError, match="Input must be a string"):
            parse_raw_response_to_image(bad_value)


class TestTextValidationLengthConstraints:
    """Parsers must refuse payloads longer than ``MAX_STRING_LENGTH``."""

    def test_parse_classifications_rejects_oversized_payload(self):
        """One character over the limit triggers ValidationError."""
        from src.utils.text import MAX_STRING_LENGTH

        too_long = "x" * (MAX_STRING_LENGTH + 1)
        expectation = pytest.raises(ValidationError, match="exceeds maximum allowed length")
        with expectation:
            parse_raw_response_to_classifications(too_long)

    def test_parse_profiles_rejects_oversized_payload(self):
        """One character over the limit triggers ValidationError."""
        from src.utils.text import MAX_STRING_LENGTH

        too_long = "x" * (MAX_STRING_LENGTH + 1)
        expectation = pytest.raises(ValidationError, match="exceeds maximum allowed length")
        with expectation:
            parse_raw_response_to_profiles(too_long)

    def test_parse_events_rejects_oversized_payload(self):
        """One character over the limit triggers ValidationError."""
        from src.utils.text import MAX_STRING_LENGTH

        too_long = "x" * (MAX_STRING_LENGTH + 1)
        expectation = pytest.raises(ValidationError, match="exceeds maximum allowed length")
        with expectation:
            parse_raw_response_to_events(too_long)

    def test_parse_image_rejects_oversized_payload(self):
        """One character over the limit triggers ValidationError."""
        from src.utils.text import MAX_STRING_LENGTH

        too_long = "x" * (MAX_STRING_LENGTH + 1)
        expectation = pytest.raises(ValidationError, match="exceeds maximum allowed length")
        with expectation:
            parse_raw_response_to_image(too_long)


class TestTextValidationRegressions:
    """Well-formed payloads must parse exactly as they did before validation was added."""

    def test_parse_classifications_valid_inputs_still_work(self):
        """Two ``SOURCE::QUERY`` lines yield two classifications."""
        raw = "profile::I work at XMem\ncode::Explain the API"
        parsed = parse_raw_response_to_classifications(raw)
        assert len(parsed) == 2
        first = parsed[0]
        assert first["source"] == "profile"
        assert first["query"] == "I work at XMem"

    def test_parse_profiles_valid_inputs_still_work(self):
        """Facts after the ``---`` separator are returned."""
        raw = "thinking\n---\nwork::company::Google\nbasic_info::name::Alice"
        parsed = parse_raw_response_to_profiles(raw)
        assert len(parsed) == 2
        assert parsed[0]["topic"] == "work"

    def test_parse_events_valid_inputs_still_work(self):
        """A single event block yields a one-element list."""
        raw = "DATE: 05-11\nEVENT_NAME: Demo\nYEAR: 2026"
        parsed = parse_raw_response_to_events(raw)
        assert len(parsed) == 1
        assert parsed[0]["date"] == "05-11"

    def test_parse_event_valid_inputs_still_work(self):
        """The single-event parser returns the event dict itself."""
        raw = "DATE: 05-11\nEVENT_NAME: Demo\nYEAR: 2026"
        event = parse_raw_response_to_event(raw)
        assert event["date"] == "05-11"
        assert event["event_name"] == "Demo"

    def test_parse_image_valid_inputs_still_work(self):
        """Description and one observation are extracted."""
        raw = "DESCRIPTION: Test image\nOBSERVATIONS:\n- [document] test"
        image = parse_raw_response_to_image(raw)
        assert image["description"] == "Test image"
        assert len(image["observations"]) == 1

    def test_parse_events_no_event_keyword_still_works(self):
        """``NO_EVENT`` produces an empty list from the multi-event parser."""
        parsed = parse_raw_response_to_events("NO_EVENT")
        assert parsed == []

    def test_parse_event_with_no_event_keyword_returns_none(self):
        """``NO_EVENT`` produces ``None`` from the single-event parser."""
        event = parse_raw_response_to_event("NO_EVENT")
        assert event is None

Loading