def validate_input_text(content: str, function_name: str) -> None:
    """
    Validate raw text before it is handed to one of the parsers.

    Checks run in the following order and the first failure wins:

    1. ``None`` is rejected with ``ValidationError``.
    2. Any other non-``str`` value is rejected with ``TypeError``.
    3. Text longer than ``MAX_STRING_LENGTH`` is rejected with
       ``ValidationError`` (a warning is logged first).
    4. Empty or whitespace-only text is rejected with ``ValidationError``.

    The size check deliberately runs before the whitespace check so that an
    oversized payload is rejected without being scanned or copied.

    Args:
        content: Text to validate.
        function_name: Name of the calling function; used as the prefix of
            every error and log message.

    Raises:
        ValidationError: If content is None, exceeds the maximum length, or
            is empty / whitespace-only.
        TypeError: If content is not a string.
    """
    if content is None:
        raise ValidationError(f"{function_name}: Input text cannot be None")

    if not isinstance(content, str):
        raise TypeError(f"{function_name}: Input must be a string, got {type(content).__name__}")

    # len() of a str counts characters (code points), not encoded bytes, so
    # the limit is reported in characters rather than bytes.
    length = len(content)
    if length > MAX_STRING_LENGTH:
        # Lazy %-style arguments: the message is only formatted if the
        # warning is actually emitted by the logging configuration.
        logger.warning(
            "%s: Input text exceeds maximum length. "
            "Length: %d characters, Max: %d characters",
            function_name,
            length,
            MAX_STRING_LENGTH,
        )
        raise ValidationError(
            f"{function_name}: Input text exceeds maximum allowed length "
            f"({length} > {MAX_STRING_LENGTH} characters)"
        )

    # str.isspace() is False for "", so emptiness is tested explicitly.
    # Unlike strip(), isspace() does not build a stripped copy of the text.
    if not content or content.isspace():
        raise ValidationError(f"{function_name}: Input text cannot be empty or whitespace-only")
""" + validate_input_text(content, "parse_raw_response_to_classifications") + classifications: List[Classification] = [] for line in content.strip().splitlines(): @@ -99,7 +145,12 @@ def parse_raw_response_to_profiles(content: str) -> list[dict]: TOPIC::SUB_TOPIC::MEMO Lines before ``---`` are treated as the LLM's "thinking" and ignored. + + Raises: + ValidationError: If content is None, empty, or exceeds maximum length. """ + validate_input_text(content, "parse_raw_response_to_profiles") + facts: list[dict] = [] # Skip the thinking section (everything before '---') @@ -150,7 +201,12 @@ def parse_raw_response_to_events(content: str) -> list[dict]: Returns: List of event dicts. Empty list if no events found. + + Raises: + ValidationError: If content is None, empty, or exceeds maximum length. """ + validate_input_text(content, "parse_raw_response_to_events") + content = content.strip() if "NO_EVENT" in content.upper(): @@ -253,7 +309,12 @@ def parse_raw_response_to_event(content: str) -> dict | None: Returns: Dict with event data or *None* if no event was found. + + Raises: + ValidationError: If content is None, empty, or exceeds maximum length. """ + validate_input_text(content, "parse_raw_response_to_event") + events = parse_raw_response_to_events(content) return events[0] if events else None @@ -277,7 +338,12 @@ def parse_raw_response_to_image(content: str) -> dict: Returns: Dict with keys ``description`` (str) and ``observations`` (list of dicts). Each observation dict has ``category``, ``description``, and optional ``confidence``. + + Raises: + ValidationError: If content is None, empty, or exceeds maximum length. 
""" + validate_input_text(content, "parse_raw_response_to_image") + content = content.strip() result: dict = { diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 304ed78..6607a14 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -123,3 +123,208 @@ def test_xmem_error_serializes_context(): "operation": "write", "details": {"id": "1"}, } + + +# ============================================================================ +# VALIDATION TESTS - Empty String, Null, and Length Constraint Checks +# ============================================================================ + +class TestTextValidationEmptyAndNull: + """Test that text parsing functions reject empty and null inputs.""" + + def test_parse_classifications_rejects_empty_string(self): + """Empty strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_classifications("") + + def test_parse_classifications_rejects_whitespace_only(self): + """Whitespace-only strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_classifications(" \n\t ") + + def test_parse_classifications_rejects_none(self): + """None input should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be None"): + parse_raw_response_to_classifications(None) + + def test_parse_profiles_rejects_empty_string(self): + """Empty strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_profiles("") + + def test_parse_profiles_rejects_whitespace_only(self): + """Whitespace-only strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_profiles(" \n\t ") + + def test_parse_profiles_rejects_none(self): + """None input should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be 
None"): + parse_raw_response_to_profiles(None) + + def test_parse_events_rejects_empty_string(self): + """Empty strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_events("") + + def test_parse_events_rejects_whitespace_only(self): + """Whitespace-only strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_events(" \n\t ") + + def test_parse_events_rejects_none(self): + """None input should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be None"): + parse_raw_response_to_events(None) + + def test_parse_event_rejects_empty_string(self): + """Empty strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_event("") + + def test_parse_event_rejects_whitespace_only(self): + """Whitespace-only strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_event(" \n\t ") + + def test_parse_event_rejects_none(self): + """None input should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be None"): + parse_raw_response_to_event(None) + + def test_parse_image_rejects_empty_string(self): + """Empty strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_image("") + + def test_parse_image_rejects_whitespace_only(self): + """Whitespace-only strings should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be empty"): + parse_raw_response_to_image(" \n\t ") + + def test_parse_image_rejects_none(self): + """None input should raise ValidationError.""" + with pytest.raises(ValidationError, match="cannot be None"): + parse_raw_response_to_image(None) + + +class TestTextValidationTypeChecks: + """Test that text parsing functions enforce type 
checks.""" + + def test_parse_classifications_rejects_non_string_types(self): + """Non-string types should raise TypeError.""" + with pytest.raises(TypeError, match="Input must be a string"): + parse_raw_response_to_classifications(123) + + with pytest.raises(TypeError, match="Input must be a string"): + parse_raw_response_to_classifications(["profile::test"]) + + with pytest.raises(TypeError, match="Input must be a string"): + parse_raw_response_to_classifications({"text": "test"}) + + def test_parse_profiles_rejects_non_string_types(self): + """Non-string types should raise TypeError.""" + with pytest.raises(TypeError, match="Input must be a string"): + parse_raw_response_to_profiles(456) + + def test_parse_events_rejects_non_string_types(self): + """Non-string types should raise TypeError.""" + with pytest.raises(TypeError, match="Input must be a string"): + parse_raw_response_to_events(789) + + def test_parse_image_rejects_non_string_types(self): + """Non-string types should raise TypeError.""" + with pytest.raises(TypeError, match="Input must be a string"): + parse_raw_response_to_image(999) + + +class TestTextValidationLengthConstraints: + """Test that text parsing functions enforce maximum length constraints.""" + + def test_parse_classifications_rejects_oversized_payload(self): + """Oversized payloads should raise ValidationError.""" + from src.utils.text import MAX_STRING_LENGTH + + oversized = "x" * (MAX_STRING_LENGTH + 1) + with pytest.raises(ValidationError, match="exceeds maximum allowed length"): + parse_raw_response_to_classifications(oversized) + + def test_parse_profiles_rejects_oversized_payload(self): + """Oversized payloads should raise ValidationError.""" + from src.utils.text import MAX_STRING_LENGTH + + oversized = "x" * (MAX_STRING_LENGTH + 1) + with pytest.raises(ValidationError, match="exceeds maximum allowed length"): + parse_raw_response_to_profiles(oversized) + + def test_parse_events_rejects_oversized_payload(self): + """Oversized 
payloads should raise ValidationError.""" + from src.utils.text import MAX_STRING_LENGTH + + oversized = "x" * (MAX_STRING_LENGTH + 1) + with pytest.raises(ValidationError, match="exceeds maximum allowed length"): + parse_raw_response_to_events(oversized) + + def test_parse_image_rejects_oversized_payload(self): + """Oversized payloads should raise ValidationError.""" + from src.utils.text import MAX_STRING_LENGTH + + oversized = "x" * (MAX_STRING_LENGTH + 1) + with pytest.raises(ValidationError, match="exceeds maximum allowed length"): + parse_raw_response_to_image(oversized) + + +class TestTextValidationRegressions: + """Test that valid inputs still work correctly after validation additions.""" + + def test_parse_classifications_valid_inputs_still_work(self): + """Valid classification inputs should still parse correctly.""" + result = parse_raw_response_to_classifications( + "profile::I work at XMem\ncode::Explain the API" + ) + assert len(result) == 2 + assert result[0]["source"] == "profile" + assert result[0]["query"] == "I work at XMem" + + def test_parse_profiles_valid_inputs_still_work(self): + """Valid profile inputs should still parse correctly.""" + result = parse_raw_response_to_profiles( + "thinking\n---\nwork::company::Google\nbasic_info::name::Alice" + ) + assert len(result) == 2 + assert result[0]["topic"] == "work" + + def test_parse_events_valid_inputs_still_work(self): + """Valid event inputs should still parse correctly.""" + result = parse_raw_response_to_events( + "DATE: 05-11\nEVENT_NAME: Demo\nYEAR: 2026" + ) + assert len(result) == 1 + assert result[0]["date"] == "05-11" + + def test_parse_event_valid_inputs_still_work(self): + """Valid single event inputs should still parse correctly.""" + result = parse_raw_response_to_event( + "DATE: 05-11\nEVENT_NAME: Demo\nYEAR: 2026" + ) + assert result["date"] == "05-11" + assert result["event_name"] == "Demo" + + def test_parse_image_valid_inputs_still_work(self): + """Valid image inputs should 
still parse correctly.""" + result = parse_raw_response_to_image( + "DESCRIPTION: Test image\nOBSERVATIONS:\n- [document] test" + ) + assert result["description"] == "Test image" + assert len(result["observations"]) == 1 + + def test_parse_events_no_event_keyword_still_works(self): + """NO_EVENT keyword should still return empty list.""" + result = parse_raw_response_to_events("NO_EVENT") + assert result == [] + + def test_parse_event_with_no_event_keyword_returns_none(self): + """NO_EVENT keyword should still return None for single event parser.""" + result = parse_raw_response_to_event("NO_EVENT") + assert result is None +