From 87c8e4003e5115ad868564bddeeceda2bab2d7c3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 25 Jan 2026 11:32:49 +0530 Subject: [PATCH 1/6] Add score trace URL to evaluation run --- ...1_add_score_trace_url_to_evaluation_run.py | 32 +++++++++ backend/app/core/storage_utils.py | 37 ++++++++++- backend/app/crud/evaluations/core.py | 66 +++++++++++++++++-- backend/app/models/evaluation.py | 7 ++ .../app/services/evaluations/evaluation.py | 44 ++++++++++++- 5 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py diff --git a/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py new file mode 100644 index 000000000..60d5b4c26 --- /dev/null +++ b/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py @@ -0,0 +1,32 @@ +"""Add score_trace_url to evaluation_run + +Revision ID: 041 +Revises: 040 +Create Date: 2026-01-24 19:34:46.763908 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes + + +revision = "041" +down_revision = "040" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "evaluation_run", + sa.Column( + "score_trace_url", + sqlmodel.sql.sqltypes.AutoString(), + nullable=True, + comment="S3 URL where per-trace evaluation scores are stored", + ), + ) + + +def downgrade(): + op.drop_column("evaluation_run", "score_trace_url") diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index 63830d7d0..5b557872a 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -88,6 +88,7 @@ def upload_jsonl_to_object_store( results: list[dict], filename: str, subdirectory: str, + as_json_array: bool = False, ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. @@ -115,7 +116,10 @@ def upload_jsonl_to_object_store( file_path = Path(subdirectory) / filename # Convert results to JSONL - jsonl_content = "\n".join([json.dumps(result) for result in results]) + if as_json_array: + jsonl_content = json.dumps(results, ensure_ascii=False) + else: + jsonl_content = "\n".join([json.dumps(result) for result in results]) content_bytes = jsonl_content.encode("utf-8") # Create UploadFile-like object @@ -152,6 +156,37 @@ def upload_jsonl_to_object_store( return None +def load_json_from_object_store(storage: CloudStorage, url: str) -> list | None: + logger.info(f"[load_json_from_object_store] Loading JSON from '{url}") + try: + body = storage.stream(url) + content = body.read() + + data = json.loads(content.decode("utf-8")) + + logger.info( + f"[load_json_from_object_store] Download successful | " + f"url='{url}', size={len(content)} bytes" + ) + return data + except CloudStorageError as e: + logger.warning( + f"[load_json_from_object_store] failed to load JSON from '{url}': {e}", + ) + return None + except json.JSONDecodeError as e: + logger.warning( + f"[load_json_from_object_store] JSON decode error loading JSON from '{url}': {e}", + ) + return None + except Exception as e: + logger.warning( + f"[load_json_from_object_store] unexpected error loading JSON from '{url}': {e}", + exc_info=True, + ) + return None + + def generate_timestamped_filename(base_name: str, extension: str = "csv") -> str: """ Generate a filename with timestamp. diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 33b6777f3..47526aa27 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -147,6 +147,7 @@ def update_evaluation_run( status: str | None = None, error_message: str | None = None, object_store_url: str | None = None, + score_trace_url: str | None = None, score: dict | None = None, embedding_batch_job_id: int | None = None, ) -> EvaluationRun: @@ -179,6 +180,8 @@ def update_evaluation_run( eval_run.score = score if embedding_batch_job_id is not None: eval_run.embedding_batch_job_id = embedding_batch_job_id + if score_trace_url is not None: + eval_run.score_trace_url = score_trace_url # Always update timestamp eval_run.updated_at = now() @@ -296,6 +299,8 @@ def save_score( Updated EvaluationRun instance, or None if not found """ from app.core.db import engine + from app.core.cloud.storage import get_cloud_storage + from app.core.storage_utils import upload_jsonl_to_object_store with Session(engine) as session: eval_run = get_evaluation_run_by_id( @@ -304,10 +309,59 @@ def save_score( organization_id=organization_id, project_id=project_id, ) - if eval_run: - update_evaluation_run(session=session, eval_run=eval_run, score=score) - logger.info( - f"[save_score] Saved score | evaluation_id={eval_run_id} | " - f"traces={len(score.get('traces', []))}" - ) + if not eval_run: + return None + + traces = score.get("traces", []) + summary_score = score.get("summary_scores", []) + score_trace_url = None + + if traces: + try: + storage = get_cloud_storage(session=session, project_id=project_id) + score_trace_url = upload_jsonl_to_object_store( + storage=storage, + results=traces, + filename=f"traces_{eval_run_id}.json", + subdirectory=f"evaluations/score/{eval_run_id}", + as_json_array=True, + ) + if score_trace_url: + logger.info( + f"[save_score] uploaded traces to S3 | " + f"evaluation_id={eval_run_id} | url={score_trace_url} | " + f"traces_count={len(traces)}" + ) + else: + logger.warning( + f"[save_score] failed to upload traces to S3, " + f"falling back to DB storage | evaluation_id={eval_run_id}" + ) + except Exception as e: + logger.error( + f"[save_score] Error uploading traces to S3: {e} | " + f"evaluation_id={eval_run_id}", + exc_info=True, + ) + + # IF TRACES DATA IS STORED IN S3 URL THEN HERE WE ARE JUST STORING THE SUMMARY SCORE + # TODO: Evaluate weather this behaviour is needed or completely discard the storing data in db + if score_trace_url: + db_score = {"summary_scores": summary_score} + else: + # fallback to store data in db if failed to store in s3 + db_score = score + + update_evaluation_run( + session=session, + eval_run=eval_run, + score=db_score, + score_trace_url=score_trace_url, + ) + + logger.info( + f"[save_score] Saved score | evaluation_id={eval_run_id} | " + f"traces={len(score.get('traces', []))}" + ) + return eval_run diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index f99fbb27e..be489b8af 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -247,6 +247,13 @@ class EvaluationRun(SQLModel, table=True): description="Object store URL of processed evaluation results for future reference", sa_column_kwargs={"comment": "S3 URL of processed evaluation results"}, ) + score_trace_url: str | None = SQLField( + default=None, + description="S3 URL per-trace score data is stored", + sa_column_kwargs={ + "comment": "S3 URL where per-trace evaluation scores are stored" + }, + ) total_items: int = SQLField( default=0, description="Total number of items evaluated (set during processing)", diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 4c1a5de74..620f3f929 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -244,6 +244,9 @@ def get_evaluation_with_scores( Returns: Tuple of (EvaluationRun or None, error_message or None) """ + from app.core.cloud.storage import get_cloud_storage + from app.core.storage_utils import load_json_from_object_store + logger.info( f"[get_evaluation_with_scores] Fetching status for evaluation run | " f"evaluation_id={evaluation_id} | " @@ -282,9 +285,41 @@ def get_evaluation_with_scores( return eval_run, None # Check if we already have cached traces - has_cached_traces = eval_run.score is not None and "traces" in eval_run.score - if not resync_score and has_cached_traces: - return eval_run, None + has_cached_traces_s3 = eval_run.score_trace_url is not None + has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score + if not resync_score: + if has_cached_traces_s3: + try: + storage = get_cloud_storage(session=session, project_id=project_id) + traces = load_json_from_object_store( + storage=storage, url=eval_run.score_trace_url + ) + if traces is not None: + eval_run.score = { + "summary_scores": (eval_run.score or {}).get( + "summary_scores", [] + ), + "traces": traces, + } + logger.info( + f"[get_evaluation_with_scores] Loaded traces from S3 | " + f"evaluation_id={evaluation_id} | " + f"traces_count={len(traces)}" + ) + return eval_run, None + except Exception as e: + logger.error( + f"[get_evaluation_with_scores] Error loading traces from S3: {e} | " + f"evaluation_id={evaluation_id}", + exc_info=True, + ) + + elif has_cached_traces_db: + logger.info( + f"[get_evaluation_with_scores] Returning traces from DB | " + f"evaluation_id={evaluation_id}" + ) + return eval_run, None langfuse = get_langfuse_client( session=session, @@ -344,4 +379,7 @@ def get_evaluation_with_scores( score=score, ) + if eval_run: + eval_run.score = score + return eval_run, None From 87363113f8c89909d85ee27050d8791067aca502 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 29 Jan 2026 12:07:22 +0530 Subject: [PATCH 2/6] Migration: Add score_trace_url column to evaluation_run --- ...un.py => 042_add_score_trace_url_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{041_add_score_trace_url_to_evaluation_run.py => 042_add_score_trace_url_to_evaluation_run.py} (88%) diff --git a/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py similarity index 88% rename from backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py rename to backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py index 60d5b4c26..46231d4b2 100644 --- a/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py +++ b/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py @@ -1,7 +1,7 @@ """Add score_trace_url to evaluation_run -Revision ID: 041 -Revises: 040 +Revision ID: 042 +Revises: 041 Create Date: 2026-01-24 19:34:46.763908 """ @@ -10,8 +10,8 @@ import sqlmodel.sql.sqltypes -revision = "041" -down_revision = "040" +revision = "042" +down_revision = "041" branch_labels = None depends_on = None From 0ac775e974c562413334e9cafd43bb37e3af74e3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 31 Jan 2026 00:01:00 +0530 Subject: [PATCH 3/6] Migration: Add score_trace_url column to evaluation_run --- ...un.py => 043_add_score_trace_url_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{042_add_score_trace_url_to_evaluation_run.py => 043_add_score_trace_url_to_evaluation_run.py} (88%) diff --git a/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py similarity index 88% rename from backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py rename to backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py index 46231d4b2..19620401d 100644 --- a/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py +++ b/backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py @@ -1,7 +1,7 @@ """Add score_trace_url to evaluation_run -Revision ID: 042 -Revises: 041 +Revision ID: 043 +Revises: 042 Create Date: 2026-01-24 19:34:46.763908 """ @@ -10,8 +10,8 @@ import sqlmodel.sql.sqltypes -revision = "042" -down_revision = "041" +revision = "043" +down_revision = "042" branch_labels = None depends_on = None From 6e3d12e082a8ca916f43e9eb2a884bc6cb76827d Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 31 Jan 2026 23:40:53 +0530 Subject: [PATCH 4/6] fixes - test added - bug fixes --- backend/app/core/storage_utils.py | 9 +- backend/app/crud/evaluations/core.py | 3 +- backend/app/tests/core/test_storage_utils.py | 96 ++++++++++++++ .../crud/evaluations/test_score_storage.py | 90 ++++++++++++++ backend/app/tests/services/__init__.py | 0 .../tests/services/evaluations/__init__.py | 1 + .../evaluations/test_evaluation_service_s3.py | 117 ++++++++++++++++++ 7 files changed, 307 insertions(+), 9 deletions(-) create mode 100644 backend/app/tests/core/test_storage_utils.py create mode 100644 backend/app/tests/crud/evaluations/test_score_storage.py create mode 100644 backend/app/tests/services/__init__.py create mode 100644 backend/app/tests/services/evaluations/__init__.py create mode 100644 backend/app/tests/services/evaluations/test_evaluation_service_s3.py diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index 5b557872a..6a4ae43d8 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -87,8 +87,7 @@ def upload_jsonl_to_object_store( storage: CloudStorage, results: list[dict], filename: str, - subdirectory: str, - as_json_array: bool = False, + subdirectory: str ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. @@ -115,11 +114,7 @@ def upload_jsonl_to_object_store( # Create file path file_path = Path(subdirectory) / filename - # Convert results to JSONL - if as_json_array: - jsonl_content = json.dumps(results, ensure_ascii=False) - else: - jsonl_content = "\n".join([json.dumps(result) for result in results]) + jsonl_content = json.dumps(results, ensure_ascii=False) content_bytes = jsonl_content.encode("utf-8") # Create UploadFile-like object diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 8f393e90b..afcec9261 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -363,8 +363,7 @@ def save_score( storage=storage, results=traces, filename=f"traces_{eval_run_id}.json", - subdirectory=f"evaluations/score/{eval_run_id}", - as_json_array=True, + subdirectory=f"evaluations/score/{eval_run_id}" ) if score_trace_url: logger.info( diff --git a/backend/app/tests/core/test_storage_utils.py b/backend/app/tests/core/test_storage_utils.py new file mode 100644 index 000000000..687230054 --- /dev/null +++ b/backend/app/tests/core/test_storage_utils.py @@ -0,0 +1,96 @@ +"""Tests for storage_utils.py - upload and load functions for object store.""" + +import json +from io import BytesIO +from unittest.mock import MagicMock + +import pytest + +from app.core.cloud.storage import CloudStorageError +from app.core.storage_utils import ( + load_json_from_object_store, + upload_jsonl_to_object_store, +) + + +class TestUploadJsonlToObjectStore: + """Test uploading JSON content to object store.""" + + def test_upload_success(self) -> None: + """Verify successful upload returns URL and content is correct.""" + mock_storage = MagicMock() + mock_storage.put.return_value = "s3://bucket/path/traces.json" + + results = [{"trace_id": "t1", "score": 0.9}] + + url = upload_jsonl_to_object_store( + storage=mock_storage, + results=results, + filename="traces.json", + subdirectory="evaluations/score/70", + ) + + assert url == "s3://bucket/path/traces.json" + mock_storage.put.assert_called_once() + + # Verify content is valid JSON + call_args = mock_storage.put.call_args + upload_file = call_args.kwargs.get("source") + content = upload_file.file.read().decode("utf-8") + assert json.loads(content) == results + + def test_upload_returns_none_on_error(self) -> None: + """Verify returns None on CloudStorageError.""" + mock_storage = MagicMock() + mock_storage.put.side_effect = CloudStorageError("Upload failed") + + url = upload_jsonl_to_object_store( + storage=mock_storage, + results=[{"id": 1}], + filename="test.json", + subdirectory="test", + ) + + assert url is None + + +class TestLoadJsonFromObjectStore: + """Test loading JSON from object store.""" + + def test_load_success(self) -> None: + """Verify successful load returns parsed JSON.""" + mock_storage = MagicMock() + test_data = [{"id": 1, "value": "test"}] + mock_storage.stream.return_value = BytesIO(json.dumps(test_data).encode("utf-8")) + + result = load_json_from_object_store( + storage=mock_storage, + url="s3://bucket/path/file.json", + ) + + assert result == test_data + mock_storage.stream.assert_called_once_with("s3://bucket/path/file.json") + + def test_load_returns_none_on_error(self) -> None: + """Verify returns None on CloudStorageError.""" + mock_storage = MagicMock() + mock_storage.stream.side_effect = CloudStorageError("Download failed") + + result = load_json_from_object_store( + storage=mock_storage, + url="s3://bucket/file.json", + ) + + assert result is None + + def test_load_returns_none_on_invalid_json(self) -> None: + """Verify returns None on invalid JSON content.""" + mock_storage = MagicMock() + mock_storage.stream.return_value = BytesIO(b"not valid json") + + result = load_json_from_object_store( + storage=mock_storage, + url="s3://bucket/file.json", + ) + + assert result is None diff --git a/backend/app/tests/crud/evaluations/test_score_storage.py b/backend/app/tests/crud/evaluations/test_score_storage.py new file mode 100644 index 000000000..059880076 --- /dev/null +++ b/backend/app/tests/crud/evaluations/test_score_storage.py @@ -0,0 +1,90 @@ +"""Tests for save_score() S3 upload functionality.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from app.crud.evaluations.core import save_score +from app.models import EvaluationRun + + +class TestSaveScoreS3Upload: + """Test save_score() S3 upload functionality.""" + + @pytest.fixture + def mock_eval_run(self): + """Create a mock EvaluationRun.""" + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = 100 + return eval_run + + @patch("app.crud.evaluations.core.update_evaluation_run") + @patch("app.crud.evaluations.core.get_evaluation_run_by_id") + @patch("app.core.storage_utils.upload_jsonl_to_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.core.db.engine") + def test_uploads_traces_to_s3_and_stores_summary_only( + self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + ) -> None: + """Verify traces uploaded to S3, only summary_scores stored in DB.""" + mock_get_eval.return_value = mock_eval_run + mock_get_storage.return_value = MagicMock() + mock_upload.return_value = "s3://bucket/traces.json" + + score = { + "summary_scores": [{"name": "accuracy", "avg": 0.9}], + "traces": [{"trace_id": "t1"}], + } + + save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) + + # Verify upload was called with traces + mock_upload.assert_called_once() + assert mock_upload.call_args.kwargs["results"] == [{"trace_id": "t1"}] + + # Verify DB gets summary only, not traces + call_kwargs = mock_update.call_args.kwargs + assert call_kwargs["score"] == {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + assert call_kwargs["score_trace_url"] == "s3://bucket/traces.json" + + @patch("app.crud.evaluations.core.update_evaluation_run") + @patch("app.crud.evaluations.core.get_evaluation_run_by_id") + @patch("app.core.storage_utils.upload_jsonl_to_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.core.db.engine") + def test_fallback_to_db_when_s3_fails( + self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + ) -> None: + """Verify full score stored in DB when S3 upload fails.""" + mock_get_eval.return_value = mock_eval_run + mock_get_storage.return_value = MagicMock() + mock_upload.return_value = None # S3 failed + + score = { + "summary_scores": [{"name": "accuracy", "avg": 0.9}], + "traces": [{"trace_id": "t1"}], + } + + save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) + + # Full score stored in DB as fallback + call_kwargs = mock_update.call_args.kwargs + assert call_kwargs["score"] == score + assert call_kwargs["score_trace_url"] is None + + @patch("app.crud.evaluations.core.update_evaluation_run") + @patch("app.crud.evaluations.core.get_evaluation_run_by_id") + @patch("app.core.storage_utils.upload_jsonl_to_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.core.db.engine") + def test_no_s3_upload_when_no_traces( + self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + ) -> None: + """Verify S3 upload skipped when traces is empty.""" + mock_get_eval.return_value = mock_eval_run + + score = {"summary_scores": [{"name": "accuracy", "avg": 0.9}], "traces": []} + + save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) + + mock_upload.assert_not_called() diff --git a/backend/app/tests/services/__init__.py b/backend/app/tests/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/tests/services/evaluations/__init__.py b/backend/app/tests/services/evaluations/__init__.py new file mode 100644 index 000000000..293031958 --- /dev/null +++ b/backend/app/tests/services/evaluations/__init__.py @@ -0,0 +1 @@ +"""Evaluation service tests.""" diff --git a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py new file mode 100644 index 000000000..ea28d2aca --- /dev/null +++ b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py @@ -0,0 +1,117 @@ +"""Tests for get_evaluation_with_scores() S3 retrieval.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from app.models import EvaluationRun +from app.services.evaluations.evaluation import get_evaluation_with_scores + + +class TestGetEvaluationWithScoresS3: + """Test get_evaluation_with_scores() S3 retrieval.""" + + @pytest.fixture + def completed_eval_run_with_s3(self): + """Completed eval run with S3 URL.""" + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = 100 + eval_run.status = "completed" + eval_run.score = {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + eval_run.score_trace_url = "s3://bucket/traces.json" + eval_run.dataset_name = "test_dataset" + eval_run.run_name = "test_run" + return eval_run + + @pytest.fixture + def completed_eval_run_with_db_traces(self): + """Completed eval run with traces in DB.""" + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = 101 + eval_run.status = "completed" + eval_run.score = { + "summary_scores": [{"name": "accuracy", "avg": 0.85}], + "traces": [{"trace_id": "db_trace"}], + } + eval_run.score_trace_url = None + return eval_run + + @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") + @patch("app.core.storage_utils.load_json_from_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + def test_loads_traces_from_s3( + self, mock_get_storage, mock_load, mock_get_eval, completed_eval_run_with_s3 + ) -> None: + """Verify traces loaded from S3 and score reconstructed.""" + mock_get_eval.return_value = completed_eval_run_with_s3 + mock_get_storage.return_value = MagicMock() + mock_load.return_value = [{"trace_id": "s3_trace"}] + + result, error = get_evaluation_with_scores( + session=MagicMock(), + evaluation_id=100, + organization_id=1, + project_id=1, + get_trace_info=True, + resync_score=False, + ) + + assert error is None + mock_load.assert_called_once() + assert result.score["traces"] == [{"trace_id": "s3_trace"}] + assert result.score["summary_scores"] == [{"name": "accuracy", "avg": 0.9}] + + @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") + @patch("app.core.cloud.storage.get_cloud_storage") + def test_returns_db_traces_when_no_s3_url( + self, mock_get_storage, mock_get_eval, completed_eval_run_with_db_traces + ) -> None: + """Verify DB traces returned when no S3 URL.""" + mock_get_eval.return_value = completed_eval_run_with_db_traces + + result, error = get_evaluation_with_scores( + session=MagicMock(), + evaluation_id=101, + organization_id=1, + project_id=1, + get_trace_info=True, + resync_score=False, + ) + + assert error is None + mock_get_storage.assert_not_called() + assert result.score["traces"] == [{"trace_id": "db_trace"}] + + @patch("app.services.evaluations.evaluation.save_score") + @patch("app.services.evaluations.evaluation.fetch_trace_scores_from_langfuse") + @patch("app.services.evaluations.evaluation.get_langfuse_client") + @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") + @patch("app.core.storage_utils.load_json_from_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + def test_resync_bypasses_cache_and_fetches_langfuse( + self, + mock_get_storage, + mock_load, + mock_get_eval, + mock_get_langfuse, + mock_fetch_langfuse, + mock_save_score, + completed_eval_run_with_s3, + ) -> None: + """Verify resync=True skips S3/DB and fetches from Langfuse.""" + mock_get_eval.return_value = completed_eval_run_with_s3 + mock_get_langfuse.return_value = MagicMock() + mock_fetch_langfuse.return_value = {"summary_scores": [], "traces": [{"trace_id": "new"}]} + mock_save_score.return_value = completed_eval_run_with_s3 + + get_evaluation_with_scores( + session=MagicMock(), + evaluation_id=100, + organization_id=1, + project_id=1, + get_trace_info=True, + resync_score=True, + ) + + mock_load.assert_not_called() # S3 skipped + mock_fetch_langfuse.assert_called_once() # Langfuse called From afc0d700b62ea2b7e65a22bdc8b3e06653660751 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 31 Jan 2026 23:44:52 +0530 Subject: [PATCH 5/6] Refactor: Improve formatting and readability in storage utility and test files --- backend/app/core/storage_utils.py | 5 +--- backend/app/crud/evaluations/core.py | 2 +- backend/app/tests/core/test_storage_utils.py | 4 ++- .../crud/evaluations/test_score_storage.py | 28 ++++++++++++++++--- .../evaluations/test_evaluation_service_s3.py | 5 +++- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index 6a4ae43d8..bf87033f0 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -84,10 +84,7 @@ def __init__(self, content: bytes): def upload_jsonl_to_object_store( - storage: CloudStorage, - results: list[dict], - filename: str, - subdirectory: str + storage: CloudStorage, results: list[dict], filename: str, subdirectory: str ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index afcec9261..7e2b593b2 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -363,7 +363,7 @@ def save_score( storage=storage, results=traces, filename=f"traces_{eval_run_id}.json", - subdirectory=f"evaluations/score/{eval_run_id}" + subdirectory=f"evaluations/score/{eval_run_id}", ) if score_trace_url: logger.info( diff --git a/backend/app/tests/core/test_storage_utils.py b/backend/app/tests/core/test_storage_utils.py index 687230054..1df88168c 100644 --- a/backend/app/tests/core/test_storage_utils.py +++ b/backend/app/tests/core/test_storage_utils.py @@ -61,7 +61,9 @@ def test_load_success(self) -> None: """Verify successful load returns parsed JSON.""" mock_storage = MagicMock() test_data = [{"id": 1, "value": "test"}] - mock_storage.stream.return_value = BytesIO(json.dumps(test_data).encode("utf-8")) + mock_storage.stream.return_value = BytesIO( + json.dumps(test_data).encode("utf-8") + ) result = load_json_from_object_store( storage=mock_storage, diff --git a/backend/app/tests/crud/evaluations/test_score_storage.py b/backend/app/tests/crud/evaluations/test_score_storage.py index 059880076..6c455b7ad 100644 --- a/backend/app/tests/crud/evaluations/test_score_storage.py +++ b/backend/app/tests/crud/evaluations/test_score_storage.py @@ -24,7 +24,13 @@ def mock_eval_run(self): @patch("app.core.cloud.storage.get_cloud_storage") @patch("app.core.db.engine") def test_uploads_traces_to_s3_and_stores_summary_only( - self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + self, + mock_engine, + mock_get_storage, + mock_upload, + mock_get_eval, + mock_update, + mock_eval_run, ) -> None: """Verify traces uploaded to S3, only summary_scores stored in DB.""" mock_get_eval.return_value = mock_eval_run @@ -44,7 +50,9 @@ def test_uploads_traces_to_s3_and_stores_summary_only( # Verify DB gets summary only, not traces call_kwargs = mock_update.call_args.kwargs - assert call_kwargs["score"] == {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + assert call_kwargs["score"] == { + "summary_scores": [{"name": "accuracy", "avg": 0.9}] + } assert call_kwargs["score_trace_url"] == "s3://bucket/traces.json" @patch("app.crud.evaluations.core.update_evaluation_run") @@ -53,7 +61,13 @@ def test_uploads_traces_to_s3_and_stores_summary_only( @patch("app.core.cloud.storage.get_cloud_storage") @patch("app.core.db.engine") def test_fallback_to_db_when_s3_fails( - self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + self, + mock_engine, + mock_get_storage, + mock_upload, + mock_get_eval, + mock_update, + mock_eval_run, ) -> None: """Verify full score stored in DB when S3 upload fails.""" mock_get_eval.return_value = mock_eval_run @@ -78,7 +92,13 @@ def test_fallback_to_db_when_s3_fails( @patch("app.core.cloud.storage.get_cloud_storage") @patch("app.core.db.engine") def test_no_s3_upload_when_no_traces( - self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + self, + mock_engine, + mock_get_storage, + mock_upload, + mock_get_eval, + mock_update, + mock_eval_run, ) -> None: """Verify S3 upload skipped when traces is empty.""" mock_get_eval.return_value = mock_eval_run diff --git a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py index ea28d2aca..28a91615c 100644 --- a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py +++ b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py @@ -101,7 +101,10 @@ def test_resync_bypasses_cache_and_fetches_langfuse( """Verify resync=True skips S3/DB and fetches from Langfuse.""" mock_get_eval.return_value = completed_eval_run_with_s3 mock_get_langfuse.return_value = MagicMock() - mock_fetch_langfuse.return_value = {"summary_scores": [], "traces": [{"trace_id": "new"}]} + mock_fetch_langfuse.return_value = { + "summary_scores": [], + "traces": [{"trace_id": "new"}], + } mock_save_score.return_value = completed_eval_run_with_s3 get_evaluation_with_scores( From 788f3ae9f6185189b90eefca760ecd16ca670717 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Mon, 2 Feb 2026 10:47:24 +0530 Subject: [PATCH 6/6] bug fix --- backend/app/services/evaluations/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 3fc463086..42f1f44f8 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -259,7 +259,7 @@ def get_evaluation_with_scores( exc_info=True, ) - elif has_cached_traces_db: + if has_cached_traces_db: logger.info( f"[get_evaluation_with_scores] Returning traces from DB | " f"evaluation_id={evaluation_id}"