diff --git a/backend/app/alembic/versions/043_add_project_org_to_job_table.py b/backend/app/alembic/versions/043_add_project_org_to_job_table.py new file mode 100644 index 000000000..3bb2ccfff --- /dev/null +++ b/backend/app/alembic/versions/043_add_project_org_to_job_table.py @@ -0,0 +1,92 @@ +"""Add project_id and organization_id to job table + +Revision ID: 043 +Revises: 042 +Create Date: 2026-02-04 14:39:00.000000 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "043" +down_revision = "042" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + # Add organization_id column + op.add_column( + "job", + sa.Column( + "organization_id", + sa.Integer(), + nullable=False, + comment="Reference to the organization", + ), + ) + + # Add project_id column + op.add_column( + "job", + sa.Column( + "project_id", + sa.Integer(), + nullable=False, + comment="Reference to the project", + ), + ) + + # Create foreign key constraints + op.create_foreign_key( + "fk_job_organization_id", + "job", + "organization", + ["organization_id"], + ["id"], + ondelete="CASCADE", + ) + + op.create_foreign_key( + "fk_job_project_id", + "job", + "project", + ["project_id"], + ["id"], + ondelete="CASCADE", + ) + + # Create indexes + op.create_index( + "ix_job_organization_id", + "job", + ["organization_id"], + unique=False, + ) + + op.create_index( + "ix_job_project_id", + "job", + ["project_id"], + unique=False, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + # Drop indexes + op.drop_index("ix_job_project_id", table_name="job") + op.drop_index("ix_job_organization_id", table_name="job") + + # Drop foreign key constraints + op.drop_constraint("fk_job_project_id", "job", type_="foreignkey") + op.drop_constraint("fk_job_organization_id", "job", type_="foreignkey") + + # Drop columns + op.drop_column("job", "project_id") + op.drop_column("job", "organization_id") + # ### end Alembic commands ### diff --git a/backend/app/alembic/versions/044_optimize_conversation_query.py b/backend/app/alembic/versions/044_optimize_conversation_query.py new file mode 100644 index 000000000..d08e68cf5 --- /dev/null +++ b/backend/app/alembic/versions/044_optimize_conversation_query.py @@ -0,0 +1,34 @@ +"""add composite index for conversation query optimization + +Revision ID: 044 +Revises: 043 +Create Date: 2026-02-04 15:24:00.000000 + +""" +from alembic import op + + +# revision identifiers, used by Alembic. +revision = "044" +down_revision = "043" +branch_labels = None +depends_on = None + + +def upgrade(): + # Create composite index to optimize the get_conversation_by_ancestor_id query + # This query filters by: ancestor_response_id, project_id, is_deleted + # and orders by: inserted_at DESC + op.create_index( + "ix_openai_conversation_ancestor_project_active_time", + "openai_conversation", + ["ancestor_response_id", "project_id", "is_deleted", "inserted_at"], + unique=False, + ) + + +def downgrade(): + op.drop_index( + "ix_openai_conversation_ancestor_project_active_time", + table_name="openai_conversation", + ) diff --git a/backend/app/api/docs/documents/upload.md b/backend/app/api/docs/documents/upload.md index e667015f5..c6e51bbe3 100644 --- a/backend/app/api/docs/documents/upload.md +++ b/backend/app/api/docs/documents/upload.md @@ -4,6 +4,12 @@ Upload a document to Kaapi. - If a target format is specified, a transformation job will also be created to transform document into target format in the background. 
The response will include both the uploaded document details and information about the transformation job. - If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed. +### File Size Restrictions + +- **Maximum file size**: 512MB (configurable via `MAX_DOCUMENT_UPLOAD_SIZE_MB` environment variable) +- Files exceeding the size limit will be rejected with a 413 (Payload Too Large) error +- Empty files will be rejected with a 422 (Unprocessable Entity) error + ### Supported Transformations The following (source_format → target_format) transformations are supported: diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 58beb31b8..3e42e9952 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -34,6 +34,7 @@ build_document_schema, build_document_schemas, ) +from app.services.documents.validators import validate_document_file from app.utils import ( APIResponse, get_openai_client, @@ -123,6 +124,9 @@ async def upload_doc( if callback_url: validate_callback_url(callback_url) + # Validate file size before uploading to S3 + await validate_document_file(src) + source_format, actual_transformer = pre_transform_validation( src_filename=src.filename, target_format=target_format, diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 40c770541..eaab6a6e0 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -123,6 +123,9 @@ def AWS_S3_BUCKET(self) -> str: CALLBACK_CONNECT_TIMEOUT: int = 3 CALLBACK_READ_TIMEOUT: int = 10 + # Document upload size limit (in MB) + MAX_DOCUMENT_UPLOAD_SIZE_MB: int = 512 + @computed_field # type: ignore[prop-decorator] @property def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int: diff --git a/backend/app/crud/jobs.py b/backend/app/crud/jobs.py index 1a9005b69..f6da6b6f0 100644 --- a/backend/app/crud/jobs.py +++ b/backend/app/crud/jobs.py @@ -12,9 +12,17 @@ class 
JobCrud: def __init__(self, session: Session): self.session = session - def create(self, job_type: JobType, trace_id: str | None = None) -> Job: + def create( + self, + job_type: JobType, + project_id: int, + organization_id: int, + trace_id: str | None = None, + ) -> Job: new_job = Job( job_type=job_type, + project_id=project_id, + organization_id=organization_id, trace_id=trace_id, ) self.session.add(new_job) diff --git a/backend/app/models/job.py b/backend/app/models/job.py index b6a1a5ae7..753d1ab33 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -1,11 +1,16 @@ from datetime import datetime from enum import Enum +from typing import TYPE_CHECKING, Optional from uuid import UUID, uuid4 -from sqlmodel import Field, SQLModel +from sqlmodel import Field, Relationship, SQLModel from app.core.util import now +if TYPE_CHECKING: + from .organization import Organization + from .project import Project + class JobStatus(str, Enum): PENDING = "PENDING" @@ -58,6 +63,22 @@ class Job(SQLModel, table=True): }, ) + # Foreign keys + organization_id: int = Field( + foreign_key="organization.id", + nullable=False, + ondelete="CASCADE", + index=True, + sa_column_kwargs={"comment": "Reference to the organization"}, + ) + project_id: int = Field( + foreign_key="project.id", + nullable=False, + ondelete="CASCADE", + index=True, + sa_column_kwargs={"comment": "Reference to the project"}, + ) + # Timestamps created_at: datetime = Field( default_factory=now, @@ -68,6 +89,10 @@ class Job(SQLModel, table=True): sa_column_kwargs={"comment": "Timestamp when the job was last updated"}, ) + # Relationships + organization: Optional["Organization"] = Relationship() + project: Optional["Project"] = Relationship() + class JobUpdate(SQLModel): status: JobStatus | None = None diff --git a/backend/app/services/documents/validators.py b/backend/app/services/documents/validators.py new file mode 100644 index 000000000..8f5fcfba6 --- /dev/null +++ 
b/backend/app/services/documents/validators.py @@ -0,0 +1,49 @@ +"""Validation utilities for document uploads.""" + +import logging +from pathlib import Path + +from fastapi import HTTPException, UploadFile + +from app.core.config import settings +from app.utils import mask_string + +logger = logging.getLogger(__name__) + +# Maximum file size for document uploads (in bytes) +# Default: 512 MB, configurable via settings +MAX_DOCUMENT_SIZE = settings.MAX_DOCUMENT_UPLOAD_SIZE_MB * 1024 * 1024 + + +async def validate_document_file(file: UploadFile) -> int: + """ + Validate document file size. + + Args: + file: The uploaded file + + Returns: + File size in bytes if valid + """ + + # Get file size by seeking to end + file.file.seek(0, 2) + file_size = file.file.tell() + file.file.seek(0) + + if file_size > MAX_DOCUMENT_SIZE: + raise HTTPException( + status_code=413, + detail=f"File too large. Maximum size: {MAX_DOCUMENT_SIZE / (1024 * 1024):.0f}MB", + ) + + if file_size == 0: + raise HTTPException( + status_code=422, + detail="Empty file uploaded" + ) + + logger.info( + f"[validate_document_file] Document file validated: {mask_string(file.filename)} ({file_size} bytes)" + ) + return file_size diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index f4700b51b..793c5bf2e 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -26,7 +26,12 @@ def start_job( """Create an LLM job and schedule Celery task.""" trace_id = correlation_id.get() or "N/A" job_crud = JobCrud(session=db) - job = job_crud.create(job_type=JobType.LLM_API, trace_id=trace_id) + job = job_crud.create( + job_type=JobType.LLM_API, + project_id=project_id, + organization_id=organization_id, + trace_id=trace_id, + ) try: task_id = start_high_priority_job( diff --git a/backend/app/services/response/jobs.py b/backend/app/services/response/jobs.py index cab0f0e83..2b609b77d 100644 --- a/backend/app/services/response/jobs.py +++ 
b/backend/app/services/response/jobs.py @@ -19,7 +19,12 @@ def start_job( """Create a response job and schedule Celery task.""" trace_id = correlation_id.get() or "N/A" job_crud = JobCrud(session=db) - job = job_crud.create(job_type=JobType.RESPONSE, trace_id=trace_id) + job = job_crud.create( + job_type=JobType.RESPONSE, + project_id=project_id, + organization_id=organization_id, + trace_id=trace_id, + ) try: task_id = start_high_priority_job( diff --git a/backend/app/tests/api/routes/documents/test_route_document_upload.py b/backend/app/tests/api/routes/documents/test_route_document_upload.py index 6f16b52b1..3a4c78040 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_upload.py +++ b/backend/app/tests/api/routes/documents/test_route_document_upload.py @@ -325,3 +325,102 @@ def test_upload_response_structure_without_transformation( assert field in response.data assert response.data["transformation_job"] is None + + @patch("app.services.documents.validators.MAX_DOCUMENT_SIZE", 1024 * 1024) # Mock to 1MB + def test_upload_file_exceeds_size_limit( + self, + db: Session, + route: Route, + uploader: WebUploader, + ) -> None: + """Test that files exceeding the size limit are rejected.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Create a file larger than the 1MB mock limit + # For testing purposes, we'll create a 2MB file + with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as fp: + # Write 2MB of data (2 * 1024 * 1024 bytes) + chunk_size = 1024 * 1024 # 1MB chunks + for _ in range(2): + fp.write(b"0" * chunk_size) + fp.flush() + large_file = Path(fp.name) + + try: + response = uploader.put(route, large_file) + + assert response.status_code == 413 + error_data = response.json() + assert "File too large" in error_data["error"] + assert "Maximum size: 1MB" in error_data["error"] + + # Verify no document was created in the database + statement = select(Document).where(Document.fname == str(large_file)) + result = 
db.exec(statement).first() + assert result is None + finally: + large_file.unlink() + + def test_upload_empty_file( + self, + db: Session, + route: Route, + uploader: WebUploader, + ) -> None: + """Test that empty files are rejected.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Create an empty file + with NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as fp: + # Don't write anything, just create an empty file + fp.flush() + empty_file = Path(fp.name) + + try: + response = uploader.put(route, empty_file) + + assert response.status_code == 422 + error_data = response.json() + assert "Empty file uploaded" in error_data["error"] + + # Verify no document was created in the database + statement = select(Document).where(Document.fname == str(empty_file)) + result = db.exec(statement).first() + assert result is None + finally: + empty_file.unlink() + + @patch("app.services.documents.validators.MAX_DOCUMENT_SIZE", 512 * 1024 * 1024) + def test_upload_file_within_size_limit( + self, + db: Session, + route: Route, + uploader: WebUploader, + ) -> None: + """Test that files within the size limit are accepted.""" + aws = AmazonCloudStorageClient() + aws.create() + + # Create a 1MB file (well within the limit) + with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as fp: + # Write 1MB of data + fp.write(b"0" * (1024 * 1024)) + fp.flush() + normal_file = Path(fp.name) + + try: + response = httpx_to_standard(uploader.put(route, normal_file)) + + assert response.success is True + assert "id" in response.data + doc_id = response.data["id"] + + # Verify document was created in database + statement = select(Document).where(Document.id == doc_id) + result = db.exec(statement).one() + assert result.fname == str(normal_file) + finally: + normal_file.unlink() +