From 828fa51a682a3eb7f166d60ad25a188897b0465a Mon Sep 17 00:00:00 2001 From: Ankit Mehta Date: Tue, 3 Feb 2026 18:37:55 +0530 Subject: [PATCH 1/2] Add file size validation to document upload endpoint --- backend/app/api/docs/documents/upload.md | 6 ++ backend/app/api/routes/documents.py | 4 + backend/app/core/config.py | 3 + backend/app/services/documents/validators.py | 54 +++++++++++ .../documents/test_route_document_upload.py | 97 +++++++++++++++++++ 5 files changed, 164 insertions(+) create mode 100644 backend/app/services/documents/validators.py diff --git a/backend/app/api/docs/documents/upload.md b/backend/app/api/docs/documents/upload.md index e667015f5..c6e51bbe3 100644 --- a/backend/app/api/docs/documents/upload.md +++ b/backend/app/api/docs/documents/upload.md @@ -4,6 +4,12 @@ Upload a document to Kaapi. - If a target format is specified, a transformation job will also be created to transform document into target format in the background. The response will include both the uploaded document details and information about the transformation job. - If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed. 
+### File Size Restrictions + +- **Maximum file size**: 50MB (configurable via `MAX_DOCUMENT_UPLOAD_SIZE_MB` environment variable) +- Files exceeding the size limit will be rejected with a 413 (Payload Too Large) error +- Empty files will be rejected with a 422 (Unprocessable Entity) error + ### Supported Transformations The following (source_format → target_format) transformations are supported: diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 58beb31b8..3e42e9952 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -34,6 +34,7 @@ build_document_schema, build_document_schemas, ) +from app.services.documents.validators import validate_document_file from app.utils import ( APIResponse, get_openai_client, @@ -123,6 +124,9 @@ async def upload_doc( if callback_url: validate_callback_url(callback_url) + # Validate file size before uploading to S3 + await validate_document_file(src) + source_format, actual_transformer = pre_transform_validation( src_filename=src.filename, target_format=target_format, diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 40c770541..832596008 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -123,6 +123,9 @@ def AWS_S3_BUCKET(self) -> str: CALLBACK_CONNECT_TIMEOUT: int = 3 CALLBACK_READ_TIMEOUT: int = 10 + # Document upload size limit (in MB) + MAX_DOCUMENT_UPLOAD_SIZE_MB: int = 50 + @computed_field # type: ignore[prop-decorator] @property def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int: diff --git a/backend/app/services/documents/validators.py b/backend/app/services/documents/validators.py new file mode 100644 index 000000000..3db6f39ff --- /dev/null +++ b/backend/app/services/documents/validators.py @@ -0,0 +1,54 @@ +"""Validation utilities for document uploads.""" + +import logging +from pathlib import Path + +from fastapi import HTTPException, UploadFile + +from app.core.config import settings + 
logger = logging.getLogger(__name__)

# Maximum file size for document uploads, in bytes.
# Derived once at import time from settings.MAX_DOCUMENT_UPLOAD_SIZE_MB.
MAX_DOCUMENT_SIZE = settings.MAX_DOCUMENT_UPLOAD_SIZE_MB * 1024 * 1024


async def validate_document_file(file: UploadFile) -> int:
    """Validate the filename and size of an uploaded document.

    The size is measured by seeking the underlying file object to its end,
    so no file content is read into memory. The stream is always rewound to
    offset 0 afterwards so later consumers read from the start.

    Args:
        file: The uploaded file.

    Returns:
        The file size in bytes when the file passes validation.

    Raises:
        HTTPException: 422 if the upload has no filename or is empty,
            413 if it is larger than ``MAX_DOCUMENT_SIZE``.
    """
    # Local import: only this function needs the symbolic whence constant.
    import os

    if not file.filename:
        raise HTTPException(
            status_code=422,
            detail="File must have a filename",
        )

    stream = file.file
    try:
        stream.seek(0, os.SEEK_END)
        file_size = stream.tell()
    finally:
        # Rewind even if measuring failed, so the stream is never left at EOF.
        stream.seek(0)

    if file_size > MAX_DOCUMENT_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {MAX_DOCUMENT_SIZE / (1024 * 1024):.0f}MB",
        )

    if file_size == 0:
        raise HTTPException(
            status_code=422,
            detail="Empty file uploaded",
        )

    # Lazy %-formatting; %r escapes control characters in the client-supplied
    # filename so it cannot inject extra lines into the log.
    logger.info("Document file validated: %r (%d bytes)", file.filename, file_size)
    return file_size
def test_upload_empty_file(
    self,
    db: Session,
    route: Route,
    uploader: WebUploader,
) -> None:
    """A zero-byte upload is rejected with 422 and no Document row is stored."""
    AmazonCloudStorageClient().create()

    # Open and close a temp file without writing, leaving a zero-byte file.
    with NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.flush()
        zero_byte_path = Path(handle.name)

    try:
        response = uploader.put(route, zero_byte_path)

        assert response.status_code == 422
        payload = response.json()
        assert "Empty file uploaded" in payload["error"]

        # The rejected upload must not have left a record behind.
        lookup = select(Document).where(Document.fname == str(zero_byte_path))
        assert db.exec(lookup).first() is None
    finally:
        zero_byte_path.unlink()
Verify document was created in database + statement = select(Document).where(Document.id == doc_id) + result = db.exec(statement).one() + assert result.fname == str(normal_file) + finally: + normal_file.unlink() + From 652adb9599c85c8b3d8ee6d7da4e97a9f5fd77ed Mon Sep 17 00:00:00 2001 From: Ankit Mehta Date: Tue, 3 Feb 2026 20:14:33 +0530 Subject: [PATCH 2/2] Remove trailing slashes from API endpoints --- backend/app/api/routes/login.py | 2 +- backend/app/api/routes/private.py | 2 +- backend/app/api/routes/utils.py | 2 +- backend/app/tests/api/routes/test_login.py | 4 ++-- backend/app/tests/api/routes/test_private.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/app/api/routes/login.py b/backend/app/api/routes/login.py index 704a5e8d7..54c1dbeb1 100644 --- a/backend/app/api/routes/login.py +++ b/backend/app/api/routes/login.py @@ -83,7 +83,7 @@ def recover_password(email: str, session: SessionDep) -> Message: return Message(message="Password recovery email sent") -@router.post("/reset-password/", include_in_schema=False) +@router.post("/reset-password", include_in_schema=False) def reset_password(session: SessionDep, body: NewPassword) -> Message: """ Reset password diff --git a/backend/app/api/routes/private.py b/backend/app/api/routes/private.py index 04c002243..6100829c2 100644 --- a/backend/app/api/routes/private.py +++ b/backend/app/api/routes/private.py @@ -20,7 +20,7 @@ class PrivateUserCreate(BaseModel): is_verified: bool = False -@router.post("/users/", response_model=UserPublic, include_in_schema=False) +@router.post("/users", response_model=UserPublic, include_in_schema=False) def create_user(user_in: PrivateUserCreate, session: SessionDep) -> Any: """ Create a new user. 
@router.get("/health", include_in_schema=False)
async def health_check() -> bool:
    """Health endpoint that unconditionally returns ``True``."""
    return True