Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions backend/app/api/docs/documents/upload.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ Upload a document to Kaapi.
- If a target format is specified, a transformation job will also be created to transform the document into the target format in the background. The response will include both the uploaded document details and information about the transformation job.
- If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed.

### File Size Restrictions

- **Maximum file size**: 50 MB by default (configurable via the `MAX_DOCUMENT_UPLOAD_SIZE_MB` environment variable)
- Files exceeding the size limit will be rejected with a 413 (Payload Too Large) error
- Empty files will be rejected with a 422 (Unprocessable Entity) error

### Supported Transformations

The following (source_format → target_format) transformations are supported:
Expand Down
4 changes: 4 additions & 0 deletions backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
build_document_schema,
build_document_schemas,
)
from app.services.documents.validators import validate_document_file
from app.utils import (
APIResponse,
get_openai_client,
Expand Down Expand Up @@ -123,6 +124,9 @@ async def upload_doc(
if callback_url:
validate_callback_url(callback_url)

# Validate file size before uploading to S3
await validate_document_file(src)

source_format, actual_transformer = pre_transform_validation(
src_filename=src.filename,
target_format=target_format,
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def recover_password(email: str, session: SessionDep) -> Message:
return Message(message="Password recovery email sent")


@router.post("/reset-password/", include_in_schema=False)
@router.post("/reset-password", include_in_schema=False)
def reset_password(session: SessionDep, body: NewPassword) -> Message:
"""
Reset password
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/private.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class PrivateUserCreate(BaseModel):
is_verified: bool = False


@router.post("/users/", response_model=UserPublic, include_in_schema=False)
@router.post("/users", response_model=UserPublic, include_in_schema=False)
def create_user(user_in: PrivateUserCreate, session: SessionDep) -> Any:
"""
Create a new user.
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ def test_email(email_to: EmailStr) -> Message:
return Message(message="Test email sent")


@router.get("/health/", include_in_schema=False)
@router.get("/health", include_in_schema=False)
async def health_check() -> bool:
    """Liveness endpoint: unconditionally return True."""
    return True
3 changes: 3 additions & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ def AWS_S3_BUCKET(self) -> str:
CALLBACK_CONNECT_TIMEOUT: int = 3
CALLBACK_READ_TIMEOUT: int = 10

# Document upload size limit (in MB)
MAX_DOCUMENT_UPLOAD_SIZE_MB: int = 50

@computed_field # type: ignore[prop-decorator]
@property
def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int:
Expand Down
54 changes: 54 additions & 0 deletions backend/app/services/documents/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Validation utilities for document uploads."""

import logging
import os
from pathlib import Path

from fastapi import HTTPException, UploadFile

from app.core.config import settings

logger = logging.getLogger(__name__)

# Maximum file size for document uploads (in bytes)
# Default: 50 MB, configurable via settings
MAX_DOCUMENT_SIZE = settings.MAX_DOCUMENT_UPLOAD_SIZE_MB * 1024 * 1024


async def validate_document_file(file: UploadFile) -> int:
    """
    Validate an uploaded document before it is stored.

    Checks, in order, that the upload has a filename, that it is not larger
    than ``MAX_DOCUMENT_SIZE`` bytes, and that it is not empty.

    Args:
        file: The uploaded file. Its underlying stream is rewound to the
            start once the size has been measured.

    Returns:
        File size in bytes if valid

    Raises:
        HTTPException: 422 if the filename is missing or the file is empty,
            413 if the file is larger than ``MAX_DOCUMENT_SIZE``.
    """
    if not file.filename:
        raise HTTPException(
            status_code=422,
            detail="File must have a filename",
        )

    # Measure the size by jumping to the end of the underlying stream, then
    # rewind so whoever reads the upload next sees it from the first byte.
    stream = file.file
    stream.seek(0, os.SEEK_END)
    file_size = stream.tell()
    stream.seek(0)

    if file_size > MAX_DOCUMENT_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {MAX_DOCUMENT_SIZE / (1024 * 1024):.0f}MB",
        )

    if file_size == 0:
        raise HTTPException(
            status_code=422,
            detail="Empty file uploaded",
        )

    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info("Document file validated: %s (%d bytes)", file.filename, file_size)
    return file_size
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,100 @@ def test_upload_response_structure_without_transformation(
assert field in response.data

assert response.data["transformation_job"] is None

def test_upload_file_exceeds_size_limit(
    self,
    db: Session,
    route: Route,
    uploader: WebUploader,
) -> None:
    """An upload above the 50MB limit must be refused with HTTP 413."""
    # NOTE(review): create() presumably prepares the storage backend used by
    # the upload route -- confirm against AmazonCloudStorageClient.
    storage = AmazonCloudStorageClient()
    storage.create()

    # Build a 51MB payload on disk, one 1MB block at a time, so it sits just
    # past the 50MB limit asserted below.
    one_mb_block = b"0" * (1024 * 1024)
    with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as tmp:
        for _ in range(51):
            tmp.write(one_mb_block)
        tmp.flush()
        oversized = Path(tmp.name)

    try:
        response = uploader.put(route, oversized)
        assert response.status_code == 413

        message = response.json()["error"]
        assert "File too large" in message
        assert "Maximum size: 50MB" in message

        # The rejected upload must not have left a Document row behind.
        lookup = select(Document).where(Document.fname == str(oversized))
        assert db.exec(lookup).first() is None
    finally:
        # delete=False above means the temp file is ours to remove.
        oversized.unlink()

def test_upload_empty_file(
    self,
    db: Session,
    route: Route,
    uploader: WebUploader,
) -> None:
    """A zero-byte upload must be refused with HTTP 422."""
    storage = AmazonCloudStorageClient()
    storage.create()

    # Create the temp file and close it again without writing a single byte.
    with NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp:
        tmp.flush()
        blank = Path(tmp.name)

    try:
        response = uploader.put(route, blank)
        assert response.status_code == 422

        message = response.json()["error"]
        assert "Empty file uploaded" in message

        # The rejected upload must not have left a Document row behind.
        lookup = select(Document).where(Document.fname == str(blank))
        assert db.exec(lookup).first() is None
    finally:
        # delete=False above means the temp file is ours to remove.
        blank.unlink()

def test_upload_file_within_size_limit(
    self,
    db: Session,
    route: Route,
    uploader: WebUploader,
) -> None:
    """A 1MB upload must be accepted and recorded as a Document row."""
    storage = AmazonCloudStorageClient()
    storage.create()

    # 1MB of filler bytes in a temporary .pdf file.
    payload = b"0" * (1024 * 1024)
    with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as tmp:
        tmp.write(payload)
        tmp.flush()
        accepted = Path(tmp.name)

    try:
        raw_response = uploader.put(route, accepted)
        response = httpx_to_standard(raw_response)

        assert response.success is True
        assert "id" in response.data
        created_id = response.data["id"]

        # Exactly one row must exist for the returned id, carrying the
        # uploaded file's name.
        lookup = select(Document).where(Document.id == created_id)
        stored = db.exec(lookup).one()
        assert stored.fname == str(accepted)
    finally:
        # delete=False above means the temp file is ours to remove.
        accepted.unlink()

4 changes: 2 additions & 2 deletions backend/app/tests/api/routes/test_login.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_reset_password(client: TestClient, db: Session) -> None:
data = {"new_password": new_password, "token": token}

r = client.post(
f"{settings.API_V1_STR}/reset-password/",
f"{settings.API_V1_STR}/reset-password",
headers=headers,
json=data,
)
Expand All @@ -89,7 +89,7 @@ def test_reset_password_invalid_token(
) -> None:
data = {"new_password": "changethis", "token": "invalid"}
r = client.post(
f"{settings.API_V1_STR}/reset-password/",
f"{settings.API_V1_STR}/reset-password",
headers=superuser_token_headers,
json=data,
)
Expand Down
2 changes: 1 addition & 1 deletion backend/app/tests/api/routes/test_private.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def test_create_user(client: TestClient, db: Session) -> None:
r = client.post(
f"{settings.API_V1_STR}/private/users/",
f"{settings.API_V1_STR}/private/users",
json={
"email": "pollo@listo.com",
"password": "password123",
Expand Down