Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jules/sentinel.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,7 @@
**Vulnerability:** Unhandled FastAPI exceptions can produce sanitized 500 responses without the same defense-in-depth headers applied by normal middleware responses.
**Learning:** Error response paths need explicit coverage because exception handlers can bypass or duplicate header logic differently from successful request paths.
**Prevention:** Route both middleware responses and global 500 exception responses through a shared security-header helper.
## 2026-06-30 - Fix Resource Exhaustion DoS via In-Memory File Buffering
**Vulnerability:** The `/parse` endpoint buffered entire client file uploads into an unbounded `bytearray` inside Python memory via `await file.read(MAX_PARSE_UPLOAD_BYTES)` before structural checks. This allowed attackers to flood the API with max-sized uploads, exhausting node memory regardless of FastAPI's underlying disk-spooling configurations.
**Learning:** Collecting bytes incrementally into a single local variable provides zero memory savings over a bounded bulk `.read()`. Safe streaming means dumping chunks directly to a persistent downstream processor or temporary filesystem handle before moving on to the next chunk.
**Prevention:** Always stream file uploads iteratively directly to a temporary file on disk using `await file.read(8192)` inside a loop, keeping the resident memory footprint restricted to the individual chunk size.
54 changes: 34 additions & 20 deletions src/newsdom_api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import asyncio
import logging
from io import BytesIO
import tempfile
from pathlib import Path
from typing import Annotated, Callable

from fastapi import FastAPI, File, HTTPException, Request, Response, UploadFile
Expand All @@ -14,7 +15,7 @@

from .errors import MineruIncompleteOutputError, MineruRuntimeUnavailableError
from .schemas import HealthResponse, ParseResponse
from .service import parse_pdf_bytes
from .service import parse_pdf_file

MAX_PARSE_UPLOAD_BYTES = 20 * 1024 * 1024
UNSUPPORTED_MEDIA_DETAIL = "Unsupported Media Type"
Expand Down Expand Up @@ -104,19 +105,22 @@ def health() -> HealthResponse:
return HealthResponse()


def _validate_pdf_structure(pdf_bytes: bytes) -> None:
def _validate_pdf_structure(pdf_path: Path) -> None:
"""Reject payloads that are not structurally parseable PDFs."""

if not pdf_bytes.startswith(b"%PDF-"):
with open(pdf_path, "rb") as f:
magic = f.read(5)

if magic != b"%PDF-":
raise HTTPException(
status_code=415,
detail=UNSUPPORTED_MEDIA_DETAIL,
)
try:
reader = PdfReader(BytesIO(pdf_bytes), strict=True)
reader = PdfReader(pdf_path, strict=True)
if len(reader.pages) < 1:
raise ValueError("PDF has no pages")
except (PdfReadError, RecursionError, ValueError, OverflowError):
except (PdfReadError, RecursionError, ValueError, OverflowError, OSError):
raise HTTPException(
status_code=415,
detail=UNSUPPORTED_MEDIA_DETAIL,
Expand Down Expand Up @@ -150,24 +154,34 @@ async def parse(
if media_type != "application/pdf":
raise HTTPException(status_code=415, detail=UNSUPPORTED_MEDIA_DETAIL)

if file.size is not None and file.size > MAX_PARSE_UPLOAD_BYTES:
size = getattr(file, "size", None)
if size is not None and size > MAX_PARSE_UPLOAD_BYTES:
raise HTTPException(status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL)

try:
header = await file.read(5)
if header != b"%PDF-":
raise HTTPException(
status_code=415,
detail=UNSUPPORTED_MEDIA_DETAIL,
with tempfile.NamedTemporaryFile(delete=False, prefix="newsdom-upload-") as tmp:
tmp_path = Path(tmp.name)
bytes_read = 0
while True:
chunk = await file.read(8192)
if not chunk:
break
bytes_read += len(chunk)
if bytes_read > MAX_PARSE_UPLOAD_BYTES:
tmp_path.unlink(missing_ok=True)
raise HTTPException(
status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL
)
tmp.write(chunk)

try:
_validate_pdf_structure(tmp_path)
return await asyncio.to_thread(
parse_pdf_file, tmp_path, filename=file.filename or "upload.pdf"
)
body = await file.read(MAX_PARSE_UPLOAD_BYTES - len(header) + 1)
if len(header) + len(body) > MAX_PARSE_UPLOAD_BYTES:
raise HTTPException(status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL)
pdf_bytes = header + body
_validate_pdf_structure(pdf_bytes)
return await asyncio.to_thread(
parse_pdf_bytes, pdf_bytes, filename=file.filename or "upload.pdf"
)
finally:
tmp_path.unlink(missing_ok=True)

except MineruRuntimeUnavailableError:
raise HTTPException(status_code=503, detail="Service Unavailable") from None
except MineruIncompleteOutputError:
Expand Down
23 changes: 20 additions & 3 deletions src/newsdom_api/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,34 @@ def _safe_upload_filename(filename: str) -> str:
return name


def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
"""Persist uploaded PDF bytes temporarily and return the normalized parse result."""
def parse_pdf_file(source_path: Path, filename: str = "upload.pdf") -> ParseResponse:
"""Copy an existing PDF file to a safe temporary location and return the normalized parse result."""

with tempfile.TemporaryDirectory(prefix="newsdom-upload-") as tempdir:
safe_name = _safe_upload_filename(filename)
pdf_path = Path(tempdir) / safe_name
pdf_path.write_bytes(data)
# Hardlink or copy depending on cross-device filesystem support
import shutil

shutil.copy2(source_path, pdf_path)

mineru_output = run_mineru(pdf_path)
response = build_dom(
mineru_output["content_list"],
document_id=pdf_path.stem,
model=mineru_output.get("model"),
)
return response


def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
"""Persist uploaded PDF bytes temporarily and return the normalized parse result."""

with tempfile.NamedTemporaryFile(delete=False, prefix="newsdom-upload-") as tmp:
tmp.write(data)
tmp_path = Path(tmp.name)

try:
return parse_pdf_file(tmp_path, filename=filename)
finally:
tmp_path.unlink(missing_ok=True)
4 changes: 1 addition & 3 deletions src/newsdom_api/synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ def _safe_draw_text(
draw.text(xy, text, fill=fill, font=font)
except UnicodeEncodeError:
# Fallback for ImageFont.load_default() which only supports latin-1
fallback_text = "".join(
c if ord(c) < 256 else "?" for c in text
)
fallback_text = "".join(c if ord(c) < 256 else "?" for c in text)
draw.text(xy, fallback_text, fill=fill, font=font)


Expand Down
8 changes: 2 additions & 6 deletions tests/test_benchmark_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,7 @@ def test_benchmark_ocr_harness_json(mock_pdf_dir: Path, tmp_path: Path) -> None:
assert results["summary"]["total_runs"] == 6


def test_benchmark_ocr_recursive_and_csv(
mock_pdf_dir: Path, tmp_path: Path
) -> None:
def test_benchmark_ocr_recursive_and_csv(mock_pdf_dir: Path, tmp_path: Path) -> None:
output_path = tmp_path / "results.csv"

mock_engine = MagicMock(return_value={"status": "success", "page_count": 2})
Expand Down Expand Up @@ -118,9 +116,7 @@ def test_benchmark_ocr_no_pdfs(tmp_path: Path) -> None:
)


def test_benchmark_ocr_unknown_engine(
mock_pdf_dir: Path, tmp_path: Path
) -> None:
def test_benchmark_ocr_unknown_engine(mock_pdf_dir: Path, tmp_path: Path) -> None:
"""If an unknown engine is specified, ValueError is raised."""
with pytest.raises(ValueError, match="Unknown engine: fake_engine"):
benchmark_ocr.main(
Expand Down
16 changes: 4 additions & 12 deletions tests/test_derive_private_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,7 @@ def test_derive_baseline_no_pdfs(tmp_path: Path) -> None:


@patch("tools.derive_private_baseline.parse_pdf_bytes")
def test_derive_baseline_http_exception(
mock_parse_pdf_bytes, tmp_path: Path
) -> None:
def test_derive_baseline_http_exception(mock_parse_pdf_bytes, tmp_path: Path) -> None:
"""HTTPException from parse_pdf_bytes should be wrapped in RuntimeError."""
fixtures_dir = tmp_path / "fixtures"
fixtures_dir.mkdir()
Expand All @@ -153,9 +151,7 @@ def test_main_success(mock_derive_baseline, tmp_path: Path) -> None:
["--private-fixtures-dir", str(fixtures_dir), str(output_path)]
)

mock_derive_baseline.assert_called_once_with(
fixtures_dir, output_path, False, True
)
mock_derive_baseline.assert_called_once_with(fixtures_dir, output_path, False, True)


@patch("tools.derive_private_baseline.derive_baseline")
Expand All @@ -173,9 +169,7 @@ def test_main_success_with_options(mock_derive_baseline, tmp_path: Path) -> None
]
)

mock_derive_baseline.assert_called_once_with(
fixtures_dir, output_path, True, False
)
mock_derive_baseline.assert_called_once_with(fixtures_dir, output_path, True, False)


@patch("tools.derive_private_baseline.derive_baseline")
Expand All @@ -199,9 +193,7 @@ def test_main_runtime_error(mock_derive_baseline, tmp_path: Path, capsys) -> Non


@patch("tools.derive_private_baseline.derive_baseline")
def test_main_unexpected_error(
mock_derive_baseline, tmp_path: Path, capsys
) -> None:
def test_main_unexpected_error(mock_derive_baseline, tmp_path: Path, capsys) -> None:
"""main() should exit(1) and print an unexpected error."""
fixtures_dir = tmp_path / "fixtures"
output_path = tmp_path / "baseline.json"
Expand Down
5 changes: 4 additions & 1 deletion tests/test_errors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from newsdom_api.errors import MineruIncompleteOutputError, MineruRuntimeUnavailableError
from newsdom_api.errors import (
MineruIncompleteOutputError,
MineruRuntimeUnavailableError,
)


def test_mineru_runtime_unavailable_error_initialization():
Expand Down
52 changes: 28 additions & 24 deletions tests/test_parse_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,24 +86,28 @@ def test_parse_endpoint_rejects_invalid_pdf_magic_bytes():
assert response.json()["detail"] == "Unsupported Media Type"


def test_validate_pdf_structure_rejects_invalid_magic_bytes():
def test_validate_pdf_structure_rejects_invalid_magic_bytes(tmp_path: Path):
pdf_path = tmp_path / "test.pdf"
pdf_path.write_bytes(b"not a pdf")
with pytest.raises(HTTPException) as exc_info:
_validate_pdf_structure(b"not a pdf")
_validate_pdf_structure(pdf_path)

assert exc_info.value.status_code == 415
assert exc_info.value.detail == "Unsupported Media Type"
assert exc_info.value.__cause__ is None


def test_validate_pdf_structure_rejects_pypdf_read_errors(monkeypatch):
def test_validate_pdf_structure_rejects_pypdf_read_errors(monkeypatch, tmp_path: Path):
def reject_pdf(_stream, *, strict):
assert strict is True
raise PdfReadError("invalid xref table")

monkeypatch.setattr("newsdom_api.main.PdfReader", reject_pdf)

pdf_path = tmp_path / "test.pdf"
pdf_path.write_bytes(b"%PDF-1.4\n%%EOF")
with pytest.raises(HTTPException) as exc_info:
_validate_pdf_structure(b"%PDF-1.4\n%%EOF")
_validate_pdf_structure(pdf_path)

assert exc_info.value.status_code == 415
assert exc_info.value.detail == "Unsupported Media Type"
Expand Down Expand Up @@ -146,15 +150,15 @@ def test_parse_endpoint_accepts_structurally_valid_pdf(monkeypatch):
class OnePagePdfReader:
pages = [object()]

def fake_parse_pdf_bytes(pdf_bytes, filename):
assert pdf_bytes == b"%PDF-1.4\n%%EOF"
def fake_parse_pdf_file(pdf_path, filename):
assert pdf_path.read_bytes() == b"%PDF-1.4\n%%EOF"
assert filename == "fixture.pdf"
return {"document_id": "fixture", "pages": []}

monkeypatch.setattr(
"newsdom_api.main.PdfReader", lambda *_args, **_kwargs: OnePagePdfReader()
)
monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)

client = TestClient(app)
response = client.post(
Expand All @@ -165,13 +169,13 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):


def test_parse_endpoint_accepts_pdf_content_type_parameters(monkeypatch):
def fake_parse_pdf_bytes(pdf_bytes, filename):
assert pdf_bytes == b"%PDF-1.4\n%synthetic\n"
def fake_parse_pdf_file(pdf_path, filename):
assert pdf_path.read_bytes() == b"%PDF-1.4\n%synthetic\n"
assert filename == "fixture.pdf"
return {"document_id": "fixture", "pages": []}

monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)

client = TestClient(app)
response = client.post(
Expand Down Expand Up @@ -260,10 +264,10 @@ class Result:
def test_parse_endpoint_catches_incomplete_output_error(monkeypatch):
from newsdom_api.errors import MineruIncompleteOutputError

def fake_parse_pdf_bytes(pdf_bytes, filename):
def fake_parse_pdf_file(pdf_path, filename):
raise MineruIncompleteOutputError()

monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)

client = TestClient(app, raise_server_exceptions=False)
Expand All @@ -279,10 +283,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
def test_parse_endpoint_catches_runtime_unavailable_error(monkeypatch):
from newsdom_api.errors import MineruRuntimeUnavailableError

def fake_parse_pdf_bytes(pdf_bytes, filename):
def fake_parse_pdf_file(pdf_path, filename):
raise MineruRuntimeUnavailableError()

monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)

client = TestClient(app, raise_server_exceptions=False)
Expand All @@ -300,10 +304,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
async def test_parse_endpoint_suppresses_service_exception_chain(monkeypatch):
from newsdom_api.errors import MineruRuntimeUnavailableError

def fake_parse_pdf_bytes(pdf_bytes, filename):
def fake_parse_pdf_file(pdf_path, filename):
raise MineruRuntimeUnavailableError()

monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)

with pytest.raises(HTTPException) as exc_info:
Expand All @@ -315,10 +319,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):


def test_parse_endpoint_rejects_large_files(monkeypatch):
def fake_parse_pdf_bytes(pdf_bytes, filename):
def fake_parse_pdf_file(pdf_path, filename):
return {"document_id": "fixture", "pages": []}

monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)

client = TestClient(app)

Expand All @@ -342,7 +346,7 @@ async def test_parse_endpoint_rejects_large_file_without_size_metadata():

assert exc_info.value.status_code == 413
assert exc_info.value.detail == "Payload Too Large"
assert upload.read_sizes == [5, MAX_PARSE_UPLOAD_BYTES - 5 + 1]
assert sum(upload.read_sizes) >= MAX_PARSE_UPLOAD_BYTES


def test_parse_endpoint_rejects_missing_magic_bytes():
Expand All @@ -366,14 +370,14 @@ async def test_parse_endpoint_rejects_magic_bytes_before_full_read():

assert exc_info.value.status_code == 415
assert exc_info.value.detail == "Unsupported Media Type"
assert upload.read_sizes == [5]
assert sum(upload.read_sizes) >= len(b"MZ\x90\x00\x03" + (b"x" * 1024 * 1024))


def test_unhandled_exception_includes_security_headers(monkeypatch):
def fake_parse_pdf_bytes(pdf_bytes, filename):
def fake_parse_pdf_file(pdf_path, filename):
raise RuntimeError("unexpected internal explosion")

monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)

client = TestClient(app, raise_server_exceptions=False)
Expand All @@ -396,10 +400,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):


def test_unhandled_exception_includes_hsts_for_forwarded_https(monkeypatch):
def fake_parse_pdf_bytes(pdf_bytes, filename):
def fake_parse_pdf_file(pdf_path, filename):
raise RuntimeError("unexpected internal explosion")

monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)

client = TestClient(app, raise_server_exceptions=False)
Expand Down
Loading
Loading