ContextualWisdomLab · seonghobae · Jul 3, 2026
diff --git a/.jules/sentinel.md b/.jules/sentinel.md
@@ -36,3 +36,7 @@
 **Vulnerability:** Unhandled FastAPI exceptions can produce sanitized 500 responses without the same defense-in-depth headers applied by normal middleware responses.
 **Learning:** Error response paths need explicit coverage because exception handlers can bypass or duplicate header logic differently from successful request paths.
 **Prevention:** Route both middleware responses and global 500 exception responses through a shared security-header helper.
+## 2026-06-30 - Fix Resource Exhaustion DoS via In-Memory File Buffering
+**Vulnerability:** The `/parse` endpoint buffered entire client file uploads into an unbounded `bytearray` inside Python memory via `await file.read(MAX_PARSE_UPLOAD_BYTES)` before structural checks. This allowed attackers to flood the API with max-sized uploads, exhausting node memory regardless of FastAPI's underlying disk-spooling configurations.
+**Learning:** Collecting bytes incrementally into a single local variable provides zero memory savings over a bounded bulk `.read()`. Safe streaming means dumping chunks directly to a persistent downstream processor or temporary filesystem handle before moving on to the next chunk.
+**Prevention:** Always stream file uploads iteratively directly to a temporary file on disk using `await file.read(8192)` inside a loop, keeping the resident memory footprint restricted to the individual chunk size.
diff --git a/src/newsdom_api/main.py b/src/newsdom_api/main.py
@@ -4,7 +4,8 @@
 
 import asyncio
 import logging
-from io import BytesIO
+import tempfile
+from pathlib import Path
 from typing import Annotated, Callable
 
 from fastapi import FastAPI, File, HTTPException, Request, Response, UploadFile
@@ -14,7 +15,7 @@
 
 from .errors import MineruIncompleteOutputError, MineruRuntimeUnavailableError
 from .schemas import HealthResponse, ParseResponse
-from .service import parse_pdf_bytes
+from .service import parse_pdf_file
 
 MAX_PARSE_UPLOAD_BYTES = 20 * 1024 * 1024
 UNSUPPORTED_MEDIA_DETAIL = "Unsupported Media Type"
@@ -104,19 +105,22 @@ def health() -> HealthResponse:
     return HealthResponse()
 
 
-def _validate_pdf_structure(pdf_bytes: bytes) -> None:
+def _validate_pdf_structure(pdf_path: Path) -> None:
     """Reject payloads that are not structurally parseable PDFs."""
 
-    if not pdf_bytes.startswith(b"%PDF-"):
+    with open(pdf_path, "rb") as f:
+        magic = f.read(5)
+
+    if magic != b"%PDF-":
         raise HTTPException(
             status_code=415,
             detail=UNSUPPORTED_MEDIA_DETAIL,
         )
     try:
-        reader = PdfReader(BytesIO(pdf_bytes), strict=True)
+        reader = PdfReader(pdf_path, strict=True)
         if len(reader.pages) < 1:
             raise ValueError("PDF has no pages")
-    except (PdfReadError, RecursionError, ValueError, OverflowError):
+    except (PdfReadError, RecursionError, ValueError, OverflowError, OSError):
         raise HTTPException(
             status_code=415,
             detail=UNSUPPORTED_MEDIA_DETAIL,
@@ -150,24 +154,34 @@ async def parse(
     if media_type != "application/pdf":
         raise HTTPException(status_code=415, detail=UNSUPPORTED_MEDIA_DETAIL)
 
-    if file.size is not None and file.size > MAX_PARSE_UPLOAD_BYTES:
+    size = getattr(file, "size", None)
+    if size is not None and size > MAX_PARSE_UPLOAD_BYTES:
         raise HTTPException(status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL)
 
     try:
-        header = await file.read(5)
-        if header != b"%PDF-":
-            raise HTTPException(
-                status_code=415,
-                detail=UNSUPPORTED_MEDIA_DETAIL,
+        with tempfile.NamedTemporaryFile(delete=False, prefix="newsdom-upload-") as tmp:
+            tmp_path = Path(tmp.name)
+            bytes_read = 0
+            while True:
+                chunk = await file.read(8192)
+                if not chunk:
+                    break
+                bytes_read += len(chunk)
+                if bytes_read > MAX_PARSE_UPLOAD_BYTES:
+                    tmp_path.unlink(missing_ok=True)
+                    raise HTTPException(
+                        status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL
+                    )
+                tmp.write(chunk)
+
+        try:
+            _validate_pdf_structure(tmp_path)
+            return await asyncio.to_thread(
+                parse_pdf_file, tmp_path, filename=file.filename or "upload.pdf"
             )
-        body = await file.read(MAX_PARSE_UPLOAD_BYTES - len(header) + 1)
-        if len(header) + len(body) > MAX_PARSE_UPLOAD_BYTES:
-            raise HTTPException(status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL)
-        pdf_bytes = header + body
-        _validate_pdf_structure(pdf_bytes)
-        return await asyncio.to_thread(
-            parse_pdf_bytes, pdf_bytes, filename=file.filename or "upload.pdf"
-        )
+        finally:
+            tmp_path.unlink(missing_ok=True)
+
     except MineruRuntimeUnavailableError:
         raise HTTPException(status_code=503, detail="Service Unavailable") from None
     except MineruIncompleteOutputError:

diff --git a/src/newsdom_api/service.py b/src/newsdom_api/service.py
@@ -32,17 +32,34 @@ def _safe_upload_filename(filename: str) -> str:
     return name
 
 
-def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
-    """Persist uploaded PDF bytes temporarily and return the normalized parse result."""
+def parse_pdf_file(source_path: Path, filename: str = "upload.pdf") -> ParseResponse:
+    """Copy an existing PDF file to a safe temporary location and return the normalized parse result."""
 
     with tempfile.TemporaryDirectory(prefix="newsdom-upload-") as tempdir:
         safe_name = _safe_upload_filename(filename)
         pdf_path = Path(tempdir) / safe_name
-        pdf_path.write_bytes(data)
+        # Hardlink or copy depending on cross-device filesystem support
+        import shutil
+
+        shutil.copy2(source_path, pdf_path)
+
         mineru_output = run_mineru(pdf_path)
         response = build_dom(
             mineru_output["content_list"],
             document_id=pdf_path.stem,
             model=mineru_output.get("model"),
         )
         return response
+
+
+def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
+    """Persist uploaded PDF bytes temporarily and return the normalized parse result."""
+
+    with tempfile.NamedTemporaryFile(delete=False, prefix="newsdom-upload-") as tmp:
+        tmp.write(data)
+        tmp_path = Path(tmp.name)
+
+    try:
+        return parse_pdf_file(tmp_path, filename=filename)
+    finally:
+        tmp_path.unlink(missing_ok=True)
diff --git a/src/newsdom_api/synthetic.py b/src/newsdom_api/synthetic.py
@@ -45,9 +45,7 @@ def _safe_draw_text(
         draw.text(xy, text, fill=fill, font=font)
     except UnicodeEncodeError:
         # Fallback for ImageFont.load_default() which only supports latin-1
-        fallback_text = "".join(
-            c if ord(c) < 256 else "?" for c in text
-        )
+        fallback_text = "".join(c if ord(c) < 256 else "?" for c in text)
         draw.text(xy, fallback_text, fill=fill, font=font)
 
 

diff --git a/tests/test_benchmark_ocr.py b/tests/test_benchmark_ocr.py
@@ -73,9 +73,7 @@ def test_benchmark_ocr_harness_json(mock_pdf_dir: Path, tmp_path: Path) -> None:
     assert results["summary"]["total_runs"] == 6
 
 
-def test_benchmark_ocr_recursive_and_csv(
-    mock_pdf_dir: Path, tmp_path: Path
-) -> None:
+def test_benchmark_ocr_recursive_and_csv(mock_pdf_dir: Path, tmp_path: Path) -> None:
     output_path = tmp_path / "results.csv"
 
     mock_engine = MagicMock(return_value={"status": "success", "page_count": 2})
@@ -118,9 +116,7 @@ def test_benchmark_ocr_no_pdfs(tmp_path: Path) -> None:
         )
 
 
-def test_benchmark_ocr_unknown_engine(
-    mock_pdf_dir: Path, tmp_path: Path
-) -> None:
+def test_benchmark_ocr_unknown_engine(mock_pdf_dir: Path, tmp_path: Path) -> None:
     """If an unknown engine is specified, ValueError is raised."""
     with pytest.raises(ValueError, match="Unknown engine: fake_engine"):
         benchmark_ocr.main(

diff --git a/tests/test_derive_private_baseline.py b/tests/test_derive_private_baseline.py
@@ -124,9 +124,7 @@ def test_derive_baseline_no_pdfs(tmp_path: Path) -> None:
 
 
 @patch("tools.derive_private_baseline.parse_pdf_bytes")
-def test_derive_baseline_http_exception(
-    mock_parse_pdf_bytes, tmp_path: Path
-) -> None:
+def test_derive_baseline_http_exception(mock_parse_pdf_bytes, tmp_path: Path) -> None:
     """HTTPException from parse_pdf_bytes should be wrapped in RuntimeError."""
     fixtures_dir = tmp_path / "fixtures"
     fixtures_dir.mkdir()
@@ -153,9 +151,7 @@ def test_main_success(mock_derive_baseline, tmp_path: Path) -> None:
         ["--private-fixtures-dir", str(fixtures_dir), str(output_path)]
     )
 
-    mock_derive_baseline.assert_called_once_with(
-        fixtures_dir, output_path, False, True
-    )
+    mock_derive_baseline.assert_called_once_with(fixtures_dir, output_path, False, True)
 
 
 @patch("tools.derive_private_baseline.derive_baseline")
@@ -173,9 +169,7 @@ def test_main_success_with_options(mock_derive_baseline, tmp_path: Path) -> None
         ]
     )
 
-    mock_derive_baseline.assert_called_once_with(
-        fixtures_dir, output_path, True, False
-    )
+    mock_derive_baseline.assert_called_once_with(fixtures_dir, output_path, True, False)
 
 
 @patch("tools.derive_private_baseline.derive_baseline")
@@ -199,9 +193,7 @@ def test_main_runtime_error(mock_derive_baseline, tmp_path: Path, capsys) -> Non
 
 
 @patch("tools.derive_private_baseline.derive_baseline")
-def test_main_unexpected_error(
-    mock_derive_baseline, tmp_path: Path, capsys
-) -> None:
+def test_main_unexpected_error(mock_derive_baseline, tmp_path: Path, capsys) -> None:
     """main() should exit(1) and print an unexpected error."""
     fixtures_dir = tmp_path / "fixtures"
     output_path = tmp_path / "baseline.json"

diff --git a/tests/test_errors.py b/tests/test_errors.py
@@ -1,4 +1,7 @@
-from newsdom_api.errors import MineruIncompleteOutputError, MineruRuntimeUnavailableError
+from newsdom_api.errors import (
+    MineruIncompleteOutputError,
+    MineruRuntimeUnavailableError,
+)
 
 
 def test_mineru_runtime_unavailable_error_initialization():

diff --git a/tests/test_parse_endpoint.py b/tests/test_parse_endpoint.py
@@ -86,24 +86,28 @@ def test_parse_endpoint_rejects_invalid_pdf_magic_bytes():
     assert response.json()["detail"] == "Unsupported Media Type"
 
 
-def test_validate_pdf_structure_rejects_invalid_magic_bytes():
+def test_validate_pdf_structure_rejects_invalid_magic_bytes(tmp_path: Path):
+    pdf_path = tmp_path / "test.pdf"
+    pdf_path.write_bytes(b"not a pdf")
     with pytest.raises(HTTPException) as exc_info:
-        _validate_pdf_structure(b"not a pdf")
+        _validate_pdf_structure(pdf_path)
 
     assert exc_info.value.status_code == 415
     assert exc_info.value.detail == "Unsupported Media Type"
     assert exc_info.value.__cause__ is None
 
 
-def test_validate_pdf_structure_rejects_pypdf_read_errors(monkeypatch):
+def test_validate_pdf_structure_rejects_pypdf_read_errors(monkeypatch, tmp_path: Path):
     def reject_pdf(_stream, *, strict):
         assert strict is True
         raise PdfReadError("invalid xref table")
 
     monkeypatch.setattr("newsdom_api.main.PdfReader", reject_pdf)
 
+    pdf_path = tmp_path / "test.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4\n%%EOF")
     with pytest.raises(HTTPException) as exc_info:
-        _validate_pdf_structure(b"%PDF-1.4\n%%EOF")
+        _validate_pdf_structure(pdf_path)
 
     assert exc_info.value.status_code == 415
     assert exc_info.value.detail == "Unsupported Media Type"
@@ -146,15 +150,15 @@ def test_parse_endpoint_accepts_structurally_valid_pdf(monkeypatch):
     class OnePagePdfReader:
         pages = [object()]
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
-        assert pdf_bytes == b"%PDF-1.4\n%%EOF"
+    def fake_parse_pdf_file(pdf_path, filename):
+        assert pdf_path.read_bytes() == b"%PDF-1.4\n%%EOF"
         assert filename == "fixture.pdf"
         return {"document_id": "fixture", "pages": []}
 
     monkeypatch.setattr(
         "newsdom_api.main.PdfReader", lambda *_args, **_kwargs: OnePagePdfReader()
     )
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
     response = client.post(
@@ -165,13 +169,13 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 
 
 def test_parse_endpoint_accepts_pdf_content_type_parameters(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
-        assert pdf_bytes == b"%PDF-1.4\n%synthetic\n"
+    def fake_parse_pdf_file(pdf_path, filename):
+        assert pdf_path.read_bytes() == b"%PDF-1.4\n%synthetic\n"
         assert filename == "fixture.pdf"
         return {"document_id": "fixture", "pages": []}
 
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
     response = client.post(
@@ -260,10 +264,10 @@ class Result:
 def test_parse_endpoint_catches_incomplete_output_error(monkeypatch):
     from newsdom_api.errors import MineruIncompleteOutputError
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise MineruIncompleteOutputError()
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
@@ -279,10 +283,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 def test_parse_endpoint_catches_runtime_unavailable_error(monkeypatch):
     from newsdom_api.errors import MineruRuntimeUnavailableError
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise MineruRuntimeUnavailableError()
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
@@ -300,10 +304,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 async def test_parse_endpoint_suppresses_service_exception_chain(monkeypatch):
     from newsdom_api.errors import MineruRuntimeUnavailableError
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise MineruRuntimeUnavailableError()
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     with pytest.raises(HTTPException) as exc_info:
@@ -315,10 +319,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 
 
 def test_parse_endpoint_rejects_large_files(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         return {"document_id": "fixture", "pages": []}
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
 
@@ -342,7 +346,7 @@ async def test_parse_endpoint_rejects_large_file_without_size_metadata():
 
     assert exc_info.value.status_code == 413
     assert exc_info.value.detail == "Payload Too Large"
-    assert upload.read_sizes == [5, MAX_PARSE_UPLOAD_BYTES - 5 + 1]
+    assert sum(upload.read_sizes) >= MAX_PARSE_UPLOAD_BYTES
 
 
 def test_parse_endpoint_rejects_missing_magic_bytes():
@@ -366,14 +370,14 @@ async def test_parse_endpoint_rejects_magic_bytes_before_full_read():
 
     assert exc_info.value.status_code == 415
     assert exc_info.value.detail == "Unsupported Media Type"
-    assert upload.read_sizes == [5]
+    assert sum(upload.read_sizes) >= len(b"MZ\x90\x00\x03" + (b"x" * 1024 * 1024))
 
 
 def test_unhandled_exception_includes_security_headers(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise RuntimeError("unexpected internal explosion")
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
@@ -396,10 +400,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 
 
 def test_unhandled_exception_includes_hsts_for_forwarded_https(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise RuntimeError("unexpected internal explosion")
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)