From 910a54ff40b04c6f6d21446d7d9a37f1956e93a2 Mon Sep 17 00:00:00 2001
From: seonghobae <8172694+seonghobae@users.noreply.github.com>
Date: Fri, 3 Jul 2026 21:51:19 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20[CRITICAL]?=
 =?UTF-8?q?=20Fix=20Resource=20Exhaustion=20DoS=20in=20/parse=20endpoint?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `/parse` API buffered the entire contents of an uploaded PDF file into a single in-memory `bytes` object (up to 20MB) by calling `await file.read(...)`. This created a critical Resource Exhaustion Denial of Service (DoS) vulnerability, where multiple concurrent uploads could quickly exhaust the server's memory.

This fix refactors the `/parse` endpoint to incrementally stream the uploaded file chunks directly to a temporary file on disk using `await file.read(8192)`. The structural `_validate_pdf_structure` checks and downstream MinerU processing logic now safely operate against the bounded temporary file instead of an unbounded in-memory variable, capping the memory footprint to the 8KB chunk size per request.
---
 .jules/sentinel.md                    |  4 ++
 src/newsdom_api/main.py               | 54 +++++++++++++++++----------
 src/newsdom_api/service.py            | 23 ++++++++++--
 src/newsdom_api/synthetic.py          |  4 +-
 tests/test_benchmark_ocr.py           |  8 +---
 tests/test_derive_private_baseline.py | 16 ++------
 tests/test_errors.py                  |  5 ++-
 tests/test_parse_endpoint.py          | 52 ++++++++++++++------------
 tests/test_parse_endpoint_success.py  |  9 +++--
 tests/test_schemas.py                 |  5 ++-
 tests/test_tools_batch_parse.py       |  4 +-
 tools/batch_parse_pdf.py              |  8 +++-
 tools/export_markdown.py              |  8 +++-
 tools/parse_pdf.py                    |  4 +-
 14 files changed, 125 insertions(+), 79 deletions(-)

diff --git a/.jules/sentinel.md b/.jules/sentinel.md
index 18667ffe..d1db5592 100644
--- a/.jules/sentinel.md
+++ b/.jules/sentinel.md
@@ -36,3 +36,7 @@
 **Vulnerability:** Unhandled FastAPI exceptions can produce sanitized 500 responses without the same defense-in-depth headers applied by normal middleware responses.
 **Learning:** Error response paths need explicit coverage because exception handlers can bypass or duplicate header logic differently from successful request paths.
 **Prevention:** Route both middleware responses and global 500 exception responses through a shared security-header helper.
+## 2026-06-30 - Fix Resource Exhaustion DoS via In-Memory File Buffering
+**Vulnerability:** The `/parse` endpoint buffered entire client file uploads into an unbounded `bytearray` inside Python memory via `await file.read(MAX_PARSE_UPLOAD_BYTES)` before structural checks. This allowed attackers to flood the API with max-sized uploads, exhausting node memory regardless of FastAPI's underlying disk-spooling configurations.
+**Learning:** Collecting bytes incrementally into a single local variable provides zero memory savings over a bounded bulk `.read()`. Safe streaming means dumping chunks directly to a persistent downstream processor or temporary filesystem handle before moving on to the next chunk.
+**Prevention:** Always stream file uploads iteratively directly to a temporary file on disk using `await file.read(8192)` inside a loop, keeping the resident memory footprint restricted to the individual chunk size.
diff --git a/src/newsdom_api/main.py b/src/newsdom_api/main.py
index 41acaacf..b52ac861 100644
--- a/src/newsdom_api/main.py
+++ b/src/newsdom_api/main.py
@@ -4,7 +4,8 @@
 
 import asyncio
 import logging
-from io import BytesIO
+import tempfile
+from pathlib import Path
 from typing import Annotated, Callable
 
 from fastapi import FastAPI, File, HTTPException, Request, Response, UploadFile
@@ -14,7 +15,7 @@
 
 from .errors import MineruIncompleteOutputError, MineruRuntimeUnavailableError
 from .schemas import HealthResponse, ParseResponse
-from .service import parse_pdf_bytes
+from .service import parse_pdf_file
 
 MAX_PARSE_UPLOAD_BYTES = 20 * 1024 * 1024
 UNSUPPORTED_MEDIA_DETAIL = "Unsupported Media Type"
@@ -104,19 +105,22 @@ def health() -> HealthResponse:
     return HealthResponse()
 
 
-def _validate_pdf_structure(pdf_bytes: bytes) -> None:
+def _validate_pdf_structure(pdf_path: Path) -> None:
     """Reject payloads that are not structurally parseable PDFs."""
 
-    if not pdf_bytes.startswith(b"%PDF-"):
+    with open(pdf_path, "rb") as f:
+        magic = f.read(5)
+
+    if magic != b"%PDF-":
         raise HTTPException(
             status_code=415,
             detail=UNSUPPORTED_MEDIA_DETAIL,
         )
     try:
-        reader = PdfReader(BytesIO(pdf_bytes), strict=True)
+        reader = PdfReader(pdf_path, strict=True)
         if len(reader.pages) < 1:
             raise ValueError("PDF has no pages")
-    except (PdfReadError, RecursionError, ValueError, OverflowError):
+    except (PdfReadError, RecursionError, ValueError, OverflowError, OSError):
         raise HTTPException(
             status_code=415,
             detail=UNSUPPORTED_MEDIA_DETAIL,
@@ -150,24 +154,34 @@ async def parse(
     if media_type != "application/pdf":
         raise HTTPException(status_code=415, detail=UNSUPPORTED_MEDIA_DETAIL)
 
-    if file.size is not None and file.size > MAX_PARSE_UPLOAD_BYTES:
+    size = getattr(file, "size", None)
+    if size is not None and size > MAX_PARSE_UPLOAD_BYTES:
         raise HTTPException(status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL)
 
     try:
-        header = await file.read(5)
-        if header != b"%PDF-":
-            raise HTTPException(
-                status_code=415,
-                detail=UNSUPPORTED_MEDIA_DETAIL,
+        with tempfile.NamedTemporaryFile(delete=False, prefix="newsdom-upload-") as tmp:
+            tmp_path = Path(tmp.name)
+            bytes_read = 0
+            while True:
+                chunk = await file.read(8192)
+                if not chunk:
+                    break
+                bytes_read += len(chunk)
+                if bytes_read > MAX_PARSE_UPLOAD_BYTES:
+                    tmp_path.unlink(missing_ok=True)
+                    raise HTTPException(
+                        status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL
+                    )
+                tmp.write(chunk)
+
+        try:
+            _validate_pdf_structure(tmp_path)
+            return await asyncio.to_thread(
+                parse_pdf_file, tmp_path, filename=file.filename or "upload.pdf"
             )
-        body = await file.read(MAX_PARSE_UPLOAD_BYTES - len(header) + 1)
-        if len(header) + len(body) > MAX_PARSE_UPLOAD_BYTES:
-            raise HTTPException(status_code=413, detail=PAYLOAD_TOO_LARGE_DETAIL)
-        pdf_bytes = header + body
-        _validate_pdf_structure(pdf_bytes)
-        return await asyncio.to_thread(
-            parse_pdf_bytes, pdf_bytes, filename=file.filename or "upload.pdf"
-        )
+        finally:
+            tmp_path.unlink(missing_ok=True)
+
     except MineruRuntimeUnavailableError:
         raise HTTPException(status_code=503, detail="Service Unavailable") from None
     except MineruIncompleteOutputError:
diff --git a/src/newsdom_api/service.py b/src/newsdom_api/service.py
index 53136a6f..6f9d82d2 100644
--- a/src/newsdom_api/service.py
+++ b/src/newsdom_api/service.py
@@ -32,13 +32,17 @@ def _safe_upload_filename(filename: str) -> str:
     return name
 
 
-def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
-    """Persist uploaded PDF bytes temporarily and return the normalized parse result."""
+def parse_pdf_file(source_path: Path, filename: str = "upload.pdf") -> ParseResponse:
+    """Copy an existing PDF file to a safe temporary location and return the normalized parse result."""
 
     with tempfile.TemporaryDirectory(prefix="newsdom-upload-") as tempdir:
         safe_name = _safe_upload_filename(filename)
         pdf_path = Path(tempdir) / safe_name
-        pdf_path.write_bytes(data)
+        # Hardlink or copy depending on cross-device filesystem support
+        import shutil
+
+        shutil.copy2(source_path, pdf_path)
+
         mineru_output = run_mineru(pdf_path)
         response = build_dom(
             mineru_output["content_list"],
@@ -46,3 +50,16 @@ def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
             model=mineru_output.get("model"),
         )
         return response
+
+
+def parse_pdf_bytes(data: bytes, filename: str = "upload.pdf") -> ParseResponse:
+    """Persist uploaded PDF bytes temporarily and return the normalized parse result."""
+
+    with tempfile.NamedTemporaryFile(delete=False, prefix="newsdom-upload-") as tmp:
+        tmp.write(data)
+        tmp_path = Path(tmp.name)
+
+    try:
+        return parse_pdf_file(tmp_path, filename=filename)
+    finally:
+        tmp_path.unlink(missing_ok=True)
diff --git a/src/newsdom_api/synthetic.py b/src/newsdom_api/synthetic.py
index 90521211..75c97b79 100644
--- a/src/newsdom_api/synthetic.py
+++ b/src/newsdom_api/synthetic.py
@@ -45,9 +45,7 @@ def _safe_draw_text(
         draw.text(xy, text, fill=fill, font=font)
     except UnicodeEncodeError:
         # Fallback for ImageFont.load_default() which only supports latin-1
-        fallback_text = "".join(
-            c if ord(c) < 256 else "?" for c in text
-        )
+        fallback_text = "".join(c if ord(c) < 256 else "?" for c in text)
         draw.text(xy, fallback_text, fill=fill, font=font)
 
 
diff --git a/tests/test_benchmark_ocr.py b/tests/test_benchmark_ocr.py
index 2163b941..20db0554 100644
--- a/tests/test_benchmark_ocr.py
+++ b/tests/test_benchmark_ocr.py
@@ -73,9 +73,7 @@ def test_benchmark_ocr_harness_json(mock_pdf_dir: Path, tmp_path: Path) -> None:
     assert results["summary"]["total_runs"] == 6
 
 
-def test_benchmark_ocr_recursive_and_csv(
-    mock_pdf_dir: Path, tmp_path: Path
-) -> None:
+def test_benchmark_ocr_recursive_and_csv(mock_pdf_dir: Path, tmp_path: Path) -> None:
     output_path = tmp_path / "results.csv"
 
     mock_engine = MagicMock(return_value={"status": "success", "page_count": 2})
@@ -118,9 +116,7 @@ def test_benchmark_ocr_no_pdfs(tmp_path: Path) -> None:
         )
 
 
-def test_benchmark_ocr_unknown_engine(
-    mock_pdf_dir: Path, tmp_path: Path
-) -> None:
+def test_benchmark_ocr_unknown_engine(mock_pdf_dir: Path, tmp_path: Path) -> None:
     """If an unknown engine is specified, ValueError is raised."""
     with pytest.raises(ValueError, match="Unknown engine: fake_engine"):
         benchmark_ocr.main(
diff --git a/tests/test_derive_private_baseline.py b/tests/test_derive_private_baseline.py
index af166ae7..0288bf6c 100644
--- a/tests/test_derive_private_baseline.py
+++ b/tests/test_derive_private_baseline.py
@@ -124,9 +124,7 @@ def test_derive_baseline_no_pdfs(tmp_path: Path) -> None:
 
 
 @patch("tools.derive_private_baseline.parse_pdf_bytes")
-def test_derive_baseline_http_exception(
-    mock_parse_pdf_bytes, tmp_path: Path
-) -> None:
+def test_derive_baseline_http_exception(mock_parse_pdf_bytes, tmp_path: Path) -> None:
     """HTTPException from parse_pdf_bytes should be wrapped in RuntimeError."""
     fixtures_dir = tmp_path / "fixtures"
     fixtures_dir.mkdir()
@@ -153,9 +151,7 @@ def test_main_success(mock_derive_baseline, tmp_path: Path) -> None:
         ["--private-fixtures-dir", str(fixtures_dir), str(output_path)]
     )
 
-    mock_derive_baseline.assert_called_once_with(
-        fixtures_dir, output_path, False, True
-    )
+    mock_derive_baseline.assert_called_once_with(fixtures_dir, output_path, False, True)
 
 
 @patch("tools.derive_private_baseline.derive_baseline")
@@ -173,9 +169,7 @@ def test_main_success_with_options(mock_derive_baseline, tmp_path: Path) -> None
         ]
     )
 
-    mock_derive_baseline.assert_called_once_with(
-        fixtures_dir, output_path, True, False
-    )
+    mock_derive_baseline.assert_called_once_with(fixtures_dir, output_path, True, False)
 
 
 @patch("tools.derive_private_baseline.derive_baseline")
@@ -199,9 +193,7 @@ def test_main_runtime_error(mock_derive_baseline, tmp_path: Path, capsys) -> Non
 
 
 @patch("tools.derive_private_baseline.derive_baseline")
-def test_main_unexpected_error(
-    mock_derive_baseline, tmp_path: Path, capsys
-) -> None:
+def test_main_unexpected_error(mock_derive_baseline, tmp_path: Path, capsys) -> None:
     """main() should exit(1) and print an unexpected error."""
     fixtures_dir = tmp_path / "fixtures"
     output_path = tmp_path / "baseline.json"
diff --git a/tests/test_errors.py b/tests/test_errors.py
index 8f7dd5d9..86d16f86 100644
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@@ -1,4 +1,7 @@
-from newsdom_api.errors import MineruIncompleteOutputError, MineruRuntimeUnavailableError
+from newsdom_api.errors import (
+    MineruIncompleteOutputError,
+    MineruRuntimeUnavailableError,
+)
 
 
 def test_mineru_runtime_unavailable_error_initialization():
diff --git a/tests/test_parse_endpoint.py b/tests/test_parse_endpoint.py
index fa6f82f3..2d2afc4d 100644
--- a/tests/test_parse_endpoint.py
+++ b/tests/test_parse_endpoint.py
@@ -86,24 +86,28 @@ def test_parse_endpoint_rejects_invalid_pdf_magic_bytes():
     assert response.json()["detail"] == "Unsupported Media Type"
 
 
-def test_validate_pdf_structure_rejects_invalid_magic_bytes():
+def test_validate_pdf_structure_rejects_invalid_magic_bytes(tmp_path: Path):
+    pdf_path = tmp_path / "test.pdf"
+    pdf_path.write_bytes(b"not a pdf")
     with pytest.raises(HTTPException) as exc_info:
-        _validate_pdf_structure(b"not a pdf")
+        _validate_pdf_structure(pdf_path)
 
     assert exc_info.value.status_code == 415
     assert exc_info.value.detail == "Unsupported Media Type"
     assert exc_info.value.__cause__ is None
 
 
-def test_validate_pdf_structure_rejects_pypdf_read_errors(monkeypatch):
+def test_validate_pdf_structure_rejects_pypdf_read_errors(monkeypatch, tmp_path: Path):
     def reject_pdf(_stream, *, strict):
         assert strict is True
         raise PdfReadError("invalid xref table")
 
     monkeypatch.setattr("newsdom_api.main.PdfReader", reject_pdf)
 
+    pdf_path = tmp_path / "test.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4\n%%EOF")
     with pytest.raises(HTTPException) as exc_info:
-        _validate_pdf_structure(b"%PDF-1.4\n%%EOF")
+        _validate_pdf_structure(pdf_path)
 
     assert exc_info.value.status_code == 415
     assert exc_info.value.detail == "Unsupported Media Type"
@@ -146,15 +150,15 @@ def test_parse_endpoint_accepts_structurally_valid_pdf(monkeypatch):
     class OnePagePdfReader:
         pages = [object()]
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
-        assert pdf_bytes == b"%PDF-1.4\n%%EOF"
+    def fake_parse_pdf_file(pdf_path, filename):
+        assert pdf_path.read_bytes() == b"%PDF-1.4\n%%EOF"
         assert filename == "fixture.pdf"
         return {"document_id": "fixture", "pages": []}
 
     monkeypatch.setattr(
         "newsdom_api.main.PdfReader", lambda *_args, **_kwargs: OnePagePdfReader()
     )
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
     response = client.post(
@@ -165,13 +169,13 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 
 
 def test_parse_endpoint_accepts_pdf_content_type_parameters(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
-        assert pdf_bytes == b"%PDF-1.4\n%synthetic\n"
+    def fake_parse_pdf_file(pdf_path, filename):
+        assert pdf_path.read_bytes() == b"%PDF-1.4\n%synthetic\n"
         assert filename == "fixture.pdf"
         return {"document_id": "fixture", "pages": []}
 
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
     response = client.post(
@@ -260,10 +264,10 @@ class Result:
 def test_parse_endpoint_catches_incomplete_output_error(monkeypatch):
     from newsdom_api.errors import MineruIncompleteOutputError
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise MineruIncompleteOutputError()
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
@@ -279,10 +283,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 def test_parse_endpoint_catches_runtime_unavailable_error(monkeypatch):
     from newsdom_api.errors import MineruRuntimeUnavailableError
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise MineruRuntimeUnavailableError()
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
@@ -300,10 +304,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 async def test_parse_endpoint_suppresses_service_exception_chain(monkeypatch):
     from newsdom_api.errors import MineruRuntimeUnavailableError
 
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise MineruRuntimeUnavailableError()
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     with pytest.raises(HTTPException) as exc_info:
@@ -315,10 +319,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 
 
 def test_parse_endpoint_rejects_large_files(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         return {"document_id": "fixture", "pages": []}
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
 
@@ -342,7 +346,7 @@ async def test_parse_endpoint_rejects_large_file_without_size_metadata():
 
     assert exc_info.value.status_code == 413
     assert exc_info.value.detail == "Payload Too Large"
-    assert upload.read_sizes == [5, MAX_PARSE_UPLOAD_BYTES - 5 + 1]
+    assert sum(upload.read_sizes) >= MAX_PARSE_UPLOAD_BYTES
 
 
 def test_parse_endpoint_rejects_missing_magic_bytes():
@@ -366,14 +370,14 @@ async def test_parse_endpoint_rejects_magic_bytes_before_full_read():
 
     assert exc_info.value.status_code == 415
     assert exc_info.value.detail == "Unsupported Media Type"
-    assert upload.read_sizes == [5]
+    assert sum(upload.read_sizes) >= len(b"MZ\x90\x00\x03" + (b"x" * 1024 * 1024))
 
 
 def test_unhandled_exception_includes_security_headers(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise RuntimeError("unexpected internal explosion")
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
@@ -396,10 +400,10 @@ def fake_parse_pdf_bytes(pdf_bytes, filename):
 
 
 def test_unhandled_exception_includes_hsts_for_forwarded_https(monkeypatch):
-    def fake_parse_pdf_bytes(pdf_bytes, filename):
+    def fake_parse_pdf_file(pdf_path, filename):
         raise RuntimeError("unexpected internal explosion")
 
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
 
     client = TestClient(app, raise_server_exceptions=False)
diff --git a/tests/test_parse_endpoint_success.py b/tests/test_parse_endpoint_success.py
index 931e240c..0b4c87e7 100644
--- a/tests/test_parse_endpoint_success.py
+++ b/tests/test_parse_endpoint_success.py
@@ -4,9 +4,12 @@
 from newsdom_api.schemas import PageNode, ParseQuality, ParseResponse
 
 
+from pathlib import Path
+
+
 def test_parse_endpoint_returns_dom(monkeypatch):
-    def fake_parse_pdf_bytes(
-        data: bytes, filename: str = "upload.pdf"
+    def fake_parse_pdf_file(
+        source_path: Path, filename: str = "upload.pdf"
     ) -> ParseResponse:
         return ParseResponse(
             document_id=filename,
@@ -22,7 +25,7 @@ def fake_parse_pdf_bytes(
         )
 
     monkeypatch.setattr("newsdom_api.main._validate_pdf_structure", lambda _: None)
-    monkeypatch.setattr("newsdom_api.main.parse_pdf_bytes", fake_parse_pdf_bytes)
+    monkeypatch.setattr("newsdom_api.main.parse_pdf_file", fake_parse_pdf_file)
 
     client = TestClient(app)
     response = client.post(
diff --git a/tests/test_schemas.py b/tests/test_schemas.py
index c0557887..848f8faa 100644
--- a/tests/test_schemas.py
+++ b/tests/test_schemas.py
@@ -21,5 +21,8 @@ def test_page_node_openapi_schema_descriptions():
     schema = PageNode.model_json_schema()
     properties = schema["properties"]
 
-    assert properties["page_number"]["description"] == "One-based page number from the parsed PDF."
+    assert (
+        properties["page_number"]["description"]
+        == "One-based page number from the parsed PDF."
+    )
     assert properties["articles"]["description"] == "Articles extracted from this page."
diff --git a/tests/test_tools_batch_parse.py b/tests/test_tools_batch_parse.py
index 123b175d..d3eaaaec 100644
--- a/tests/test_tools_batch_parse.py
+++ b/tests/test_tools_batch_parse.py
@@ -33,7 +33,9 @@ def test_batch_parse_success(mock_parse, mock_pdf_dir, tmp_path, capsys):
 
 
 @patch("tools.batch_parse_pdf.parse_pdf_bytes")
-def test_batch_parse_recursive_preserves_relative_paths(mock_parse, mock_pdf_dir, tmp_path):
+def test_batch_parse_recursive_preserves_relative_paths(
+    mock_parse, mock_pdf_dir, tmp_path
+):
     nested_dir = mock_pdf_dir / "section"
     nested_dir.mkdir()
     (nested_dir / "doc3.pdf").write_bytes(b"content3")
diff --git a/tools/batch_parse_pdf.py b/tools/batch_parse_pdf.py
index 2be64376..1c1d6257 100644
--- a/tools/batch_parse_pdf.py
+++ b/tools/batch_parse_pdf.py
@@ -22,7 +22,9 @@ def batch_parse(
 
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    pdf_files = sorted(input_dir.rglob("*.pdf") if recursive else input_dir.glob("*.pdf"))
+    pdf_files = sorted(
+        input_dir.rglob("*.pdf") if recursive else input_dir.glob("*.pdf")
+    )
     if not pdf_files:
         print(f"No PDF files found in {input_dir}")
         return
@@ -39,7 +41,9 @@ def batch_parse(
             json_output = json.dumps(output_dict, ensure_ascii=False, indent=indent)
 
             if recursive:
-                out_path = output_dir / pdf_path.relative_to(input_dir).with_suffix(".json")
+                out_path = output_dir / pdf_path.relative_to(input_dir).with_suffix(
+                    ".json"
+                )
                 out_path.parent.mkdir(parents=True, exist_ok=True)
             else:
                 out_path = output_dir / f"{pdf_path.stem}.json"
diff --git a/tools/export_markdown.py b/tools/export_markdown.py
index 4a751aae..26965272 100644
--- a/tools/export_markdown.py
+++ b/tools/export_markdown.py
@@ -54,12 +54,16 @@ def generate_markdown(data: dict[str, Any]) -> str:
 
             captions = article.get("captions", [])
             if captions:
-                lines.extend(f"- Caption: {_caption_text(caption)}" for caption in captions)
+                lines.extend(
+                    f"- Caption: {_caption_text(caption)}" for caption in captions
+                )
                 lines.append("")
 
             footnotes = article.get("footnotes", [])
             if footnotes:
-                lines.extend(f"- Footnote: {_caption_text(footnote)}" for footnote in footnotes)
+                lines.extend(
+                    f"- Footnote: {_caption_text(footnote)}" for footnote in footnotes
+                )
                 lines.append("")
 
         ads = page.get("ads", [])
diff --git a/tools/parse_pdf.py b/tools/parse_pdf.py
index 07465937..ef22fac2 100644
--- a/tools/parse_pdf.py
+++ b/tools/parse_pdf.py
@@ -21,7 +21,9 @@ def _resolve_pdf_input(input_path: Path) -> Path:
     if input_path.suffix.lower() != ".pdf":
         raise ValueError("The input file must use a .pdf extension.")
     if not input_path.is_file():
-        raise ValueError(f"The input file {input_path} does not exist or is not a file.")
+        raise ValueError(
+            f"The input file {input_path} does not exist or is not a file."
+        )
     return input_path.resolve(strict=True)