From 3f9f0013ae133dc8b020db666a16321faa999e90 Mon Sep 17 00:00:00 2001 From: seonghobae <8172694+seonghobae@users.noreply.github.com> Date: Fri, 3 Jul 2026 21:02:37 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20=EB=8B=A8=EC=9D=BC=20?= =?UTF-8?q?=EC=95=84=ED=8B=B0=ED=8C=A9=ED=8A=B8=20=EB=94=94=EB=A0=89?= =?UTF-8?q?=ED=86=A0=EB=A6=AC=20=ED=83=90=EC=83=89=20=EC=8B=9C=20=EB=A6=AC?= =?UTF-8?q?=EC=8A=A4=ED=8A=B8=20=ED=95=A0=EB=8B=B9=20=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_find_output_dir` 함수에서 `glob("*/ocr")`의 결과를 `list()`로 감싸는 대신 `next()`를 사용하여 첫 번째 항목만 가져오도록 최적화했습니다. 이를 통해 파일 시스템에서 발견된 모든 매치 항목을 반복하고 메모리에 할당하는 오버헤드를 제거했습니다. --- .jules/bolt.md | 4 ++++ src/newsdom_api/mineru_runner.py | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 2389b4f..8956cf3 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -40,3 +40,7 @@ ## 2026-06-30 - Regex over Generator `any` string loops **Learning:** Using `any(...)` with a generator comprehension in string evaluation paths allocates a new generator and adds Python-level loop overhead for every character. **Action:** Replace `any()` generators with a pre-compiled regex (`re.compile().search()`) to evaluate string patterns in C, achieving a ~7x speedup for text-heavy operations. + +## 2026-07-03 - Avoid List Evaluation on Glob Matches +**Learning:** Calling `list()` on a generator like `path.glob("*/ocr")` iterates over all matches and allocates a list, even if we only need the first element. In our profiling of file discovery routines, eagerly resolving a glob generator caused unnecessary directory traversals and list allocations. +**Action:** Use `next(path.glob(...))` with a `try/except StopIteration` block when looking for the first matching directory to avoid evaluating the entire glob iterator.
diff --git a/src/newsdom_api/mineru_runner.py b/src/newsdom_api/mineru_runner.py
index 2ad987b..74e862e 100644
--- a/src/newsdom_api/mineru_runner.py
+++ b/src/newsdom_api/mineru_runner.py
@@ -78,10 +78,11 @@ def _resolve_mineru_bin() -> str:
 
 def _find_output_dir(base_output_dir: Path) -> Path:
     """Locate the OCR output directory created by MinerU."""
-    candidates = list(base_output_dir.glob("*/ocr"))
-    if not candidates:
+    # ⚡ Bolt: take only the first glob match; the None default avoids raising from inside an except block (no chained StopIteration)
+    output_dir = next(base_output_dir.glob("*/ocr"), None)
+    if output_dir is None:
         raise FileNotFoundError("MinerU OCR output directory was not produced")
-    return candidates[0]
+    return output_dir
 
 
 def _execute_mineru(cmd: list[str]) -> subprocess.CompletedProcess[str]: