diff --git a/.jules/bolt.md b/.jules/bolt.md
index 2389b4f..8956cf3 100644
--- a/.jules/bolt.md
+++ b/.jules/bolt.md
@@ -40,3 +40,7 @@
 ## 2026-06-30 - Regex over Generator `any` string loops
 **Learning:** Using `any(...)` with a generator comprehension in string evaluation paths allocates a new generator and adds Python-level loop overhead for every character.
 **Action:** Replace `any()` generators with a pre-compiled regex (`re.compile().search()`) to evaluate string patterns in C, achieving a ~7x speedup for text-heavy operations.
+
+## 2024-07-01 - Avoid List Evaluation on Glob Matches
+**Learning:** Calling `list()` on a generator like `path.glob("*/ocr")` iterates over all matches and allocates a list, even if we only need the first element. In our profiling of file discovery routines, eagerly resolving a glob generator caused unnecessary directory traversals and list allocations.
+**Action:** Use `next(path.glob(...))` with a `try/except StopIteration` block when looking for the first matching directory to avoid evaluating the entire glob iterator.
diff --git a/src/newsdom_api/mineru_runner.py b/src/newsdom_api/mineru_runner.py
index 2ad987b..74e862e 100644
--- a/src/newsdom_api/mineru_runner.py
+++ b/src/newsdom_api/mineru_runner.py
@@ -78,10 +78,18 @@ def _resolve_mineru_bin() -> str:
 
 def _find_output_dir(base_output_dir: Path) -> Path:
-    """Locate the OCR output directory created by MinerU."""
-    candidates = list(base_output_dir.glob("*/ocr"))
-    if not candidates:
-        raise FileNotFoundError("MinerU OCR output directory was not produced")
-    return candidates[0]
+    """Locate the OCR output directory created by MinerU.
+
+    Returns the first path matching ``*/ocr`` under ``base_output_dir``.
+
+    Raises:
+        FileNotFoundError: If nothing under ``base_output_dir`` matches ``*/ocr``.
+    """
+    try:
+        # ⚡ Bolt: Use next() to avoid evaluating the entire glob iterator into a list
+        return next(base_output_dir.glob("*/ocr"))
+    except StopIteration:
+        # Suppress the internal StopIteration so callers see only the FileNotFoundError.
+        raise FileNotFoundError("MinerU OCR output directory was not produced") from None
 
 
 def _execute_mineru(cmd: list[str]) -> subprocess.CompletedProcess[str]: