From 4c9860cea04f4c3d53e910577d1bee2c35f46309 Mon Sep 17 00:00:00 2001 From: seonghobae <8172694+seonghobae@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:11:38 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20=EC=A7=80=EC=97=B0=20?= =?UTF-8?q?=ED=8F=89=EA=B0=80=EB=A5=BC=20=ED=86=B5=ED=95=9C=20=EC=8A=A4?= =?UTF-8?q?=EC=BA=90=EB=84=88=20=ED=8C=8C=EC=9D=BC=20I/O=20=EC=B5=9C?= =?UTF-8?q?=EC=A0=81=ED=99=94=20(#153)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 성능 최적화: `_scan_file` 내 불필요한 `stat()` 시스템 콜 제거 `scanner/cli/appguardrail.py`의 `_scan_file` 함수 시작 부분에서 조건 없이 평가되던 `base_path.is_dir()` 및 `Path(".").resolve()` 호출을 지연 평가하도록 변경했습니다. 두 함수는 모두 동기식 `stat()` 시스템 콜을 발생시키며, 취약점이 없거나 필터링되는 대다수의 안전한 파일에 대해서는 이 값이 필요하지 않음에도 불구하고 막대한 I/O 오버헤드를 유발하고 있었습니다. 이를 `resolved_base_path = None`으로 초기화하고, 실제로 경로 변환(`relative_to`)이 필요한 시점에만 평가하도록 늦춤으로써 스캔 성능을 개선했습니다. --- .jules/bolt.md | 3 +++ scanner/cli/appguardrail.py | 27 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index c0c5641..eb880c5 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -43,3 +43,6 @@ ## 2026-07-01 - O(N*M) Line Counting Optimization **Learning:** In `scanner/cli/appguardrail.py`, the `_scan_file` loop calculates line numbers by calling `count_newlines("\n", 0, start_idx)` for *every* regex match. In files with many matches, this repeatedly scans the string from the beginning, resulting in O(N*M) performance (where N is file length and M is matches). This is a massive bottleneck. **Action:** Since `re.finditer` yields matches strictly in order, always calculate line numbers progressively using a tracking variable `current_line` and `current_pos`. Update `current_line += count_newlines("\n", current_pos, start_idx)`. This makes the line calculation strictly O(N), bringing up to a 15x speedup for files with many hits. 
+## 2026-07-02 - Deferring base_path.is_dir() and Path(".").resolve() +**Learning:** Evaluating `base_path.is_dir()` and `Path(".").resolve()` invokes expensive synchronous `stat()` system calls. Pre-calculating these at the beginning of a hot path (like file scanning) incurs pure overhead for all files, especially since the vast majority of files don't have vulnerabilities and never need the resolved base path. +**Action:** Initialize the cached result (e.g. `resolved_base_path`) to `None` and only evaluate `base_path.is_dir()` and `Path(".").resolve()` lazily, right before the value is actually needed (e.g. when constructing match findings), to avoid unnecessary I/O. diff --git a/scanner/cli/appguardrail.py b/scanner/cli/appguardrail.py index 13e479f..a55034e 100644 --- a/scanner/cli/appguardrail.py +++ b/scanner/cli/appguardrail.py @@ -1873,7 +1873,9 @@ def _run_bandit_scan(scan_path: Path): if process.returncode not in {0, 1}: detail = (process.stderr or process.stdout).strip().splitlines() - raise RuntimeError("Bandit scan failed" + (f": {detail[-1]}" if detail else ".")) + raise RuntimeError( + "Bandit scan failed" + (f": {detail[-1]}" if detail else ".") + ) try: report = json.loads(process.stdout or "{}") @@ -1972,9 +1974,7 @@ def _semgrep_findings(report: dict, base_path: Path): for item in report.get("results") or []: extra = item.get("extra") or {} start = item.get("start") or {} - path = _sanitize_terminal_output( - _trivy_target(item.get("path", ""), base_path) - ) + path = _sanitize_terminal_output(_trivy_target(item.get("path", ""), base_path)) check_id = item.get("check_id") or "semgrep" findings.append( _build_finding( @@ -2171,11 +2171,6 @@ def _scan_file(file_path: Path, base_path: Path): """Scan a single file and return a list of findings.""" findings = [] - # ⚡ Bolt: Hoist expensive relative_to base_path resolution outside of loops. - # Path.is_dir() and Path.resolve() invoke stat() system calls. 
Doing this inside - # the finding iteration loop for every match was causing massive I/O overhead. - resolved_base_path = base_path if base_path.is_dir() else Path(".").resolve() - # ⚡ Bolt: Optimize stat calls by using os.lstat instead of Path objects # Impact: Combines symlink, file type, and size checks into a single stat call try: @@ -2198,6 +2193,8 @@ def _scan_file(file_path: Path, base_path: Path): # ⚡ Bolt: Defer expensive Pathlib operations (like relative_to) and string # sanitization until a match is actually found. This avoids significant overhead # for the vast majority of files that have no vulnerabilities. + # We also defer `base_path.is_dir()` and `Path(".").resolve()` which invoke `stat()`. + resolved_base_path = None rel_path_str = None rel_path_for_filters = None build_finding = _build_finding @@ -2223,6 +2220,12 @@ def _scan_file(file_path: Path, base_path: Path): if include_paths or exclude_paths: if rel_path_for_filters is None: try: + if resolved_base_path is None: + resolved_base_path = ( + base_path + if base_path.is_dir() + else Path(".").resolve() + ) rel_path = file_path.relative_to(resolved_base_path) except ValueError: rel_path = ( @@ -2246,6 +2249,12 @@ def _scan_file(file_path: Path, base_path: Path): for match in finditer(content): if rel_path_str is None: try: + if resolved_base_path is None: + resolved_base_path = ( + base_path + if base_path.is_dir() + else Path(".").resolve() + ) rel_path = file_path.relative_to(resolved_base_path) except ValueError: rel_path = (