From 4c9860cea04f4c3d53e910577d1bee2c35f46309 Mon Sep 17 00:00:00 2001
From: seonghobae <8172694+seonghobae@users.noreply.github.com>
Date: Wed, 1 Jul 2026 15:11:38 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20=EC=A7=80=EC=97=B0=20?=
 =?UTF-8?q?=ED=8F=89=EA=B0=80=EB=A5=BC=20=ED=86=B5=ED=95=9C=20=EC=8A=A4?=
 =?UTF-8?q?=EC=BA=90=EB=84=88=20=ED=8C=8C=EC=9D=BC=20I/O=20=EC=B5=9C?=
 =?UTF-8?q?=EC=A0=81=ED=99=94=20(#153)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 성능 최적화: `_scan_file` 내 불필요한 `stat()` 시스템 콜 제거

`scanner/cli/appguardrail.py`의 `_scan_file` 함수 시작 부분에서 호출될 때마다 평가되던 `base_path.is_dir()` 호출(및 `base_path`가 디렉터리가 아닐 때 실행되던 `Path(".").resolve()` 호출)을 지연 평가하도록 변경했습니다.
두 호출은 모두 동기식 파일 시스템 시스템 콜(`stat()` 등)을 발생시키는데, 취약점이 없거나 필터링되는 대다수의 안전한 파일에 대해서는 이 값이 필요하지 않으므로 해당 파일들에서는 순수한 I/O 오버헤드로 작용하고 있었습니다.
이를 `resolved_base_path = None`으로 초기화하고, 실제로 경로 변환(`relative_to`)이 필요한 시점에만 평가하도록 늦춤으로써 스캔 성능을 개선했습니다.
---
 .jules/bolt.md              |  3 +++
 scanner/cli/appguardrail.py | 27 ++++++++++++++++++---------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/.jules/bolt.md b/.jules/bolt.md
index c0c5641..eb880c5 100644
--- a/.jules/bolt.md
+++ b/.jules/bolt.md
@@ -43,3 +43,6 @@
 ## 2026-07-01 - O(N*M) Line Counting Optimization
 **Learning:** In `scanner/cli/appguardrail.py`, the `_scan_file` loop calculates line numbers by calling `count_newlines("\n", 0, start_idx)` for *every* regex match. In files with many matches, this repeatedly scans the string from the beginning, resulting in O(N*M) performance (where N is file length and M is matches). This is a massive bottleneck.
 **Action:** Since `re.finditer` yields matches strictly in order, always calculate line numbers progressively using a tracking variable `current_line` and `current_pos`. Update `current_line += count_newlines("\n", current_pos, start_idx)`. This makes the line calculation strictly O(N), bringing up to a 15x speedup for files with many hits.
+## 2026-07-02 - Deferring base_path.is_dir() and Path(".").resolve()
+**Learning:** Evaluating `base_path.is_dir()` and `Path(".").resolve()` invokes expensive synchronous `stat()` system calls. Pre-calculating these at the beginning of a hot path (like file scanning) incurs pure overhead for all files, especially since the vast majority of files don't have vulnerabilities and don't need this value.
+**Action:** Initialize the cached result (e.g. `resolved_base_path`) to `None` and evaluate `base_path.is_dir()` / `Path(".").resolve()` lazily, right before the value is actually needed (e.g. when constructing match findings), to avoid unnecessary I/O.
diff --git a/scanner/cli/appguardrail.py b/scanner/cli/appguardrail.py
index 13e479f..a55034e 100644
--- a/scanner/cli/appguardrail.py
+++ b/scanner/cli/appguardrail.py
@@ -1873,7 +1873,9 @@ def _run_bandit_scan(scan_path: Path):
 
     if process.returncode not in {0, 1}:
         detail = (process.stderr or process.stdout).strip().splitlines()
-        raise RuntimeError("Bandit scan failed" + (f": {detail[-1]}" if detail else "."))
+        raise RuntimeError(
+            "Bandit scan failed" + (f": {detail[-1]}" if detail else ".")
+        )
 
     try:
         report = json.loads(process.stdout or "{}")
@@ -1972,9 +1974,7 @@ def _semgrep_findings(report: dict, base_path: Path):
     for item in report.get("results") or []:
         extra = item.get("extra") or {}
         start = item.get("start") or {}
-        path = _sanitize_terminal_output(
-            _trivy_target(item.get("path", ""), base_path)
-        )
+        path = _sanitize_terminal_output(_trivy_target(item.get("path", ""), base_path))
         check_id = item.get("check_id") or "semgrep"
         findings.append(
             _build_finding(
@@ -2171,11 +2171,6 @@ def _scan_file(file_path: Path, base_path: Path):
     """Scan a single file and return a list of findings."""
     findings = []
 
-    # ⚡ Bolt: Hoist expensive relative_to base_path resolution outside of loops.
-    # Path.is_dir() and Path.resolve() invoke stat() system calls. Doing this inside
-    # the finding iteration loop for every match was causing massive I/O overhead.
-    resolved_base_path = base_path if base_path.is_dir() else Path(".").resolve()
-
     # ⚡ Bolt: Optimize stat calls by using os.lstat instead of Path objects
     # Impact: Combines symlink, file type, and size checks into a single stat call
     try:
@@ -2198,6 +2193,8 @@ def _scan_file(file_path: Path, base_path: Path):
     # ⚡ Bolt: Defer expensive Pathlib operations (like relative_to) and string
     # sanitization until a match is actually found. This avoids significant overhead
     # for the vast majority of files that have no vulnerabilities.
+    # We also defer `base_path.is_dir()` and `Path(".").resolve()` which invoke `stat()`.
+    resolved_base_path = None
     rel_path_str = None
     rel_path_for_filters = None
     build_finding = _build_finding
@@ -2223,6 +2220,12 @@ def _scan_file(file_path: Path, base_path: Path):
                 if include_paths or exclude_paths:
                     if rel_path_for_filters is None:
                         try:
+                            if resolved_base_path is None:
+                                resolved_base_path = (
+                                    base_path
+                                    if base_path.is_dir()
+                                    else Path(".").resolve()
+                                )
                             rel_path = file_path.relative_to(resolved_base_path)
                         except ValueError:
                             rel_path = (
@@ -2246,6 +2249,12 @@ def _scan_file(file_path: Path, base_path: Path):
                 for match in finditer(content):
                     if rel_path_str is None:
                         try:
+                            if resolved_base_path is None:
+                                resolved_base_path = (
+                                    base_path
+                                    if base_path.is_dir()
+                                    else Path(".").resolve()
+                                )
                             rel_path = file_path.relative_to(resolved_base_path)
                         except ValueError:
                             rel_path = (