diff --git a/.jules/bolt.md b/.jules/bolt.md index c0c5641..eb880c5 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -43,3 +43,6 @@ ## 2026-07-01 - O(N*M) Line Counting Optimization **Learning:** In `scanner/cli/appguardrail.py`, the `_scan_file` loop calculates line numbers by calling `count_newlines("\n", 0, start_idx)` for *every* regex match. In files with many matches, this repeatedly scans the string from the beginning, resulting in O(N*M) performance (where N is file length and M is matches). This is a massive bottleneck. **Action:** Since `re.finditer` yields matches strictly in order, always calculate line numbers progressively using a tracking variable `current_line` and `current_pos`. Update `current_line += count_newlines("\n", current_pos, start_idx)`. This makes the line calculation strictly O(N), bringing up to a 15x speedup for files with many hits. +## 2026-07-02 - Deferring base_path.is_dir() and Path(".").resolve() +**Learning:** Evaluating `base_path.is_dir()` and `Path(".").resolve()` invokes expensive synchronous `stat()` system calls. Pre-calculating the result at the beginning of a hot path (like file scanning) incurs pure overhead for every file, especially since the vast majority of files don't have vulnerabilities and never need this value. +**Action:** Always initialize the cached result (`resolved_base_path`) to `None` and only evaluate `base_path.is_dir()` and `Path(".").resolve()` lazily, right before the value is actually needed (e.g. when constructing match findings), to avoid unnecessary I/O. 
diff --git a/scanner/cli/appguardrail.py b/scanner/cli/appguardrail.py index 13e479f..a55034e 100644 --- a/scanner/cli/appguardrail.py +++ b/scanner/cli/appguardrail.py @@ -1873,7 +1873,9 @@ def _run_bandit_scan(scan_path: Path): if process.returncode not in {0, 1}: detail = (process.stderr or process.stdout).strip().splitlines() - raise RuntimeError("Bandit scan failed" + (f": {detail[-1]}" if detail else ".")) + raise RuntimeError( + "Bandit scan failed" + (f": {detail[-1]}" if detail else ".") + ) try: report = json.loads(process.stdout or "{}") @@ -1972,9 +1974,7 @@ def _semgrep_findings(report: dict, base_path: Path): for item in report.get("results") or []: extra = item.get("extra") or {} start = item.get("start") or {} - path = _sanitize_terminal_output( - _trivy_target(item.get("path", ""), base_path) - ) + path = _sanitize_terminal_output(_trivy_target(item.get("path", ""), base_path)) check_id = item.get("check_id") or "semgrep" findings.append( _build_finding( @@ -2171,11 +2171,6 @@ def _scan_file(file_path: Path, base_path: Path): """Scan a single file and return a list of findings.""" findings = [] - # ⚡ Bolt: Hoist expensive relative_to base_path resolution outside of loops. - # Path.is_dir() and Path.resolve() invoke stat() system calls. Doing this inside - # the finding iteration loop for every match was causing massive I/O overhead. - resolved_base_path = base_path if base_path.is_dir() else Path(".").resolve() - # ⚡ Bolt: Optimize stat calls by using os.lstat instead of Path objects # Impact: Combines symlink, file type, and size checks into a single stat call try: @@ -2198,6 +2193,8 @@ def _scan_file(file_path: Path, base_path: Path): # ⚡ Bolt: Defer expensive Pathlib operations (like relative_to) and string # sanitization until a match is actually found. This avoids significant overhead # for the vast majority of files that have no vulnerabilities. + # We also defer `base_path.is_dir()` and `Path(".").resolve()` which invoke `stat()`. 
+ resolved_base_path = None rel_path_str = None rel_path_for_filters = None build_finding = _build_finding @@ -2223,6 +2220,12 @@ def _scan_file(file_path: Path, base_path: Path): if include_paths or exclude_paths: if rel_path_for_filters is None: try: + if resolved_base_path is None: + resolved_base_path = ( + base_path + if base_path.is_dir() + else Path(".").resolve() + ) rel_path = file_path.relative_to(resolved_base_path) except ValueError: rel_path = ( @@ -2246,6 +2249,12 @@ def _scan_file(file_path: Path, base_path: Path): for match in finditer(content): if rel_path_str is None: try: + if resolved_base_path is None: + resolved_base_path = ( + base_path + if base_path.is_dir() + else Path(".").resolve() + ) rel_path = file_path.relative_to(resolved_base_path) except ValueError: rel_path = (