ContextualWisdomLab · seonghobae · Jul 3, 2026
diff --git a/.jules/bolt.md b/.jules/bolt.md
@@ -47,3 +47,7 @@
 ## 2026-07-02 - Remove `re.search` fast-path pre-check
 **Learning:** Python's `re.finditer` evaluates lazily by allocating a lightweight C-level `ScannerObject`. Using `re.search` as a fast-path pre-check before `re.finditer` is an anti-pattern that addresses a non-existent bottleneck and degrades performance for matched paths by evaluating the regex twice.
 **Action:** Do not use `re.search` before `re.finditer` for optimization purposes.
+
+## 2026-07-03 - Defer expensive pathlib operations
+**Learning:** `Path.is_dir()` and `Path.resolve()` trigger costly `stat()` system calls. Executing them unconditionally for every file scanned in a hot path causes significant overhead.
+**Action:** Defer these calls using lazy evaluation (initializing to None and computing only when needed) to avoid overhead for files that don't match any rules.
diff --git a/scanner/cli/appguardrail.py b/scanner/cli/appguardrail.py
@@ -57,32 +57,21 @@
     sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 
 from appguardrail_core.external import build_external_scan_plan
-from appguardrail_core.findings import (
-    NON_BLOCKING_CONTEXTS,
-    is_deploy_blocking as core_is_deploy_blocking,
-    normalize_findings,
-)
-from appguardrail_core.language import (
-    LANGUAGE_EXTENSIONS,
-    detect_language_axes,
-    detect_stack_profile,
-)
-from appguardrail_core.org_bundle import (
-    OrgBundleError,
-    annotate_missing_pr_repositories,
-    gh_error_message,
-    gh_pr_list,
-    gh_repo_list,
-    load_json as load_org_json,
-    render_org_evidence,
-    write_bundle,
-)
-from appguardrail_core.reports import (
-    REPORT_TYPE_LABELS,
-    ReportContext,
-    render_report,
-    supported_report_types,
-)
+from appguardrail_core.findings import NON_BLOCKING_CONTEXTS
+from appguardrail_core.findings import \
+    is_deploy_blocking as core_is_deploy_blocking
+from appguardrail_core.findings import normalize_findings
+from appguardrail_core.language import (LANGUAGE_EXTENSIONS,
+                                        detect_language_axes,
+                                        detect_stack_profile)
+from appguardrail_core.org_bundle import (OrgBundleError,
+                                          annotate_missing_pr_repositories,
+                                          gh_error_message, gh_pr_list,
+                                          gh_repo_list)
+from appguardrail_core.org_bundle import load_json as load_org_json
+from appguardrail_core.org_bundle import render_org_evidence, write_bundle
+from appguardrail_core.reports import (REPORT_TYPE_LABELS, ReportContext,
+                                       render_report, supported_report_types)
 from appguardrail_core.rules import build_rule_metadata
 
 __version__ = "0.1.1"
@@ -725,7 +714,7 @@
             r"(?i)<a\b(?=[^>\n]*target\s*=\s*[\"']_blank[\"'])(?![^>\n]*rel\s*=\s*[\"'][^\"']*(?:noopener|noreferrer))[^>\n]*href\s*=\s*[\"']https?://"
         ),
         "severity": "WARNING",
-        "message": "External target=_blank link is missing rel=\"noopener noreferrer\". Add rel attributes to prevent reverse tabnabbing. [OWASP A05:2021 - Security Misconfiguration]",
+        "message": 'External target=_blank link is missing rel="noopener noreferrer". Add rel attributes to prevent reverse tabnabbing. [OWASP A05:2021 - Security Misconfiguration]',
         "extensions": [".html", ".htm"],
     },
     {
@@ -873,6 +862,7 @@
     },
 ]
 
+
 def _unquote_rule_scalar(value: str) -> str:
     """Return a simple YAML scalar value from the controlled rule files."""
     value = value.strip()
@@ -1451,10 +1441,7 @@ def cmd_scan(args):
         try:
             findings.extend(_run_semgrep_scan(scan_path, semgrep_config))
         except RuntimeError as exc:
-            if (
-                external_plan.semgrep.auto_selected
-                and not external_plan.semgrep.forced
-            ):
+            if external_plan.semgrep.auto_selected and not external_plan.semgrep.forced:
                 print(f"⚠️  Skipping Semgrep auto integration: {exc}\n")
             else:
                 print(f"❌ Error: {exc}", file=sys.stderr)
@@ -1576,8 +1563,7 @@ def cmd_report(args):
         or "Application source, configuration, and security workflow evidence.",
         client_name=getattr(args, "client_name", None) or "n/a",
         reviewer=getattr(args, "reviewer", None) or "AppGuardrail",
-        engagement_type=getattr(args, "engagement_type", None)
-        or "Pre-launch review",
+        engagement_type=getattr(args, "engagement_type", None) or "Pre-launch review",
         based_on=getattr(args, "based_on", None) or "AppGuardrail findings JSON",
     )
     report = render_report(report_type, findings, context)
@@ -1614,11 +1600,13 @@ def cmd_org_bundle(args):
             prs, collection_warnings = gh_pr_list(owner, repos, per_repo_pr_limit)
         if prs_repository:
             prs = annotate_missing_pr_repositories(prs, prs_repository)
-        generated_at, report, evidence_payload, inventory, pr_summary = render_org_evidence(
-            repos,
-            prs,
-            active_repository_target=active_repository_target,
-            generated_at=getattr(args, "generated_at", None),
+        generated_at, report, evidence_payload, inventory, pr_summary = (
+            render_org_evidence(
+                repos,
+                prs,
+                active_repository_target=active_repository_target,
+                generated_at=getattr(args, "generated_at", None),
+            )
         )
         manifest = write_bundle(
             bundle_dir,
@@ -1643,7 +1631,9 @@ def cmd_org_bundle(args):
         )
         return 1
     except subprocess.CalledProcessError as exc:
-        print(f"❌ Error: GitHub command failed: {gh_error_message(exc)}", file=sys.stderr)
+        print(
+            f"❌ Error: GitHub command failed: {gh_error_message(exc)}", file=sys.stderr
+        )
         print(
             "💡 Hint: Retry later or provide --repos-json and --prs-json.",
             file=sys.stderr,
@@ -2298,20 +2288,22 @@ def _run_semgrep_scan(scan_path: Path, config: str = "auto"):
 
     config = config or "auto"
     try:
-        process = subprocess.run(  # noqa: S603 - Semgrep path resolved with shutil.which
-            [
-                semgrep,
-                "scan",
-                "--config",
-                config,
-                "--json",
-                str(scan_path),
-            ],
-            shell=False,
-            capture_output=True,
-            text=True,
-            check=False,
-            timeout=600,
+        process = (
+            subprocess.run(  # noqa: S603 - Semgrep path resolved with shutil.which
+                [
+                    semgrep,
+                    "scan",
+                    "--config",
+                    config,
+                    "--json",
+                    str(scan_path),
+                ],
+                shell=False,
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=600,
+            )
         )
     except subprocess.TimeoutExpired as exc:
         raise RuntimeError("Semgrep scan timed out.") from exc
@@ -2378,13 +2370,15 @@ def _run_zap_baseline(target_url: str):
     with tempfile.TemporaryDirectory() as tmpdir:
         report_path = Path(tmpdir) / "zap-baseline.json"
         try:
-            process = subprocess.run(  # noqa: S603 - ZAP path resolved with shutil.which
-                [zap, "-t", target_url, "-J", str(report_path), "-I"],
-                shell=False,
-                capture_output=True,
-                text=True,
-                check=False,
-                timeout=900,
+            process = (
+                subprocess.run(  # noqa: S603 - ZAP path resolved with shutil.which
+                    [zap, "-t", target_url, "-J", str(report_path), "-I"],
+                    shell=False,
+                    capture_output=True,
+                    text=True,
+                    check=False,
+                    timeout=900,
+                )
             )
         except subprocess.TimeoutExpired as exc:
             raise RuntimeError("ZAP baseline scan timed out.") from exc
@@ -2474,7 +2468,7 @@ def _scan_file(file_path: Path, base_path: Path):
     # ⚡ Bolt: Hoist expensive relative_to base_path resolution outside of loops.
     # Path.is_dir() and Path.resolve() invoke stat() system calls. Doing this inside
     # the finding iteration loop for every match was causing massive I/O overhead.
-    resolved_base_path = base_path if base_path.is_dir() else Path(".").resolve()
+    resolved_base_path = None
 
     # ⚡ Bolt: Optimize stat calls by using os.lstat instead of Path objects
     # Impact: Combines symlink, file type, and size checks into a single stat call
@@ -2521,6 +2515,10 @@ def _scan_file(file_path: Path, base_path: Path):
             ) in applicable_rules:
                 if include_paths or exclude_paths:
                     if rel_path_for_filters is None:
+                        if resolved_base_path is None:
+                            resolved_base_path = (
+                                base_path if base_path.is_dir() else Path(".").resolve()
+                            )
                         try:
                             rel_path = file_path.relative_to(resolved_base_path)
                         except ValueError:
@@ -2540,6 +2538,10 @@ def _scan_file(file_path: Path, base_path: Path):
 
                 for match in finditer(content):
                     if rel_path_str is None:
+                        if resolved_base_path is None:
+                            resolved_base_path = (
+                                base_path if base_path.is_dir() else Path(".").resolve()
+                            )
                         try:
                             rel_path = file_path.relative_to(resolved_base_path)
                         except ValueError:
@@ -2817,9 +2819,7 @@ def add_report_arguments(parser):
         )
         parser.add_argument("--app-name", default=None, help="Application name")
         parser.add_argument("--repository", default=None, help="Repository name")
-        parser.add_argument(
-            "--commit", default=None, help="Commit SHA or version"
-        )
+        parser.add_argument("--commit", default=None, help="Commit SHA or version")
         parser.add_argument(
             "--generated-at", default=None, help="Report timestamp in ISO-8601 form"
         )