diff --git a/.jules/bolt.md b/.jules/bolt.md index 49b1916..387278c 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -47,3 +47,7 @@ ## 2026-07-02 - Remove `re.search` fast-path pre-check **Learning:** Python's `re.finditer` evaluates lazily by allocating a lightweight C-level `ScannerObject`. Using `re.search` as a fast-path pre-check before `re.finditer` is an anti-pattern that addresses a non-existent bottleneck and degrades performance for matched paths by evaluating the regex twice. **Action:** Do not use `re.search` before `re.finditer` for optimization purposes. + +## 2024-07-03 - Defer expensive pathlib operations +**Learning:** Path.is_dir() and Path.resolve() trigger costly stat() system calls. Executing them unconditionally for every file scanned in a hot path causes significant overhead. +**Action:** Defer these calls using lazy evaluation (initializing to None and computing only when needed) to avoid overhead for files that don't match any rules. diff --git a/scanner/cli/appguardrail.py b/scanner/cli/appguardrail.py index 9314ee4..ae89853 100644 --- a/scanner/cli/appguardrail.py +++ b/scanner/cli/appguardrail.py @@ -57,32 +57,21 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from appguardrail_core.external import build_external_scan_plan -from appguardrail_core.findings import ( - NON_BLOCKING_CONTEXTS, - is_deploy_blocking as core_is_deploy_blocking, - normalize_findings, -) -from appguardrail_core.language import ( - LANGUAGE_EXTENSIONS, - detect_language_axes, - detect_stack_profile, -) -from appguardrail_core.org_bundle import ( - OrgBundleError, - annotate_missing_pr_repositories, - gh_error_message, - gh_pr_list, - gh_repo_list, - load_json as load_org_json, - render_org_evidence, - write_bundle, -) -from appguardrail_core.reports import ( - REPORT_TYPE_LABELS, - ReportContext, - render_report, - supported_report_types, -) +from appguardrail_core.findings import NON_BLOCKING_CONTEXTS +from appguardrail_core.findings import \ + is_deploy_blocking as core_is_deploy_blocking +from appguardrail_core.findings import normalize_findings +from appguardrail_core.language import (LANGUAGE_EXTENSIONS, + detect_language_axes, + detect_stack_profile) +from appguardrail_core.org_bundle import (OrgBundleError, + annotate_missing_pr_repositories, + gh_error_message, gh_pr_list, + gh_repo_list) +from appguardrail_core.org_bundle import load_json as load_org_json +from appguardrail_core.org_bundle import render_org_evidence, write_bundle +from appguardrail_core.reports import (REPORT_TYPE_LABELS, ReportContext, + render_report, supported_report_types) from appguardrail_core.rules import build_rule_metadata __version__ = "0.1.1" @@ -725,7 +714,7 @@ r"(?i)\n]*target\s*=\s*[\"']_blank[\"'])(?![^>\n]*rel\s*=\s*[\"'][^\"']*(?:noopener|noreferrer))[^>\n]*href\s*=\s*[\"']https?://" ), "severity": "WARNING", - "message": "External target=_blank link is missing rel=\"noopener noreferrer\". Add rel attributes to prevent reverse tabnabbing. [OWASP A05:2021 - Security Misconfiguration]", + "message": 'External target=_blank link is missing rel="noopener noreferrer". Add rel attributes to prevent reverse tabnabbing. [OWASP A05:2021 - Security Misconfiguration]', "extensions": [".html", ".htm"], }, { @@ -873,6 +862,7 @@ }, ] + def _unquote_rule_scalar(value: str) -> str: """Return a simple YAML scalar value from the controlled rule files.""" value = value.strip() @@ -1451,10 +1441,7 @@ def cmd_scan(args): try: findings.extend(_run_semgrep_scan(scan_path, semgrep_config)) except RuntimeError as exc: - if ( - external_plan.semgrep.auto_selected - and not external_plan.semgrep.forced - ): + if external_plan.semgrep.auto_selected and not external_plan.semgrep.forced: print(f"⚠️ Skipping Semgrep auto integration: {exc}\n") else: print(f"❌ Error: {exc}", file=sys.stderr) @@ -1576,8 +1563,7 @@ def cmd_report(args): or "Application source, configuration, and security workflow evidence.", client_name=getattr(args, "client_name", None) or "n/a", reviewer=getattr(args, "reviewer", None) or "AppGuardrail", - engagement_type=getattr(args, "engagement_type", None) - or "Pre-launch review", + engagement_type=getattr(args, "engagement_type", None) or "Pre-launch review", based_on=getattr(args, "based_on", None) or "AppGuardrail findings JSON", ) report = render_report(report_type, findings, context) @@ -1614,11 +1600,13 @@ def cmd_org_bundle(args): prs, collection_warnings = gh_pr_list(owner, repos, per_repo_pr_limit) if prs_repository: prs = annotate_missing_pr_repositories(prs, prs_repository) - generated_at, report, evidence_payload, inventory, pr_summary = render_org_evidence( - repos, - prs, - active_repository_target=active_repository_target, - generated_at=getattr(args, "generated_at", None), + generated_at, report, evidence_payload, inventory, pr_summary = ( + render_org_evidence( + repos, + prs, + active_repository_target=active_repository_target, + generated_at=getattr(args, "generated_at", None), + ) ) manifest = write_bundle( bundle_dir, @@ -1643,7 +1631,9 @@ def cmd_org_bundle(args): ) return 1 except subprocess.CalledProcessError as exc: - print(f"❌ Error: GitHub command failed: {gh_error_message(exc)}", file=sys.stderr) + print( + f"❌ Error: GitHub command failed: {gh_error_message(exc)}", file=sys.stderr + ) print( "💡 Hint: Retry later or provide --repos-json and --prs-json.", file=sys.stderr, @@ -2298,20 +2288,22 @@ def _run_semgrep_scan(scan_path: Path, config: str = "auto"): config = config or "auto" try: - process = subprocess.run( # noqa: S603 - Semgrep path resolved with shutil.which - [ - semgrep, - "scan", - "--config", - config, - "--json", - str(scan_path), - ], - shell=False, - capture_output=True, - text=True, - check=False, - timeout=600, + process = ( + subprocess.run( # noqa: S603 - Semgrep path resolved with shutil.which + [ + semgrep, + "scan", + "--config", + config, + "--json", + str(scan_path), + ], + shell=False, + capture_output=True, + text=True, + check=False, + timeout=600, + ) ) except subprocess.TimeoutExpired as exc: raise RuntimeError("Semgrep scan timed out.") from exc @@ -2378,13 +2370,15 @@ def _run_zap_baseline(target_url: str): with tempfile.TemporaryDirectory() as tmpdir: report_path = Path(tmpdir) / "zap-baseline.json" try: - process = subprocess.run( # noqa: S603 - ZAP path resolved with shutil.which - [zap, "-t", target_url, "-J", str(report_path), "-I"], - shell=False, - capture_output=True, - text=True, - check=False, - timeout=900, + process = ( + subprocess.run( # noqa: S603 - ZAP path resolved with shutil.which + [zap, "-t", target_url, "-J", str(report_path), "-I"], + shell=False, + capture_output=True, + text=True, + check=False, + timeout=900, + ) ) except subprocess.TimeoutExpired as exc: raise RuntimeError("ZAP baseline scan timed out.") from exc @@ -2474,7 +2468,7 @@ def _scan_file(file_path: Path, base_path: Path): # ⚡ Bolt: Hoist expensive relative_to base_path resolution outside of loops. # Path.is_dir() and Path.resolve() invoke stat() system calls. Doing this inside # the finding iteration loop for every match was causing massive I/O overhead. - resolved_base_path = base_path if base_path.is_dir() else Path(".").resolve() + resolved_base_path = None # ⚡ Bolt: Optimize stat calls by using os.lstat instead of Path objects # Impact: Combines symlink, file type, and size checks into a single stat call @@ -2521,6 +2515,10 @@ def _scan_file(file_path: Path, base_path: Path): ) in applicable_rules: if include_paths or exclude_paths: if rel_path_for_filters is None: + if resolved_base_path is None: + resolved_base_path = ( + base_path if base_path.is_dir() else Path(".").resolve() + ) try: rel_path = file_path.relative_to(resolved_base_path) except ValueError: @@ -2540,6 +2538,10 @@ def _scan_file(file_path: Path, base_path: Path): for match in finditer(content): if rel_path_str is None: + if resolved_base_path is None: + resolved_base_path = ( + base_path if base_path.is_dir() else Path(".").resolve() + ) try: rel_path = file_path.relative_to(resolved_base_path) except ValueError: @@ -2817,9 +2819,7 @@ def add_report_arguments(parser): ) parser.add_argument("--app-name", default=None, help="Application name") parser.add_argument("--repository", default=None, help="Repository name") - parser.add_argument( - "--commit", default=None, help="Commit SHA or version" - ) + parser.add_argument("--commit", default=None, help="Commit SHA or version") parser.add_argument( "--generated-at", default=None, help="Report timestamp in ISO-8601 form" )