From 4f173545bbf9918579a97ae4d9e6b141298c0b80 Mon Sep 17 00:00:00 2001 From: Frenchie Date: Fri, 5 Jun 2026 19:13:43 -0700 Subject: [PATCH] Add --exclude glob flag to skill scan Adds a repeatable `--exclude PATTERN` option to `skillspector scan`. Patterns use fnmatch semantics against the path relative to the scan root and are applied during file discovery in build_context, so excluded files are absent from both the components list and any analyzer findings. The motivating case is binary assets (e.g. a marketing-template PDF in `assets/`) whose raw bytes happen to match static regex patterns like `shell=True` or `-rf /`, producing HIGH-severity false positives that block adoption in CI. With this flag, those files can be filtered without moving them out of the skill bundle. Excluded files are logged at DEBUG, surfaced via `--verbose`. Excluding everything is valid; the scan succeeds with no findings. --- README.md | 5 ++ src/skillspector/cli.py | 18 ++++- src/skillspector/nodes/build_context.py | 17 ++++- src/skillspector/state.py | 3 + tests/nodes/test_build_context.py | 58 ++++++++++++++++ tests/unit/test_cli.py | 88 +++++++++++++++++++++++++ 6 files changed, 185 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ab84623..0f5be6a 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,10 @@ skillspector scan https://github.com/user/my-skill # Scan a zip file skillspector scan ./my-skill.zip + +# Exclude files by glob (repeatable). Useful for binary assets that +# can trip up the regex scanner with false positives. +skillspector scan ./my-skill/ --exclude '*.pdf' --exclude 'assets/*' ``` ### Output Formats @@ -360,6 +364,7 @@ Options: -f, --format [terminal|json|markdown|sarif] Output format [default: terminal] -o, --output PATH Output file path --no-llm Skip LLM analysis (static only) + --exclude TEXT Glob (relative to scan root) to exclude. Repeatable. -V, --verbose Show detailed progress --help Show this message and exit ``` diff --git a/src/skillspector/cli.py b/src/skillspector/cli.py index 83e3224..733cd58 100644 --- a/src/skillspector/cli.py +++ b/src/skillspector/cli.py @@ -91,6 +91,7 @@ def _scan_state( format: FormatChoice, no_llm: bool, yara_rules_dir: str | None = None, + exclude_patterns: list[str] | None = None, ) -> dict[str, object]: """Build initial graph state from scan CLI args.""" state: dict[str, object] = { @@ -100,6 +101,8 @@ def _scan_state( } if yara_rules_dir is not None: state["yara_rules_dir"] = yara_rules_dir + if exclude_patterns: + state["exclude_patterns"] = list(exclude_patterns) return state @@ -171,6 +174,13 @@ def scan( help="Directory containing additional YARA rule files (.yar/.yara) to load alongside built-in rules.", ), ] = None, + exclude: Annotated[ + list[str] | None, + typer.Option( + "--exclude", + help="Glob pattern (relative to scan root) to exclude from the scan. Repeatable.", + ), + ] = None, verbose: Annotated[ bool, typer.Option( @@ -208,7 +218,13 @@ def scan( result = None try: yara_dir = str(yara_rules_dir.resolve()) if yara_rules_dir else None - state = _scan_state(input_path, format, no_llm, yara_rules_dir=yara_dir) + state = _scan_state( + input_path, + format, + no_llm, + yara_rules_dir=yara_dir, + exclude_patterns=exclude, + ) if verbose: set_level("DEBUG") console.print("[dim]Running scan...[/dim]") diff --git a/src/skillspector/nodes/build_context.py b/src/skillspector/nodes/build_context.py index 06355f3..f858e85 100644 --- a/src/skillspector/nodes/build_context.py +++ b/src/skillspector/nodes/build_context.py @@ -21,6 +21,7 @@ from __future__ import annotations +import fnmatch import re from pathlib import Path @@ -72,11 +73,14 @@ def _resolve_skill_dir(state: SkillspectorState) -> Path | None: return resolved -def _walk_skill_files(skill_dir: Path) -> list[str]: +def _walk_skill_files(skill_dir: Path, exclude_patterns: list[str] | None = None) -> list[str]: """Walk skill directory and return sorted relative path strings. Skips _SKIP_DIRS and hidden files except those starting with .claude. + Also skips paths matching any glob in ``exclude_patterns`` (fnmatch + semantics, matched against the path relative to ``skill_dir``). """ + patterns = exclude_patterns or [] paths: list[str] = [] for item in skill_dir.rglob("*"): if not item.is_file(): @@ -87,10 +91,15 @@ def _walk_skill_files(skill_dir: Path) -> list[str]: continue try: rel = item.relative_to(skill_dir) - paths.append(str(rel)) except ValueError: logger.debug("Skipping path (not under skill_dir): %s", item) continue + rel_str = rel.as_posix() + matched = next((p for p in patterns if fnmatch.fnmatch(rel_str, p)), None) + if matched is not None: + logger.debug("Excluded by --exclude %r: %s", matched, rel_str) + continue + paths.append(str(rel)) paths.sort() return paths @@ -231,7 +240,9 @@ def build_context(state: SkillspectorState) -> dict[str, object]: logger.debug("skill_path missing or not a directory; returning minimal context") return _minimal_update() - components = _walk_skill_files(skill_dir) + raw_patterns = state.get("exclude_patterns") or [] + exclude_patterns = [p for p in raw_patterns if isinstance(p, str) and p] + components = _walk_skill_files(skill_dir, exclude_patterns) file_cache = _read_file_cache(skill_dir, components) manifest = _parse_manifest(skill_dir) component_metadata, has_executable_scripts = _build_component_metadata(skill_dir, components) diff --git a/src/skillspector/state.py b/src/skillspector/state.py index 8a44727..fe565e6 100644 --- a/src/skillspector/state.py +++ b/src/skillspector/state.py @@ -71,6 +71,9 @@ class SkillspectorState(TypedDict, total=False): # Additional YARA rules directory (user-specified via --yara-rules-dir) yara_rules_dir: str | None + # Glob patterns (relative to scan root) to exclude from scanning (--exclude) + exclude_patterns: list[str] + # Node IDs that use an LLM. Each such node should check use_llm at the top and return # immediately (e.g. fallback / no-op) when False; no graph-level routing. diff --git a/tests/nodes/test_build_context.py b/tests/nodes/test_build_context.py index b905ecb..8ee3a8c 100644 --- a/tests/nodes/test_build_context.py +++ b/tests/nodes/test_build_context.py @@ -185,3 +185,61 @@ def test_build_context_skill_md_lowercase(tmp_path: Path) -> None: assert result["manifest"]["description"] == "d" assert "skill.md" in result["components"] assert "references/guide.md" in result["components"] + + +def test_build_context_exclude_glob_filters_components(tmp_path: Path) -> None: + """exclude_patterns drops matching files from components, file_cache, and metadata.""" + _make_skill_spec_dir(tmp_path) + (tmp_path / "assets" / "template-style.pdf").write_bytes(b"%PDF-1.4\nshell=True\n%%EOF") + (tmp_path / "notes.pdf").write_bytes(b"%PDF-1.4 trailing") + + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["*.pdf"], + } + result = build_context(state) + + components = result["components"] + assert "assets/template-style.pdf" not in components + assert "notes.pdf" not in components + assert "SKILL.md" in components + assert "scripts/run.py" in components + assert "assets/template-style.pdf" not in result["file_cache"] + assert all(m.get("path") != "assets/template-style.pdf" for m in result["component_metadata"]) + + +def test_build_context_exclude_directory_pattern(tmp_path: Path) -> None: + """Glob like 'assets/*' excludes all files under that directory.""" + _make_skill_spec_dir(tmp_path) + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["assets/*"], + } + result = build_context(state) + assert not any(p.startswith("assets/") for p in result["components"]) + assert "scripts/run.py" in result["components"] + + +def test_build_context_exclude_no_match_keeps_all(tmp_path: Path) -> None: + """Patterns that don't match anything leave the components list untouched.""" + _make_skill_spec_dir(tmp_path) + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["*.xyz"], + } + result = build_context(state) + assert "SKILL.md" in result["components"] + assert "assets/icon.png" in result["components"] + + +def test_build_context_exclude_everything_yields_empty(tmp_path: Path) -> None: + """Excluding every file is valid — empty components, empty file_cache.""" + _make_skill_spec_dir(tmp_path) + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["*"], + } + result = build_context(state) + assert result["components"] == [] + assert result["file_cache"] == {} + assert result["component_metadata"] == [] diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 60053f1..7f15e4e 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -15,6 +15,7 @@ """Tests for skillspector CLI (skillspector scan, --version).""" +import json from pathlib import Path from typer.testing import CliRunner @@ -24,6 +25,15 @@ runner = CliRunner() +# Minimal PDF-like bytes containing a TM1 trigger (shell=True). The static +# pattern scanner reads files with utf-8 + errors='replace', so binary assets +# can match regex patterns and produce spurious HIGH findings — which is +# exactly the false positive --exclude is meant to suppress. +_PDF_WITH_TM1 = ( + b"%PDF-1.4\n1 0 obj<>endobj\n% subprocess.run(cmd, shell=True)\n%%EOF\n" +) + + def test_cli_version() -> None: """--version prints version and exits 0.""" result = runner.invoke(app, ["--version"]) @@ -67,3 +77,81 @@ def test_cli_scan_nonexistent_exits_2() -> None: result = runner.invoke(app, ["scan", "/nonexistent/path/xyz"]) assert result.exit_code == 2 assert "Error" in result.output or "error" in result.output.lower() + + +def _make_pdf_fixture_skill(root: Path) -> Path: + """Create a skill dir whose only non-SKILL.md file is a PDF carrying TM1 bytes.""" + skill_dir = root / "skill" + (skill_dir / "assets").mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: exclude-test\n---\n# Skill\n", encoding="utf-8") + (skill_dir / "assets" / "template-style.pdf").write_bytes(_PDF_WITH_TM1) + return skill_dir + + +def test_cli_scan_exclude_drops_pdf_from_components_and_findings(tmp_path: Path) -> None: + """--exclude '*.pdf' skips the PDF: no findings raised against it, not in components.""" + skill_dir = _make_pdf_fixture_skill(tmp_path) + result = runner.invoke( + app, + [ + "scan", + str(skill_dir), + "--format", + "json", + "--no-llm", + "--exclude", + "*.pdf", + ], + ) + assert result.exit_code == 0, result.output + report = json.loads(result.output) + component_paths = [c.get("path") for c in report.get("components", [])] + assert "assets/template-style.pdf" not in component_paths + issues = report.get("issues", []) + assert all(i.get("location", {}).get("file") != "assets/template-style.pdf" for i in issues) + + +def test_cli_scan_exclude_repeatable(tmp_path: Path) -> None: + """Multiple --exclude flags compose; each pattern filters independently.""" + skill_dir = _make_pdf_fixture_skill(tmp_path) + (skill_dir / "notes.txt").write_text("plain text", encoding="utf-8") + result = runner.invoke( + app, + [ + "scan", + str(skill_dir), + "--format", + "json", + "--no-llm", + "--exclude", + "*.pdf", + "--exclude", + "*.txt", + ], + ) + assert result.exit_code == 0, result.output + report = json.loads(result.output) + component_paths = [c.get("path") for c in report.get("components", [])] + assert "assets/template-style.pdf" not in component_paths + assert "notes.txt" not in component_paths + + +def test_cli_scan_exclude_everything_succeeds(tmp_path: Path) -> None: + """Excluding every file is valid: scan succeeds with no findings.""" + skill_dir = _make_pdf_fixture_skill(tmp_path) + result = runner.invoke( + app, + [ + "scan", + str(skill_dir), + "--format", + "json", + "--no-llm", + "--exclude", + "*", + ], + ) + assert result.exit_code == 0, result.output + report = json.loads(result.output) + assert report.get("components", []) == [] + assert report.get("issues", []) == []