diff --git a/README.md b/README.md index ab84623..0f5be6a 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,10 @@ skillspector scan https://github.com/user/my-skill # Scan a zip file skillspector scan ./my-skill.zip + +# Exclude files by glob (repeatable). Useful for binary assets that +# can trip up the regex scanner with false positives. +skillspector scan ./my-skill/ --exclude '*.pdf' --exclude 'assets/*' ``` ### Output Formats @@ -360,6 +364,7 @@ Options: -f, --format [terminal|json|markdown|sarif] Output format [default: terminal] -o, --output PATH Output file path --no-llm Skip LLM analysis (static only) + --exclude TEXT Glob (relative to scan root) to exclude. Repeatable. -V, --verbose Show detailed progress --help Show this message and exit ``` diff --git a/src/skillspector/cli.py b/src/skillspector/cli.py index 83e3224..733cd58 100644 --- a/src/skillspector/cli.py +++ b/src/skillspector/cli.py @@ -91,6 +91,7 @@ def _scan_state( format: FormatChoice, no_llm: bool, yara_rules_dir: str | None = None, + exclude_patterns: list[str] | None = None, ) -> dict[str, object]: """Build initial graph state from scan CLI args.""" state: dict[str, object] = { @@ -100,6 +101,8 @@ def _scan_state( } if yara_rules_dir is not None: state["yara_rules_dir"] = yara_rules_dir + if exclude_patterns: + state["exclude_patterns"] = list(exclude_patterns) return state @@ -171,6 +174,13 @@ def scan( help="Directory containing additional YARA rule files (.yar/.yara) to load alongside built-in rules.", ), ] = None, + exclude: Annotated[ + list[str] | None, + typer.Option( + "--exclude", + help="Glob pattern (relative to scan root) to exclude from the scan. Repeatable.", + ), + ] = None, verbose: Annotated[ bool, typer.Option( @@ -208,7 +218,13 @@ def scan( result = None try: yara_dir = str(yara_rules_dir.resolve()) if yara_rules_dir else None - state = _scan_state(input_path, format, no_llm, yara_rules_dir=yara_dir) + state = _scan_state( + input_path, + format, + no_llm, + yara_rules_dir=yara_dir, + exclude_patterns=exclude, + ) if verbose: set_level("DEBUG") console.print("[dim]Running scan...[/dim]") diff --git a/src/skillspector/nodes/build_context.py b/src/skillspector/nodes/build_context.py index 06355f3..f858e85 100644 --- a/src/skillspector/nodes/build_context.py +++ b/src/skillspector/nodes/build_context.py @@ -21,6 +21,7 @@ from __future__ import annotations +import fnmatch import re from pathlib import Path @@ -72,11 +73,14 @@ def _resolve_skill_dir(state: SkillspectorState) -> Path | None: return resolved -def _walk_skill_files(skill_dir: Path) -> list[str]: +def _walk_skill_files(skill_dir: Path, exclude_patterns: list[str] | None = None) -> list[str]: """Walk skill directory and return sorted relative path strings. Skips _SKIP_DIRS and hidden files except those starting with .claude. + Also skips paths matching any glob in ``exclude_patterns`` (fnmatch + semantics, matched against the path relative to ``skill_dir``). """ + patterns = exclude_patterns or [] paths: list[str] = [] for item in skill_dir.rglob("*"): if not item.is_file(): @@ -87,10 +91,15 @@ def _walk_skill_files(skill_dir: Path) -> list[str]: continue try: rel = item.relative_to(skill_dir) - paths.append(str(rel)) except ValueError: logger.debug("Skipping path (not under skill_dir): %s", item) continue + rel_str = rel.as_posix() + matched = next((p for p in patterns if fnmatch.fnmatch(rel_str, p)), None) + if matched is not None: + logger.debug("Excluded by --exclude %r: %s", matched, rel_str) + continue + paths.append(str(rel)) paths.sort() return paths @@ -231,7 +240,9 @@ def build_context(state: SkillspectorState) -> dict[str, object]: logger.debug("skill_path missing or not a directory; returning minimal context") return _minimal_update() - components = _walk_skill_files(skill_dir) + raw_patterns = state.get("exclude_patterns") or [] + exclude_patterns = [p for p in raw_patterns if isinstance(p, str) and p] + components = _walk_skill_files(skill_dir, exclude_patterns) file_cache = _read_file_cache(skill_dir, components) manifest = _parse_manifest(skill_dir) component_metadata, has_executable_scripts = _build_component_metadata(skill_dir, components) diff --git a/src/skillspector/state.py b/src/skillspector/state.py index 8a44727..fe565e6 100644 --- a/src/skillspector/state.py +++ b/src/skillspector/state.py @@ -71,6 +71,9 @@ class SkillspectorState(TypedDict, total=False): # Additional YARA rules directory (user-specified via --yara-rules-dir) yara_rules_dir: str | None + # Glob patterns (relative to scan root) to exclude from scanning (--exclude) + exclude_patterns: list[str] + # Node IDs that use an LLM. Each such node should check use_llm at the top and return # immediately (e.g. fallback / no-op) when False; no graph-level routing. diff --git a/tests/nodes/test_build_context.py b/tests/nodes/test_build_context.py index b905ecb..8ee3a8c 100644 --- a/tests/nodes/test_build_context.py +++ b/tests/nodes/test_build_context.py @@ -185,3 +185,61 @@ def test_build_context_skill_md_lowercase(tmp_path: Path) -> None: assert result["manifest"]["description"] == "d" assert "skill.md" in result["components"] assert "references/guide.md" in result["components"] + + +def test_build_context_exclude_glob_filters_components(tmp_path: Path) -> None: + """exclude_patterns drops matching files from components, file_cache, and metadata.""" + _make_skill_spec_dir(tmp_path) + (tmp_path / "assets" / "template-style.pdf").write_bytes(b"%PDF-1.4\nshell=True\n%%EOF") + (tmp_path / "notes.pdf").write_bytes(b"%PDF-1.4 trailing") + + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["*.pdf"], + } + result = build_context(state) + + components = result["components"] + assert "assets/template-style.pdf" not in components + assert "notes.pdf" not in components + assert "SKILL.md" in components + assert "scripts/run.py" in components + assert "assets/template-style.pdf" not in result["file_cache"] + assert all(m.get("path") != "assets/template-style.pdf" for m in result["component_metadata"]) + + +def test_build_context_exclude_directory_pattern(tmp_path: Path) -> None: + """Glob like 'assets/*' excludes all files under that directory.""" + _make_skill_spec_dir(tmp_path) + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["assets/*"], + } + result = build_context(state) + assert not any(p.startswith("assets/") for p in result["components"]) + assert "scripts/run.py" in result["components"] + + +def test_build_context_exclude_no_match_keeps_all(tmp_path: Path) -> None: + """Patterns that don't match anything leave the components list untouched.""" + _make_skill_spec_dir(tmp_path) + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["*.xyz"], + } + result = build_context(state) + assert "SKILL.md" in result["components"] + assert "assets/icon.png" in result["components"] + + +def test_build_context_exclude_everything_yields_empty(tmp_path: Path) -> None: + """Excluding every file is valid — empty components, empty file_cache.""" + _make_skill_spec_dir(tmp_path) + state: SkillspectorState = { + "skill_path": str(tmp_path), + "exclude_patterns": ["*"], + } + result = build_context(state) + assert result["components"] == [] + assert result["file_cache"] == {} + assert result["component_metadata"] == [] diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 60053f1..7f15e4e 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -15,6 +15,7 @@ """Tests for skillspector CLI (skillspector scan, --version).""" +import json from pathlib import Path from typer.testing import CliRunner @@ -24,6 +25,15 @@ runner = CliRunner() +# Minimal PDF-like bytes containing a TM1 trigger (shell=True). The static +# pattern scanner reads files with utf-8 + errors='replace', so binary assets +# can match regex patterns and produce spurious HIGH findings — which is +# exactly the false positive --exclude is meant to suppress. +_PDF_WITH_TM1 = ( + b"%PDF-1.4\n1 0 obj<>endobj\n% subprocess.run(cmd, shell=True)\n%%EOF\n" +) + + def test_cli_version() -> None: """--version prints version and exits 0.""" result = runner.invoke(app, ["--version"]) @@ -67,3 +77,81 @@ def test_cli_scan_nonexistent_exits_2() -> None: result = runner.invoke(app, ["scan", "/nonexistent/path/xyz"]) assert result.exit_code == 2 assert "Error" in result.output or "error" in result.output.lower() + + +def _make_pdf_fixture_skill(root: Path) -> Path: + """Create a skill dir whose only non-SKILL.md file is a PDF carrying TM1 bytes.""" + skill_dir = root / "skill" + (skill_dir / "assets").mkdir(parents=True) + (skill_dir / "SKILL.md").write_text("---\nname: exclude-test\n---\n# Skill\n", encoding="utf-8") + (skill_dir / "assets" / "template-style.pdf").write_bytes(_PDF_WITH_TM1) + return skill_dir + + +def test_cli_scan_exclude_drops_pdf_from_components_and_findings(tmp_path: Path) -> None: + """--exclude '*.pdf' skips the PDF: no findings raised against it, not in components.""" + skill_dir = _make_pdf_fixture_skill(tmp_path) + result = runner.invoke( + app, + [ + "scan", + str(skill_dir), + "--format", + "json", + "--no-llm", + "--exclude", + "*.pdf", + ], + ) + assert result.exit_code == 0, result.output + report = json.loads(result.output) + component_paths = [c.get("path") for c in report.get("components", [])] + assert "assets/template-style.pdf" not in component_paths + issues = report.get("issues", []) + assert all(i.get("location", {}).get("file") != "assets/template-style.pdf" for i in issues) + + +def test_cli_scan_exclude_repeatable(tmp_path: Path) -> None: + """Multiple --exclude flags compose; each pattern filters independently.""" + skill_dir = _make_pdf_fixture_skill(tmp_path) + (skill_dir / "notes.txt").write_text("plain text", encoding="utf-8") + result = runner.invoke( + app, + [ + "scan", + str(skill_dir), + "--format", + "json", + "--no-llm", + "--exclude", + "*.pdf", + "--exclude", + "*.txt", + ], + ) + assert result.exit_code == 0, result.output + report = json.loads(result.output) + component_paths = [c.get("path") for c in report.get("components", [])] + assert "assets/template-style.pdf" not in component_paths + assert "notes.txt" not in component_paths + + +def test_cli_scan_exclude_everything_succeeds(tmp_path: Path) -> None: + """Excluding every file is valid: scan succeeds with no findings.""" + skill_dir = _make_pdf_fixture_skill(tmp_path) + result = runner.invoke( + app, + [ + "scan", + str(skill_dir), + "--format", + "json", + "--no-llm", + "--exclude", + "*", + ], + ) + assert result.exit_code == 0, result.output + report = json.loads(result.output) + assert report.get("components", []) == [] + assert report.get("issues", []) == []