diff --git a/README.md b/README.md index ab84623..4ef9c6c 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,6 @@ SkillSpector helps you answer: **"Is this skill safe to install?"** ## Documentation - **[Development guide](docs/DEVELOPMENT.md)** — Architecture, package layout, and how to extend the analyzer pipeline. -- **[OSS_RELEASE.md](OSS_RELEASE.md)** — How to produce a public-OSS branch from this repo. ## Features @@ -272,10 +271,7 @@ SkillSpector detects **64 vulnerability patterns** across 16 categories: | TP3 | Parameter Description Injection | MEDIUM | Injection patterns in parameter definitions (overrides, system tokens, malicious defaults) | | TP4 | Description-Behavior Mismatch | MEDIUM | Declared tool description does not match actual code behavior (LLM-powered) | -View all patterns: -```bash -skillspector patterns -``` +All detected patterns are listed in the tables above. ## Risk Scoring @@ -301,7 +297,7 @@ skillspector patterns ### Terminal Output ``` - SkillSpector Security Report v0.1.0 + SkillSpector Security Report v2.0.0 Skill: suspicious-skill Source: ./suspicious-skill/ diff --git a/src/skillspector/llm_analyzer_base.py b/src/skillspector/llm_analyzer_base.py index a678e4e..ea8ba43 100644 --- a/src/skillspector/llm_analyzer_base.py +++ b/src/skillspector/llm_analyzer_base.py @@ -28,11 +28,13 @@ from __future__ import annotations import asyncio +import os +import time from collections import defaultdict from dataclasses import dataclass, field from typing import Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from skillspector.llm_utils import get_chat_model from skillspector.logging_config import get_logger @@ -45,6 +47,83 @@ CHARS_PER_TOKEN = 4 CHUNK_OVERLAP_LINES = 50 +# Concurrency / rate-limit handling for hosted providers (e.g. NVIDIA build +# tier) that 429 under bursty parallelism. Override via env without code edits. +DEFAULT_MAX_CONCURRENCY = int(os.environ.get("SKILLSPECTOR_MAX_CONCURRENCY", "2")) +_RATE_LIMIT_MAX_RETRIES = int(os.environ.get("SKILLSPECTOR_RATELIMIT_RETRIES", "5")) +_RATE_LIMIT_BASE_DELAY = float(os.environ.get("SKILLSPECTOR_RATELIMIT_BASE_DELAY", "2.0")) + + +def _is_rate_limit_error(exc: BaseException) -> bool: + """Heuristically detect a 429 / rate-limit error across SDKs.""" + if getattr(exc, "status_code", None) == 429 or getattr(exc, "status", None) == 429: + return True + name = type(exc).__name__ + text = str(exc) + return "RateLimit" in name or "429" in text or "Too Many Requests" in text + + +def _is_timeout_error(exc: BaseException) -> bool: + """Detect a transient request-timeout error across SDKs.""" + name = type(exc).__name__ + text = str(exc).lower() + return "Timeout" in name or "timed out" in text or "timeout" in text + + +def _is_retryable_error(exc: BaseException) -> bool: + """Transient errors worth retrying: rate limits and timeouts. + + Deliberately excludes 4xx like context-overflow (400) — retrying those is + futile; the caller skips that batch instead. + """ + return _is_rate_limit_error(exc) or _is_timeout_error(exc) + + +async def _ainvoke_with_retry(coro_factory, label: str): + """Await ``coro_factory()``; on rate-limit errors retry with exponential backoff. + + ``coro_factory`` must return a *fresh* awaitable each call (coroutines are + single-use). Non-rate-limit errors propagate immediately. + """ + for attempt in range(_RATE_LIMIT_MAX_RETRIES + 1): + try: + return await coro_factory() + except Exception as exc: # noqa: BLE001 - re-raised below unless rate limit + if not _is_retryable_error(exc) or attempt == _RATE_LIMIT_MAX_RETRIES: + raise + delay = _RATE_LIMIT_BASE_DELAY * (2**attempt) + logger.warning( + "Rate limited on %s (attempt %d/%d); backing off %.1fs", + label, + attempt + 1, + _RATE_LIMIT_MAX_RETRIES, + delay, + ) + await asyncio.sleep(delay) + + +def _invoke_with_retry(call_factory, label: str): + """Synchronous twin of :func:`_ainvoke_with_retry`. + + ``call_factory`` must perform a *fresh* invocation each call and return its + result. Non-rate-limit errors propagate immediately. + """ + for attempt in range(_RATE_LIMIT_MAX_RETRIES + 1): + try: + return call_factory() + except Exception as exc: # noqa: BLE001 - re-raised below unless rate limit + if not _is_retryable_error(exc) or attempt == _RATE_LIMIT_MAX_RETRIES: + raise + delay = _RATE_LIMIT_BASE_DELAY * (2**attempt) + logger.warning( + "Rate limited on %s (attempt %d/%d); backing off %.1fs", + label, + attempt + 1, + _RATE_LIMIT_MAX_RETRIES, + delay, + ) + time.sleep(delay) + # --------------------------------------------------------------------------- # Default structured-output schemas (discovery mode) @@ -61,12 +140,25 @@ class LLMFinding(BaseModel): rule_id: str = Field(description="Identifier for the type of finding") message: str = Field(description="Short description of the finding") severity: Literal["LOW", "MEDIUM", "HIGH", "CRITICAL"] = Field(description="Severity level") - start_line: int = Field(ge=1, description="Starting line number") + # NOTE: numeric range constraints (ge/le) are intentionally expressed in the + # description rather than as Field bounds. Pydantic bounds emit JSON-schema + # minimum/maximum, which some structured-output endpoints (e.g. Anthropic's + # OpenAI-compatible tool schema) reject. Consuming code clamps/guards values. + start_line: int = Field(description="Starting line number (>= 1)") end_line: int | None = Field(default=None, description="Ending line number (optional)") - confidence: float = Field(ge=0.0, le=1.0, default=0.5, description="Confidence score") + confidence: float = Field(default=0.5, description="Confidence score between 0.0 and 1.0") explanation: str = Field(default="", description="Why this is a finding (2-3 sentences)") remediation: str = Field(default="", description="Actionable steps to fix the issue") + @field_validator("confidence") + @classmethod + def _check_confidence(cls, v: float) -> float: + # Runtime bound only — kept out of the JSON schema (see note above) so + # structured-output endpoints that reject minimum/maximum still work. + if not 0.0 <= v <= 1.0: + raise ValueError("confidence must be between 0.0 and 1.0") + return v + def to_finding(self, file: str) -> Finding: """Convert to a :class:`Finding` for the graph state.""" return Finding( @@ -352,12 +444,22 @@ def run_batches( estimate_tokens(prompt), len(batch.findings), ) - if self._structured_llm: - response = self._structured_llm.invoke(prompt) - else: - response = self._llm.invoke(prompt).content - logger.debug("LLM response for %s", batch.file_label) - parsed = self.parse_response(response, batch) + try: + if self._structured_llm: + response = _invoke_with_retry( + lambda p=prompt: self._structured_llm.invoke(p), batch.file_label + ) + else: + response = _invoke_with_retry( + lambda p=prompt: self._llm.invoke(p), batch.file_label + ).content + logger.debug("LLM response for %s", batch.file_label) + parsed = self.parse_response(response, batch) + except NotImplementedError: + raise # structural/subclass bug — never swallow + except Exception as exc: # noqa: BLE001 - isolate one batch's failure + logger.warning("Skipping batch %s after error: %s", batch.file_label, exc) + continue results.append((batch, parsed)) return results @@ -365,7 +467,7 @@ async def arun_batches( self, batches: list[Batch], *, - max_concurrency: int = 10, + max_concurrency: int = DEFAULT_MAX_CONCURRENCY, **kwargs: object, ) -> list[tuple[Batch, list]]: """Execute LLM calls for all *batches* concurrently. @@ -378,7 +480,7 @@ async def arun_batches( """ sem = asyncio.Semaphore(max_concurrency) - async def _process(batch: Batch) -> tuple[Batch, list]: + async def _process(batch: Batch) -> tuple[Batch, list] | None: async with sem: prompt = self.build_prompt(batch, **kwargs) logger.debug( @@ -387,14 +489,26 @@ async def _process(batch: Batch) -> tuple[Batch, list]: estimate_tokens(prompt), len(batch.findings), ) - if self._structured_llm: - response = await self._structured_llm.ainvoke(prompt) - else: - response = (await self._llm.ainvoke(prompt)).content - logger.debug("LLM response for %s", batch.file_label) - return (batch, self.parse_response(response, batch)) - - return list(await asyncio.gather(*[_process(b) for b in batches])) + try: + if self._structured_llm: + response = await _ainvoke_with_retry( + lambda: self._structured_llm.ainvoke(prompt), batch.file_label + ) + else: + invoked = await _ainvoke_with_retry( + lambda: self._llm.ainvoke(prompt), batch.file_label + ) + response = invoked.content + logger.debug("LLM response for %s", batch.file_label) + return (batch, self.parse_response(response, batch)) + except NotImplementedError: + raise # structural/subclass bug — never swallow + except Exception as exc: # noqa: BLE001 - isolate one batch's failure + logger.warning("Skipping batch %s after error: %s", batch.file_label, exc) + return None + + gathered = await asyncio.gather(*[_process(b) for b in batches]) + return [r for r in gathered if r is not None] # -- Convenience -------------------------------------------------------- diff --git a/src/skillspector/nodes/analyzers/behavioral_ast.py b/src/skillspector/nodes/analyzers/behavioral_ast.py index f0b7a18..1777a9d 100644 --- a/src/skillspector/nodes/analyzers/behavioral_ast.py +++ b/src/skillspector/nodes/analyzers/behavioral_ast.py @@ -24,7 +24,7 @@ from skillspector.state import AnalyzerNodeResponse, SkillspectorState from .common import get_context_from_lines, get_source_segment, resolve_call_name -from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding +from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit ANALYZER_ID = "behavioral_ast" logger = get_logger(__name__) @@ -216,8 +216,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse: if not path.endswith(".py"): continue content = file_cache.get(path) - if content is None or len(content) > MAX_FILE_BYTES: + if content is None: continue + raise_if_content_exceeds_limit(path, content) raw = _analyze_python(content, path) all_findings.extend(analyzer_finding_to_finding(af) for af in raw) diff --git a/src/skillspector/nodes/analyzers/behavioral_taint_tracking.py b/src/skillspector/nodes/analyzers/behavioral_taint_tracking.py index 90c7e24..6b973e6 100644 --- a/src/skillspector/nodes/analyzers/behavioral_taint_tracking.py +++ b/src/skillspector/nodes/analyzers/behavioral_taint_tracking.py @@ -36,7 +36,7 @@ resolve_call_name_typed, resolve_dotted_name, ) -from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding +from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit ANALYZER_ID = "behavioral_taint_tracking" logger = get_logger(__name__) @@ -400,8 +400,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse: if not path.endswith(".py"): continue content = file_cache.get(path) - if content is None or len(content) > MAX_FILE_BYTES: + if content is None: continue + raise_if_content_exceeds_limit(path, content) raw = _analyze_python(content, path) all_findings.extend(analyzer_finding_to_finding(af) for af in raw) diff --git a/src/skillspector/nodes/analyzers/mcp_tool_poisoning.py b/src/skillspector/nodes/analyzers/mcp_tool_poisoning.py index d32bc6e..8c38e6c 100644 --- a/src/skillspector/nodes/analyzers/mcp_tool_poisoning.py +++ b/src/skillspector/nodes/analyzers/mcp_tool_poisoning.py @@ -835,8 +835,11 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse: if isinstance(params, list): findings.extend(_check_tp3(params)) - # TP4: LLM-based check (only when use_llm is enabled) - if state.get("use_llm", False): + # TP4: LLM-based check (only when use_llm is enabled). Defaults to True to + # match every other LLM-using node (semantic_*, meta_analyzer); the CLI + # always sets this explicitly, so the default only affects programmatic + # callers that omit the key. + if state.get("use_llm", True): findings.extend(_check_tp4(state)) logger.info("%s: %d findings", ANALYZER_ID, len(findings)) diff --git a/src/skillspector/nodes/analyzers/static_runner.py b/src/skillspector/nodes/analyzers/static_runner.py index dd6518f..ae4a438 100644 --- a/src/skillspector/nodes/analyzers/static_runner.py +++ b/src/skillspector/nodes/analyzers/static_runner.py @@ -46,7 +46,17 @@ ".rs": "rust", } -MAX_FILE_BYTES = 1_000_000 +MAX_FILE_BYTES = 50 * 1024 * 1024 + + +def raise_if_content_exceeds_limit(path: str, content: str) -> None: + """Fail closed if analyzer input exceeds the per-file analysis limit.""" + size_bytes = len(content.encode("utf-8")) + if size_bytes > MAX_FILE_BYTES: + raise ValueError( + "Scan aborted: file size exceeds the per-file analysis limit " + f"({MAX_FILE_BYTES} bytes): {path} ({size_bytes} bytes)" + ) def _infer_file_type(path: str) -> str: @@ -107,14 +117,7 @@ def run_static_patterns( if content is None: logger.debug("Skipping %s: no content in file_cache", path) continue - if len(content) > MAX_FILE_BYTES: - logger.debug( - "Skipping %s: size %d exceeds MAX_FILE_BYTES (%d)", - path, - len(content), - MAX_FILE_BYTES, - ) - continue + raise_if_content_exceeds_limit(path, content) file_type = _infer_file_type(path) for module in pattern_modules: raw = module.analyze(content=content, file_path=path, file_type=file_type) diff --git a/src/skillspector/nodes/analyzers/static_yara.py b/src/skillspector/nodes/analyzers/static_yara.py index f600f6b..bc2f7f4 100644 --- a/src/skillspector/nodes/analyzers/static_yara.py +++ b/src/skillspector/nodes/analyzers/static_yara.py @@ -33,7 +33,7 @@ from .common import get_context, get_line_number from .pattern_defaults import PatternCategory -from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding +from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit ANALYZER_ID = "static_yara" logger = get_logger(__name__) @@ -243,9 +243,7 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse: content = file_cache.get(path) if content is None: continue - if len(content) > MAX_FILE_BYTES: - logger.debug("%s: skipping %s (exceeds size limit)", ANALYZER_ID, path) - continue + raise_if_content_exceeds_limit(path, content) for af in _match_file(rules, content, path): findings.append(analyzer_finding_to_finding(af)) diff --git a/src/skillspector/nodes/build_context.py b/src/skillspector/nodes/build_context.py index 06355f3..9f7bca2 100644 --- a/src/skillspector/nodes/build_context.py +++ b/src/skillspector/nodes/build_context.py @@ -60,6 +60,11 @@ {".py", ".sh", ".bash", ".zsh", ".js", ".ts", ".rb", ".go", ".rs", ".pl"} ) +# Per-file read cap. Files larger than this fail the scan rather than being +# skipped, because skipping would let malicious content hide in an oversized +# file. Aligned with static_runner.MAX_FILE_BYTES. +_MAX_READ_BYTES = 50 * 1024 * 1024 + def _resolve_skill_dir(state: SkillspectorState) -> Path | None: """Resolve state skill_path to an existing directory Path, or None if missing/invalid.""" @@ -103,10 +108,26 @@ def _infer_file_type(path: str) -> str: def _count_lines(file_path: Path) -> int: - """Count lines in a file, handling binary and errors gracefully.""" + """Count lines in a file, handling binary and errors gracefully. + + Reads the file in fixed-size binary chunks and counts newline bytes so + peak memory stays bounded regardless of file size *or* line length — a + multi-gigabyte file with no newlines must not be materialized in memory + (consistent with the _MAX_READ_BYTES cap in _read_file_cache). The count + matches ``len(text.splitlines())`` for the common ``\\n`` / ``\\r\\n`` cases: + number of newline bytes, plus one for a final line without a trailing + newline. + """ try: - content = file_path.read_text(encoding="utf-8", errors="replace") - return len(content.splitlines()) + newline_count = 0 + last_byte = b"" + with file_path.open("rb") as fh: + while chunk := fh.read(65536): + newline_count += chunk.count(b"\n") + last_byte = chunk[-1:] + if last_byte and last_byte != b"\n": + newline_count += 1 # final line with no trailing newline + return newline_count except OSError: logger.debug("Could not read file for line count: %s", file_path) return 0 @@ -161,6 +182,33 @@ def _read_file_cache(skill_dir: Path, components: list[str]) -> dict[str, str]: return file_cache +def _validate_file_sizes(skill_dir: Path, components: list[str]) -> None: + """Fail the scan if any discovered file exceeds the per-file read cap.""" + oversized: list[tuple[str, int]] = [] + for path in components: + full = skill_dir / path + if not full.is_file(): + continue + try: + size_bytes = full.stat().st_size + except OSError: + logger.debug("Could not stat file for size validation: %s", path) + continue + if size_bytes > _MAX_READ_BYTES: + oversized.append((path, size_bytes)) + + if not oversized: + return + + details = ", ".join(f"{path} ({size_bytes} bytes)" for path, size_bytes in oversized[:5]) + if len(oversized) > 5: + details += f", and {len(oversized) - 5} more" + raise ValueError( + "Scan aborted: file size exceeds the per-file analysis limit " + f"({_MAX_READ_BYTES} bytes): {details}" + ) + + def _parse_manifest(skill_dir: Path) -> dict[str, object]: """Parse SKILL.md or skill.md YAML frontmatter into a manifest dict. @@ -172,7 +220,14 @@ def _parse_manifest(skill_dir: Path) -> dict[str, object]: if not path.is_file(): continue try: - content = path.read_text(encoding="utf-8", errors="replace") + # Only the leading YAML frontmatter is needed, and it sits at the + # top of the file. Read a bounded prefix so an oversized SKILL.md + # (e.g. a huge body, or a malicious multi-GB file) is never + # materialized whole — same memory bound as _read_file_cache. If a + # skill's frontmatter somehow exceeds this, the closing delimiter + # falls outside the prefix and parsing degrades to {} below. + with path.open("r", encoding="utf-8", errors="replace") as fh: + content = fh.read(_MAX_READ_BYTES) except OSError: logger.debug("Could not read manifest file: %s", name) return {} @@ -200,6 +255,14 @@ def _parse_manifest(skill_dir: Path) -> dict[str, object]: manifest["permissions"] = ( [str(p) for p in permissions] if isinstance(permissions, list) else [] ) + # Preserve parameter definitions as dicts so the MCP tool-poisoning + # analyzer (TP1/TP2/TP3 parameter checks) can inspect them. Without + # this, those checks never fire on real scans because the manifest + # carried no `parameters` key. + parameters = data.get("parameters", []) + manifest["parameters"] = ( + [p for p in parameters if isinstance(p, dict)] if isinstance(parameters, list) else [] + ) return manifest return {} @@ -232,6 +295,7 @@ def build_context(state: SkillspectorState) -> dict[str, object]: return _minimal_update() components = _walk_skill_files(skill_dir) + _validate_file_sizes(skill_dir, components) file_cache = _read_file_cache(skill_dir, components) manifest = _parse_manifest(skill_dir) component_metadata, has_executable_scripts = _build_component_metadata(skill_dir, components) diff --git a/src/skillspector/nodes/meta_analyzer.py b/src/skillspector/nodes/meta_analyzer.py index 8f2b541..9bd6021 100644 --- a/src/skillspector/nodes/meta_analyzer.py +++ b/src/skillspector/nodes/meta_analyzer.py @@ -63,7 +63,10 @@ class MetaAnalyzerFinding(BaseModel): description="The end line number from the finding's Location, if available.", ) is_vulnerability: bool = Field(description="Whether this is a true vulnerability") - confidence: float = Field(ge=0.0, le=1.0, description="Confidence score between 0.0 and 1.0") + # ge/le omitted deliberately: they emit JSON-schema minimum/maximum, which + # Anthropic's structured-output tool schema rejects. Range is conveyed in the + # description; apply_filter coerces via float() and gates on confidence < 0.6. + confidence: float = Field(description="Confidence score between 0.0 and 1.0") intent: Literal["malicious", "negligent", "benign"] = Field( description="Likely intent behind the finding" ) @@ -73,6 +76,15 @@ class MetaAnalyzerFinding(BaseModel): explanation: str = Field(default="", description="Why this is dangerous (2-3 sentences)") remediation: str = Field(default="", description="How to fix the issue (actionable steps)") + @field_validator("confidence") + @classmethod + def _check_confidence(cls, v: float) -> float: + # Runtime bound only — kept out of the JSON schema (see note above) so + # Anthropic's structured-output tool schema accepts it. + if not 0.0 <= v <= 1.0: + raise ValueError("confidence must be between 0.0 and 1.0") + return v + class OverallAssessment(BaseModel): """Overall risk assessment for the analyzed file.""" @@ -219,6 +231,30 @@ def _fallback_filtered(findings: list[Finding]) -> list[Finding]: ] +def _finding_is_covered_by_batch(finding: Finding, batch: Batch) -> bool: + """Return whether a successful LLM batch covered this finding's location.""" + if finding.file != batch.file_path: + return False + if batch.end_line is None: + return True + return batch.start_line <= finding.start_line <= batch.end_line + + +def _partition_findings_by_batch_coverage( + findings: list[Finding], + successful_batches: list[Batch], +) -> tuple[list[Finding], list[Finding]]: + """Split findings into LLM-covered and uncovered groups.""" + analyzed: list[Finding] = [] + unanalyzed: list[Finding] = [] + for finding in findings: + if any(_finding_is_covered_by_batch(finding, batch) for batch in successful_batches): + analyzed.append(finding) + else: + unanalyzed.append(finding) + return analyzed, unanalyzed + + # --------------------------------------------------------------------------- # LLMMetaAnalyzer (filter / enrich mode) # --------------------------------------------------------------------------- @@ -392,12 +428,29 @@ def meta_analyzer(state: SkillspectorState) -> MetaAnalyzerResponse: ) batch_results = asyncio.run(analyzer.arun_batches(batches, metadata_text=metadata_text)) - filtered = analyzer.apply_filter(findings, batch_results) + + # A batch may have been skipped (timeout / context-overflow / exhausted + # retries). Only findings whose file and line range were covered by a + # successful batch are eligible for filtering; findings in failed chunks + # are kept unfiltered rather than silently dropped. + successful_batches = [batch for batch, _ in batch_results] + analyzed, unanalyzed = _partition_findings_by_batch_coverage(findings, successful_batches) + + filtered = analyzer.apply_filter(analyzed, batch_results) + if unanalyzed: + logger.warning( + "Meta-analyzer: %d findings across %d unanalyzed files kept unfiltered", + len(unanalyzed), + len({f.file for f in unanalyzed}), + ) + filtered = filtered + _fallback_filtered(unanalyzed) logger.debug( - "LLM filtering done: %d findings -> %d after filter", + "LLM filtering done: %d findings -> %d after filter (%d analyzed, %d kept unfiltered)", len(findings), len(filtered), + len(analyzed), + len(unanalyzed), ) return {"filtered_findings": filtered} except ValueError: diff --git a/src/skillspector/state.py b/src/skillspector/state.py index 8a44727..8a21bab 100644 --- a/src/skillspector/state.py +++ b/src/skillspector/state.py @@ -58,7 +58,9 @@ class SkillspectorState(TypedDict, total=False): output_format: str report_body: str - # LLM: when False, LLM-based nodes return immediately without calling the LLM (see LLM_BASED_NODE_IDS) + # LLM: when False, LLM-based nodes (meta_analyzer, mcp_tool_poisoning's TP4, + # and the semantic_* analyzers) return immediately without calling the LLM. + # Each such node checks use_llm itself; there is no graph-level routing. use_llm: bool # Risk: report node sets these from risk_score @@ -72,11 +74,6 @@ class SkillspectorState(TypedDict, total=False): yara_rules_dir: str | None -# Node IDs that use an LLM. Each such node should check use_llm at the top and return -# immediately (e.g. fallback / no-op) when False; no graph-level routing. -LLM_BASED_NODE_IDS: tuple[str, ...] = ("meta_analyzer", "mcp_tool_poisoning") - - class AnalyzerNodeResponse(TypedDict): """Strict analyzer update payload for graph state.""" diff --git a/tests/nodes/analyzers/test_behavioral_taint_tracking.py b/tests/nodes/analyzers/test_behavioral_taint_tracking.py index 77cc211..bc7d287 100644 --- a/tests/nodes/analyzers/test_behavioral_taint_tracking.py +++ b/tests/nodes/analyzers/test_behavioral_taint_tracking.py @@ -17,6 +17,8 @@ from __future__ import annotations +import pytest + from skillspector.nodes.analyzers import behavioral_taint_tracking @@ -258,13 +260,13 @@ def test_missing_file_in_cache(self): result = behavioral_taint_tracking.node(state) assert result["findings"] == [] - def test_oversized_file_skipped(self): + def test_oversized_file_fails_scan(self): from skillspector.nodes.analyzers.static_runner import MAX_FILE_BYTES big = 'import os\nexec(os.environ.get("KEY"))\n' + ("x = 1\n" * MAX_FILE_BYTES) state = {"components": ["big.py"], "file_cache": {"big.py": big}} - result = behavioral_taint_tracking.node(state) - assert result["findings"] == [] + with pytest.raises(ValueError, match="big\\.py"): + behavioral_taint_tracking.node(state) def test_multiple_files_produce_findings(self): state = { diff --git a/tests/nodes/analyzers/test_static_patterns.py b/tests/nodes/analyzers/test_static_patterns.py index d501222..fa6c6d7 100644 --- a/tests/nodes/analyzers/test_static_patterns.py +++ b/tests/nodes/analyzers/test_static_patterns.py @@ -17,6 +17,8 @@ from __future__ import annotations +import pytest + from skillspector.nodes.analyzers import ( static_patterns_data_exfiltration as data_exfiltration_module, ) @@ -142,3 +144,22 @@ def test_empty_components_returns_empty(self): state = {"components": [], "file_cache": {}} findings = static_runner.run_static_patterns(state, [prompt_injection_module]) assert findings == [] + + +class TestStaticRunnerSizeLimit: + def test_content_limit_is_enforced_in_bytes(self, monkeypatch: pytest.MonkeyPatch) -> None: + """Multibyte text over the byte cap must fail even when char count is under it.""" + monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4) + + with pytest.raises(ValueError, match=r"multi\.txt .*6 bytes"): + static_runner.raise_if_content_exceeds_limit("multi.txt", "ééé") + + def test_run_static_patterns_fails_on_oversized_content( + self, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setattr(static_runner, "MAX_FILE_BYTES", 4) + state = {"components": ["multi.md"], "file_cache": {"multi.md": "ééé"}} + + with pytest.raises(ValueError, match=r"multi\.md"): + static_runner.run_static_patterns(state, [prompt_injection_module]) diff --git a/tests/nodes/analyzers/test_static_yara.py b/tests/nodes/analyzers/test_static_yara.py index 89fc4ec..aa39447 100644 --- a/tests/nodes/analyzers/test_static_yara.py +++ b/tests/nodes/analyzers/test_static_yara.py @@ -247,13 +247,13 @@ def test_missing_file_in_cache(self, tmp_path): state = {"components": ["ghost.txt"], "file_cache": {}, "yara_rules_dir": str(tmp_path)} assert static_yara.node(state)["findings"] == [] - def test_oversized_file_skipped(self, tmp_path): + def test_oversized_file_fails_scan(self, tmp_path): _write_rule( tmp_path, "rule_big", category="malware", severity="HIGH", strings={"a": "BIGMARKER"} ) content = "BIGMARKER" + ("x" * MAX_FILE_BYTES) - findings = _run(content, "big.txt", str(tmp_path)) - assert findings == [] + with pytest.raises(ValueError, match="big\\.txt"): + _run(content, "big.txt", str(tmp_path)) def test_nonexistent_rules_dir_returns_empty(self): state = { diff --git a/tests/nodes/test_build_context.py b/tests/nodes/test_build_context.py index b905ecb..b2bc261 100644 --- a/tests/nodes/test_build_context.py +++ b/tests/nodes/test_build_context.py @@ -22,6 +22,8 @@ from pathlib import Path +import pytest + from skillspector.constants import MODEL_CONFIG from skillspector.nodes.build_context import build_context from skillspector.state import SkillspectorState @@ -70,6 +72,7 @@ def test_build_context_real_directory_with_skill_md(tmp_path: Path) -> None: "description": "For tests", "triggers": ["a", "b"], "permissions": ["read"], + "parameters": [], } assert result["ast_cache"] == {} assert result["previous_manifest"] is None @@ -185,3 +188,89 @@ def test_build_context_skill_md_lowercase(tmp_path: Path) -> None: assert result["manifest"]["description"] == "d" assert "skill.md" in result["components"] assert "references/guide.md" in result["components"] + + +def test_build_context_parses_parameters_from_frontmatter(tmp_path: Path) -> None: + """`parameters` frontmatter is preserved as dicts so MCP TP checks can reach it. + + Regression guard: without this, the mcp_tool_poisoning parameter checks + (TP3 and parameter-scoped TP1/TP2) never fire on real scans because the + manifest carried no `parameters` key. + """ + (tmp_path / "SKILL.md").write_text( + "---\n" + "name: reader\n" + "description: reads data\n" + "parameters:\n" + " - name: path\n" + " description: file path to read\n" + " - not-a-dict\n" # non-dict entries are dropped + "---\n", + encoding="utf-8", + ) + state: SkillspectorState = {"skill_path": str(tmp_path)} + result = build_context(state) + assert result["manifest"]["parameters"] == [ + {"name": "path", "description": "file path to read"} + ] + + +def test_build_context_fails_on_oversized_file_before_reading_content( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Oversized files fail the scan before content is materialized. + + Regression guard for the security behavior: a file above the analysis + limit must not be silently cached as empty, because that makes the scan + report safe without analyzing the bytes. The spy also fails if the + oversized file is read via ``read_text`` before the size gate. + """ + from skillspector.nodes.build_context import _MAX_READ_BYTES + + (tmp_path / "SKILL.md").write_text("---\nname: big\n---\n", encoding="utf-8") + big = tmp_path / "huge.txt" + big.write_text("A" * (_MAX_READ_BYTES + 1), encoding="utf-8") + + original_read_text = Path.read_text + + def guarded_read_text(self: Path, *args: object, **kwargs: object) -> str: + if self.name == "huge.txt": + raise AssertionError(f"oversized file read fully into memory: {self}") + return original_read_text(self, *args, **kwargs) # type: ignore[arg-type] + + monkeypatch.setattr(Path, "read_text", guarded_read_text) + + state: SkillspectorState = {"skill_path": str(tmp_path)} + + with pytest.raises(ValueError, match="huge\\.txt"): + build_context(state) + + +def test_build_context_oversized_manifest_fails_before_reading_whole( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """An oversized SKILL.md fails the scan before full-content reads. + + The spy fails if SKILL.md is ever read via unbounded ``read_text`` before + the size gate rejects it. + """ + from skillspector.nodes.build_context import _MAX_READ_BYTES + + (tmp_path / "SKILL.md").write_text( + "---\nname: bigmanifest\ndescription: d\n---\n" + "A" * _MAX_READ_BYTES, + encoding="utf-8", + ) + + original_read_text = Path.read_text + + def guarded_read_text(self: Path, *args: object, **kwargs: object) -> str: + if self.name in ("SKILL.md", "skill.md"): + raise AssertionError(f"oversized manifest read fully into memory: {self}") + return original_read_text(self, *args, **kwargs) # type: ignore[arg-type] + + monkeypatch.setattr(Path, "read_text", guarded_read_text) + + state: SkillspectorState = {"skill_path": str(tmp_path)} + + with pytest.raises(ValueError, match="SKILL\\.md"): + build_context(state) diff --git a/tests/nodes/test_llm_analyzer_base.py b/tests/nodes/test_llm_analyzer_base.py index 9899a7f..ba9123f 100644 --- a/tests/nodes/test_llm_analyzer_base.py +++ b/tests/nodes/test_llm_analyzer_base.py @@ -36,7 +36,9 @@ LLMMetaAnalyzer, MetaAnalyzerFinding, MetaAnalyzerResult, + _fallback_filtered, _format_findings_for_prompt, + _partition_findings_by_batch_coverage, ) # --------------------------------------------------------------------------- @@ -1084,6 +1086,47 @@ def test_end_line_used_when_provided(self) -> None: assert result[0].end_line == 10 assert result[0].explanation == "Long block is dangerous" + @patch(MOCK_PATCH_TARGET, _mock_get_chat_model) + def test_failed_chunk_findings_are_not_treated_as_analyzed(self) -> None: + """A successful chunk must not imply whole-file LLM coverage.""" + analyzer = LLMMetaAnalyzer(model=self.MODEL) + covered = self._make_finding("big.py", "E1", line=50) + uncovered = self._make_finding("big.py", "E2", line=5000) + findings = [covered, uncovered] + successful_batch = Batch( + file_path="big.py", + content="chunk", + start_line=1, + end_line=100, + findings=[covered], + ) + llm_items = [ + { + "pattern_id": "E1", + "start_line": 50, + "is_vulnerability": True, + "confidence": 0.9, + "explanation": "Covered finding confirmed", + "remediation": "Fix covered", + "_file": "big.py", + } + ] + + analyzed, unanalyzed = _partition_findings_by_batch_coverage( + findings, + [successful_batch], + ) + filtered = analyzer.apply_filter(analyzed, [(successful_batch, llm_items)]) + filtered = filtered + _fallback_filtered(unanalyzed) + + assert [f.start_line for f in analyzed] == [50] + assert [f.start_line for f in unanalyzed] == [5000] + assert {f.start_line for f in filtered} == {50, 5000} + assert next(f for f in filtered if f.start_line == 50).explanation == ( + "Covered finding confirmed" + ) + assert next(f for f in filtered if f.start_line == 5000).message == "original" + # --------------------------------------------------------------------------- # LLMMetaAnalyzer.run_batches (mocked LLM) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 60053f1..3010f9f 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -20,6 +20,7 @@ from typer.testing import CliRunner from skillspector.cli import app +from skillspector.nodes.build_context import _MAX_READ_BYTES runner = CliRunner() @@ -67,3 +68,18 @@ def test_cli_scan_nonexistent_exits_2() -> None: result = runner.invoke(app, ["scan", "/nonexistent/path/xyz"]) assert result.exit_code == 2 assert "Error" in result.output or "error" in result.output.lower() + + +def test_cli_scan_oversized_file_exits_2(tmp_path: Path) -> None: + """scan fails instead of silently skipping files above the analysis limit.""" + (tmp_path / "SKILL.md").write_text("---\nname: oversized\n---\n# Skill", encoding="utf-8") + huge = tmp_path / "huge.txt" + with huge.open("wb") as fh: + fh.seek(_MAX_READ_BYTES) + fh.write(b"x") + + result = runner.invoke(app, ["scan", str(tmp_path), "--format", "json", "--no-llm"]) + + assert result.exit_code == 2 + assert "file size exceeds" in result.output + assert "huge.txt" in result.output