Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ SkillSpector helps you answer: **"Is this skill safe to install?"**
## Documentation

- **[Development guide](docs/DEVELOPMENT.md)** — Architecture, package layout, and how to extend the analyzer pipeline.
- **[OSS_RELEASE.md](OSS_RELEASE.md)** — How to produce a public-OSS branch from this repo.

## Features

Expand Down Expand Up @@ -272,10 +271,7 @@ SkillSpector detects **64 vulnerability patterns** across 16 categories:
| TP3 | Parameter Description Injection | MEDIUM | Injection patterns in parameter definitions (overrides, system tokens, malicious defaults) |
| TP4 | Description-Behavior Mismatch | MEDIUM | Declared tool description does not match actual code behavior (LLM-powered) |

View all patterns:
```bash
skillspector patterns
```
All detected patterns are listed in the tables above.

## Risk Scoring

Expand All @@ -301,7 +297,7 @@ skillspector patterns
### Terminal Output

```
SkillSpector Security Report v0.1.0
SkillSpector Security Report v2.0.0

Skill: suspicious-skill
Source: ./suspicious-skill/
Expand Down
152 changes: 133 additions & 19 deletions src/skillspector/llm_analyzer_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@
from __future__ import annotations

import asyncio
import os
import time
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Literal

from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, field_validator

from skillspector.llm_utils import get_chat_model
from skillspector.logging_config import get_logger
Expand All @@ -45,6 +47,83 @@
CHARS_PER_TOKEN = 4
CHUNK_OVERLAP_LINES = 50

# Concurrency / rate-limit handling for hosted providers (e.g. NVIDIA build
# tier) that 429 under bursty parallelism. Override via env without code edits.
DEFAULT_MAX_CONCURRENCY = int(os.environ.get("SKILLSPECTOR_MAX_CONCURRENCY", "2"))
_RATE_LIMIT_MAX_RETRIES = int(os.environ.get("SKILLSPECTOR_RATELIMIT_RETRIES", "5"))
_RATE_LIMIT_BASE_DELAY = float(os.environ.get("SKILLSPECTOR_RATELIMIT_BASE_DELAY", "2.0"))


def _is_rate_limit_error(exc: BaseException) -> bool:
"""Heuristically detect a 429 / rate-limit error across SDKs."""
if getattr(exc, "status_code", None) == 429 or getattr(exc, "status", None) == 429:
return True
name = type(exc).__name__
text = str(exc)
return "RateLimit" in name or "429" in text or "Too Many Requests" in text


def _is_timeout_error(exc: BaseException) -> bool:
"""Detect a transient request-timeout error across SDKs."""
name = type(exc).__name__
text = str(exc).lower()
return "Timeout" in name or "timed out" in text or "timeout" in text


def _is_retryable_error(exc: BaseException) -> bool:
"""Transient errors worth retrying: rate limits and timeouts.

Deliberately excludes 4xx like context-overflow (400) — retrying those is
futile; the caller skips that batch instead.
"""
return _is_rate_limit_error(exc) or _is_timeout_error(exc)


async def _ainvoke_with_retry(coro_factory, label: str):
"""Await ``coro_factory()``; on rate-limit errors retry with exponential backoff.

``coro_factory`` must return a *fresh* awaitable each call (coroutines are
single-use). Non-rate-limit errors propagate immediately.
"""
for attempt in range(_RATE_LIMIT_MAX_RETRIES + 1):
try:
return await coro_factory()
except Exception as exc: # noqa: BLE001 - re-raised below unless rate limit
if not _is_retryable_error(exc) or attempt == _RATE_LIMIT_MAX_RETRIES:
raise
delay = _RATE_LIMIT_BASE_DELAY * (2**attempt)
logger.warning(
"Rate limited on %s (attempt %d/%d); backing off %.1fs",
label,
attempt + 1,
_RATE_LIMIT_MAX_RETRIES,
delay,
)
await asyncio.sleep(delay)


def _invoke_with_retry(call_factory, label: str):
"""Synchronous twin of :func:`_ainvoke_with_retry`.

``call_factory`` must perform a *fresh* invocation each call and return its
result. Non-rate-limit errors propagate immediately.
"""
for attempt in range(_RATE_LIMIT_MAX_RETRIES + 1):
try:
return call_factory()
except Exception as exc: # noqa: BLE001 - re-raised below unless rate limit
if not _is_retryable_error(exc) or attempt == _RATE_LIMIT_MAX_RETRIES:
raise
delay = _RATE_LIMIT_BASE_DELAY * (2**attempt)
logger.warning(
"Rate limited on %s (attempt %d/%d); backing off %.1fs",
label,
attempt + 1,
_RATE_LIMIT_MAX_RETRIES,
delay,
)
time.sleep(delay)


# ---------------------------------------------------------------------------
# Default structured-output schemas (discovery mode)
Expand All @@ -61,12 +140,25 @@ class LLMFinding(BaseModel):
rule_id: str = Field(description="Identifier for the type of finding")
message: str = Field(description="Short description of the finding")
severity: Literal["LOW", "MEDIUM", "HIGH", "CRITICAL"] = Field(description="Severity level")
start_line: int = Field(ge=1, description="Starting line number")
# NOTE: numeric range constraints (ge/le) are intentionally expressed in the
# description rather than as Field bounds. Pydantic bounds emit JSON-schema
# minimum/maximum, which some structured-output endpoints (e.g. Anthropic's
# OpenAI-compatible tool schema) reject. Consuming code clamps/guards values.
start_line: int = Field(description="Starting line number (>= 1)")
end_line: int | None = Field(default=None, description="Ending line number (optional)")
confidence: float = Field(ge=0.0, le=1.0, default=0.5, description="Confidence score")
confidence: float = Field(default=0.5, description="Confidence score between 0.0 and 1.0")
explanation: str = Field(default="", description="Why this is a finding (2-3 sentences)")
remediation: str = Field(default="", description="Actionable steps to fix the issue")

@field_validator("confidence")
@classmethod
def _check_confidence(cls, v: float) -> float:
# Runtime bound only — kept out of the JSON schema (see note above) so
# structured-output endpoints that reject minimum/maximum still work.
if not 0.0 <= v <= 1.0:
raise ValueError("confidence must be between 0.0 and 1.0")
return v

def to_finding(self, file: str) -> Finding:
"""Convert to a :class:`Finding` for the graph state."""
return Finding(
Expand Down Expand Up @@ -352,20 +444,30 @@ def run_batches(
estimate_tokens(prompt),
len(batch.findings),
)
if self._structured_llm:
response = self._structured_llm.invoke(prompt)
else:
response = self._llm.invoke(prompt).content
logger.debug("LLM response for %s", batch.file_label)
parsed = self.parse_response(response, batch)
try:
if self._structured_llm:
response = _invoke_with_retry(
lambda p=prompt: self._structured_llm.invoke(p), batch.file_label
)
else:
response = _invoke_with_retry(
lambda p=prompt: self._llm.invoke(p), batch.file_label
).content
logger.debug("LLM response for %s", batch.file_label)
parsed = self.parse_response(response, batch)
except NotImplementedError:
raise # structural/subclass bug — never swallow
except Exception as exc: # noqa: BLE001 - isolate one batch's failure
logger.warning("Skipping batch %s after error: %s", batch.file_label, exc)
continue
results.append((batch, parsed))
return results

async def arun_batches(
self,
batches: list[Batch],
*,
max_concurrency: int = 10,
max_concurrency: int = DEFAULT_MAX_CONCURRENCY,
**kwargs: object,
) -> list[tuple[Batch, list]]:
"""Execute LLM calls for all *batches* concurrently.
Expand All @@ -378,7 +480,7 @@ async def arun_batches(
"""
sem = asyncio.Semaphore(max_concurrency)

async def _process(batch: Batch) -> tuple[Batch, list]:
async def _process(batch: Batch) -> tuple[Batch, list] | None:
async with sem:
prompt = self.build_prompt(batch, **kwargs)
logger.debug(
Expand All @@ -387,14 +489,26 @@ async def _process(batch: Batch) -> tuple[Batch, list]:
estimate_tokens(prompt),
len(batch.findings),
)
if self._structured_llm:
response = await self._structured_llm.ainvoke(prompt)
else:
response = (await self._llm.ainvoke(prompt)).content
logger.debug("LLM response for %s", batch.file_label)
return (batch, self.parse_response(response, batch))

return list(await asyncio.gather(*[_process(b) for b in batches]))
try:
if self._structured_llm:
response = await _ainvoke_with_retry(
lambda: self._structured_llm.ainvoke(prompt), batch.file_label
)
else:
invoked = await _ainvoke_with_retry(
lambda: self._llm.ainvoke(prompt), batch.file_label
)
response = invoked.content
logger.debug("LLM response for %s", batch.file_label)
return (batch, self.parse_response(response, batch))
except NotImplementedError:
raise # structural/subclass bug — never swallow
except Exception as exc: # noqa: BLE001 - isolate one batch's failure
logger.warning("Skipping batch %s after error: %s", batch.file_label, exc)
return None

gathered = await asyncio.gather(*[_process(b) for b in batches])
return [r for r in gathered if r is not None]

# -- Convenience --------------------------------------------------------

Expand Down
5 changes: 3 additions & 2 deletions src/skillspector/nodes/analyzers/behavioral_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from skillspector.state import AnalyzerNodeResponse, SkillspectorState

from .common import get_context_from_lines, get_source_segment, resolve_call_name
from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit

ANALYZER_ID = "behavioral_ast"
logger = get_logger(__name__)
Expand Down Expand Up @@ -216,8 +216,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
if not path.endswith(".py"):
continue
content = file_cache.get(path)
if content is None or len(content) > MAX_FILE_BYTES:
if content is None:
continue
raise_if_content_exceeds_limit(path, content)
raw = _analyze_python(content, path)
all_findings.extend(analyzer_finding_to_finding(af) for af in raw)

Expand Down
5 changes: 3 additions & 2 deletions src/skillspector/nodes/analyzers/behavioral_taint_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
resolve_call_name_typed,
resolve_dotted_name,
)
from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit

ANALYZER_ID = "behavioral_taint_tracking"
logger = get_logger(__name__)
Expand Down Expand Up @@ -400,8 +400,9 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
if not path.endswith(".py"):
continue
content = file_cache.get(path)
if content is None or len(content) > MAX_FILE_BYTES:
if content is None:
continue
raise_if_content_exceeds_limit(path, content)
raw = _analyze_python(content, path)
all_findings.extend(analyzer_finding_to_finding(af) for af in raw)

Expand Down
7 changes: 5 additions & 2 deletions src/skillspector/nodes/analyzers/mcp_tool_poisoning.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,8 +835,11 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
if isinstance(params, list):
findings.extend(_check_tp3(params))

# TP4: LLM-based check (only when use_llm is enabled)
if state.get("use_llm", False):
# TP4: LLM-based check (only when use_llm is enabled). Defaults to True to
# match every other LLM-using node (semantic_*, meta_analyzer); the CLI
# always sets this explicitly, so the default only affects programmatic
# callers that omit the key.
if state.get("use_llm", True):
findings.extend(_check_tp4(state))

logger.info("%s: %d findings", ANALYZER_ID, len(findings))
Expand Down
21 changes: 12 additions & 9 deletions src/skillspector/nodes/analyzers/static_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,17 @@
".rs": "rust",
}

MAX_FILE_BYTES = 1_000_000
MAX_FILE_BYTES = 50 * 1024 * 1024


def raise_if_content_exceeds_limit(path: str, content: str) -> None:
"""Fail closed if analyzer input exceeds the per-file analysis limit."""
size_bytes = len(content.encode("utf-8"))
if size_bytes > MAX_FILE_BYTES:
raise ValueError(
"Scan aborted: file size exceeds the per-file analysis limit "
f"({MAX_FILE_BYTES} bytes): {path} ({size_bytes} bytes)"
)


def _infer_file_type(path: str) -> str:
Expand Down Expand Up @@ -107,14 +117,7 @@ def run_static_patterns(
if content is None:
logger.debug("Skipping %s: no content in file_cache", path)
continue
if len(content) > MAX_FILE_BYTES:
logger.debug(
"Skipping %s: size %d exceeds MAX_FILE_BYTES (%d)",
path,
len(content),
MAX_FILE_BYTES,
)
continue
raise_if_content_exceeds_limit(path, content)
file_type = _infer_file_type(path)
for module in pattern_modules:
raw = module.analyze(content=content, file_path=path, file_type=file_type)
Expand Down
6 changes: 2 additions & 4 deletions src/skillspector/nodes/analyzers/static_yara.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

from .common import get_context, get_line_number
from .pattern_defaults import PatternCategory
from .static_runner import MAX_FILE_BYTES, analyzer_finding_to_finding
from .static_runner import analyzer_finding_to_finding, raise_if_content_exceeds_limit

ANALYZER_ID = "static_yara"
logger = get_logger(__name__)
Expand Down Expand Up @@ -243,9 +243,7 @@ def node(state: SkillspectorState) -> AnalyzerNodeResponse:
content = file_cache.get(path)
if content is None:
continue
if len(content) > MAX_FILE_BYTES:
logger.debug("%s: skipping %s (exceeds size limit)", ANALYZER_ID, path)
continue
raise_if_content_exceeds_limit(path, content)
for af in _match_file(rules, content, path):
findings.append(analyzer_finding_to_finding(af))

Expand Down
Loading