Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions python/hf-trust-remote-code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import datasets
import transformers
from datasets import load_dataset
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoTokenizer,
pipeline,
)

# Placeholder repository id reused by most of the calls in this file.
# NOTE(review): this file reads as a Semgrep test fixture that is scanned,
# not executed, so the id is not expected to resolve -- confirm.
CHECKPOINT = "org/model"

# --- Positive cases ---------------------------------------------------------
# Every call in this section passes `trust_remote_code=True`.
# `# ruleid: <rule-id>` is a Semgrep test annotation: it must remain on the
# line directly above the call it refers to, so add new comments above the
# annotation, never between the annotation and the call.

# `from_pretrained` on a causal-LM auto class.
# ruleid: hf-trust-remote-code
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT, trust_remote_code=True)

# `from_pretrained` on the tokenizer auto class.
# ruleid: hf-trust-remote-code
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, trust_remote_code=True)

# `from_pretrained` on the config auto class.
# ruleid: hf-trust-remote-code
config = AutoConfig.from_pretrained(CHECKPOINT, trust_remote_code=True)

# Another keyword (`revision`) precedes the flagged argument, and the
# repository id is an inline string instead of CHECKPOINT.
# ruleid: hf-trust-remote-code
base = AutoModel.from_pretrained("org/custom", revision="main", trust_remote_code=True)

# `pipeline` referenced by its bare name (imported via `from transformers import ...`).
# ruleid: hf-trust-remote-code
pipe = pipeline("text-generation", model=CHECKPOINT, trust_remote_code=True)

# Same call, spelled through the `transformers` module.
# ruleid: hf-trust-remote-code
pipe2 = transformers.pipeline("text-generation", model=CHECKPOINT, trust_remote_code=True)

# `load_dataset` referenced by its bare name (imported via `from datasets import ...`).
# ruleid: hf-trust-remote-code
ds = load_dataset("org/dataset", trust_remote_code=True)

# Module-qualified `load_dataset` with an additional keyword before the flag.
# ruleid: hf-trust-remote-code
ds2 = datasets.load_dataset("org/dataset", split="train", trust_remote_code=True)


def configurable_load(trust: bool = True):
    """Load the CHECKPOINT causal-LM with remote code enabled.

    The ``trust`` argument is accepted but never read: the loader call below
    always passes the literal ``trust_remote_code=True``. This fixture places
    a flagged call inside a function body instead of at module level.
    """
    # ruleid: hf-trust-remote-code
    loaded = AutoModelForCausalLM.from_pretrained(CHECKPOINT, trust_remote_code=True)
    return loaded


# --- Negative cases ---------------------------------------------------------
# Every call in this section either omits `trust_remote_code` or passes
# `trust_remote_code=False`.
# `# ok: <rule-id>` is a Semgrep test annotation asserting that the rule does
# not report the line directly below it; keep the annotation adjacent to its
# call.

# Keyword omitted entirely.
# ok: hf-trust-remote-code
safe_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)

# Keyword present but explicitly False.
# ok: hf-trust-remote-code
safe_model_explicit = AutoModelForCausalLM.from_pretrained(CHECKPOINT, trust_remote_code=False)

# Tokenizer, explicit False.
# ok: hf-trust-remote-code
safe_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, trust_remote_code=False)

# Config, keyword omitted.
# ok: hf-trust-remote-code
safe_config = AutoConfig.from_pretrained(CHECKPOINT)

# Bare-name `pipeline`, keyword omitted.
# ok: hf-trust-remote-code
safe_pipe = pipeline("text-generation", model=CHECKPOINT)

# Module-qualified `pipeline`, explicit False.
# ok: hf-trust-remote-code
safe_pipe_explicit = transformers.pipeline("text-generation", model=CHECKPOINT, trust_remote_code=False)

# Bare-name `load_dataset`, explicit False.
# ok: hf-trust-remote-code
safe_ds = load_dataset("org/dataset", trust_remote_code=False)

# Module-qualified `load_dataset`, keyword omitted.
# ok: hf-trust-remote-code
safe_ds_default = datasets.load_dataset("org/dataset", split="train")
37 changes: 37 additions & 0 deletions python/hf-trust-remote-code.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
rules:
  # Flags Hugging Face loader calls that opt in to running repository-supplied
  # Python by passing `trust_remote_code=True`.
  - id: hf-trust-remote-code
    message: >-
      Loading a Hugging Face model, tokenizer, config, pipeline, or dataset with
      `trust_remote_code=True` executes arbitrary Python (`modeling_*.py`,
      `configuration_*.py`, or dataset loading scripts) shipped in the remote
      repository. A malicious or compromised repository achieves remote code
      execution on the loading host. Prefer `trust_remote_code=False`, pin the
      load to a reviewed `revision`, or vendor and audit the custom code before
      loading.
    languages: [python]
    severity: ERROR
    metadata:
      category: security
      cwe: "CWE-94: Improper Control of Generation of Code ('Code Injection')"
      subcategory: [vuln]
      confidence: MEDIUM
      likelihood: MEDIUM
      impact: HIGH
      # `datasets` is listed because the rule also matches `load_dataset`.
      technology: [transformers, datasets, huggingface]
      description: "Potential arbitrary code execution from loading Hugging Face assets with `trust_remote_code=True`"
      references:
        - https://huggingface.co/docs/transformers/en/models#custom-models
        - https://blog.trailofbits.com/2025/09/24/supply-chain-attacks-are-exploiting-our-assumptions/

    # Every alternative requires `trust_remote_code=True`, so a call that passes
    # `False` or omits the keyword cannot match; separate `pattern-not`
    # exclusions for the `=False` form would be no-ops and are not needed.
    pattern-either:
      # Any `<receiver>.from_pretrained(...)` call (model, tokenizer, config, ...).
      - pattern: $OBJ.from_pretrained(..., trust_remote_code=True, ...)
      # `pipeline`, module-qualified and bare-name spellings.
      - pattern: transformers.pipeline(..., trust_remote_code=True, ...)
      - pattern: pipeline(..., trust_remote_code=True, ...)
      # `load_dataset`, module-qualified and bare-name spellings.
      - pattern: datasets.load_dataset(..., trust_remote_code=True, ...)
      - pattern: load_dataset(..., trust_remote_code=True, ...)