From e7efc3fb9dd41948158e869759248aeaa1067dbb Mon Sep 17 00:00:00 2001
From: Devam Shah
Date: Fri, 12 Jun 2026 12:48:05 +0530
Subject: [PATCH 1/2] python: add hf-trust-remote-code rule (HuggingFace trust_remote_code=True RCE)

Signed-off-by: Devam Shah
---
Review notes (this section is ignored by `git am`):
- Dropped the five `pattern-not: ...(..., trust_remote_code=False, ...)`
  clauses. Every positive pattern requires the literal keyword
  `trust_remote_code=True`, and Python rejects a repeated keyword argument,
  so those clauses could never exclude a match.
- With a single remaining child, `patterns:` is flattened to a top-level
  `pattern-either:`.
- The blob id on the `index` line below predates these edits and has not
  been regenerated.

 python/hf-trust-remote-code.yaml | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 python/hf-trust-remote-code.yaml

diff --git a/python/hf-trust-remote-code.yaml b/python/hf-trust-remote-code.yaml
new file mode 100644
index 0000000..deaa586
--- /dev/null
+++ b/python/hf-trust-remote-code.yaml
@@ -0,0 +1,33 @@
+rules:
+  - id: hf-trust-remote-code
+    message: >-
+      Loading a Hugging Face model, tokenizer, config, pipeline, or dataset with
+      `trust_remote_code=True` executes arbitrary Python (`modeling_*.py`,
+      `configuration_*.py`, or dataset loading scripts) shipped in the remote
+      repository. A malicious or compromised repository achieves remote code
+      execution on the loading host. Prefer `trust_remote_code=False`, pin the
+      load to a reviewed `revision`, or vendor and audit the custom code before
+      loading.
+    languages: [python]
+    severity: ERROR
+    metadata:
+      category: security
+      cwe: "CWE-94: Improper Control of Generation of Code ('Code Injection')"
+      subcategory: [vuln]
+      confidence: MEDIUM
+      likelihood: MEDIUM
+      impact: HIGH
+      technology: [transformers, huggingface]
+      description: "Potential arbitrary code execution from loading Hugging Face assets with `trust_remote_code=True`"
+      references:
+        - https://huggingface.co/docs/transformers/en/models#custom-models
+        - https://blog.trailofbits.com/2025/09/24/supply-chain-attacks-are-exploiting-our-assumptions/
+
+    # Only the literal `trust_remote_code=True` keyword is matched, so calls that
+    # omit the keyword or pass `False` never match and need no `pattern-not`.
+    pattern-either:
+      - pattern: $OBJ.from_pretrained(..., trust_remote_code=True, ...)
+      - pattern: transformers.pipeline(..., trust_remote_code=True, ...)
+      - pattern: pipeline(..., trust_remote_code=True, ...)
+      - pattern: datasets.load_dataset(..., trust_remote_code=True, ...)
+      - pattern: load_dataset(..., trust_remote_code=True, ...)

From fb7992f44f2b1fb706cbdc365a9e53a30a4eb55d Mon Sep 17 00:00:00 2001
From: Devam Shah
Date: Fri, 12 Jun 2026 12:48:06 +0530
Subject: [PATCH 2/2] python: add test fixture for hf-trust-remote-code rule

Signed-off-by: Devam Shah
---
 python/hf-trust-remote-code.py | 66 ++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 python/hf-trust-remote-code.py

diff --git a/python/hf-trust-remote-code.py b/python/hf-trust-remote-code.py
new file mode 100644
index 0000000..0d22d74
--- /dev/null
+++ b/python/hf-trust-remote-code.py
@@ -0,0 +1,66 @@
+import datasets
+import transformers
+from datasets import load_dataset
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    pipeline,
+)
+
+CHECKPOINT = "org/model"
+
+# ruleid: hf-trust-remote-code
+model = AutoModelForCausalLM.from_pretrained(CHECKPOINT, trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+config = AutoConfig.from_pretrained(CHECKPOINT, trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+base = AutoModel.from_pretrained("org/custom", revision="main", trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+pipe = pipeline("text-generation", model=CHECKPOINT, trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+pipe2 = transformers.pipeline("text-generation", model=CHECKPOINT, trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+ds = load_dataset("org/dataset", trust_remote_code=True)
+
+# ruleid: hf-trust-remote-code
+ds2 = datasets.load_dataset("org/dataset", split="train", trust_remote_code=True)
+
+
+def configurable_load(trust: bool = True):
+    # ruleid: hf-trust-remote-code
+    return AutoModelForCausalLM.from_pretrained(CHECKPOINT, trust_remote_code=True)
+
+
+# ok: hf-trust-remote-code
+safe_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)
+
+# ok: hf-trust-remote-code
+safe_model_explicit = AutoModelForCausalLM.from_pretrained(CHECKPOINT, trust_remote_code=False)
+
+# ok: hf-trust-remote-code
+safe_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, trust_remote_code=False)
+
+# ok: hf-trust-remote-code
+safe_config = AutoConfig.from_pretrained(CHECKPOINT)
+
+# ok: hf-trust-remote-code
+safe_pipe = pipeline("text-generation", model=CHECKPOINT)
+
+# ok: hf-trust-remote-code
+safe_pipe_explicit = transformers.pipeline("text-generation", model=CHECKPOINT, trust_remote_code=False)
+
+# ok: hf-trust-remote-code
+safe_ds = load_dataset("org/dataset", trust_remote_code=False)
+
+# ok: hf-trust-remote-code
+safe_ds_default = datasets.load_dataset("org/dataset", split="train")