Skip to content

Commit 2d0b675

Browse files
Huggingface revision pinning (#1281)
* Huggingface revision pinning In much the same way as unpinned container images benefit from digest pinning, fixing a model, dataset or file to a revision digest uniquely and immutably fixes use to a particular model snapshot (commit) * Add more example unsafe patterns * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix PEP8 * Reduce to 79 chars * Additional Changes to Huggingface Revision Checks - Add an entry for CWE 494 - Use string.hexdigits - Set to 1.8.6 release - Remove Copyright - Order after markupsafe * Sort CWE by Numbers * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 4cd1337 commit 2d0b675

File tree

6 files changed

+319
-0
lines changed

6 files changed

+319
-0
lines changed

bandit/core/issue.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class Cwe:
2626
INSUFFICIENT_RANDOM_VALUES = 330
2727
INSECURE_TEMP_FILE = 377
2828
UNCONTROLLED_RESOURCE_CONSUMPTION = 400
29+
DOWNLOAD_OF_CODE_WITHOUT_INTEGRITY_CHECK = 494
2930
DESERIALIZATION_OF_UNTRUSTED_DATA = 502
3031
MULTIPLE_BINDS = 605
3132
IMPROPER_CHECK_OF_EXCEPT_COND = 703
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
r"""
3+
================================================
4+
B615: Test for unsafe Hugging Face Hub downloads
5+
================================================
6+
7+
This plugin checks for unsafe downloads from Hugging Face Hub without proper
8+
integrity verification. Downloading models, datasets, or files without
9+
specifying a revision based on an immutable revision (commit) can
10+
lead to supply chain attacks where malicious actors could
11+
replace model files and use an existing tag or branch name
12+
to serve malicious content.
13+
14+
The secure approach is to:
15+
16+
1. Pin to specific revisions/commits when downloading models, files or datasets
17+
18+
Common unsafe patterns:
19+
- ``AutoModel.from_pretrained("org/model-name")``
20+
- ``AutoModel.from_pretrained("org/model-name", revision="main")``
21+
- ``AutoModel.from_pretrained("org/model-name", revision="v1.0.0")``
22+
- ``load_dataset("org/dataset-name")`` without revision
23+
- ``load_dataset("org/dataset-name", revision="main")``
24+
- ``load_dataset("org/dataset-name", revision="v1.0")``
25+
- ``AutoTokenizer.from_pretrained("org/model-name")``
26+
- ``AutoTokenizer.from_pretrained("org/model-name", revision="main")``
27+
- ``AutoTokenizer.from_pretrained("org/model-name", revision="v3.3.0")``
28+
- ``hf_hub_download(repo_id="org/model_name", filename="file_name")``
29+
- ``hf_hub_download(repo_id="org/model_name",
30+
filename="file_name",
31+
revision="main"
32+
)``
33+
- ``hf_hub_download(repo_id="org/model_name",
34+
filename="file_name",
35+
revision="v2.0.0"
36+
)``
37+
- ``snapshot_download(repo_id="org/model_name")``
38+
- ``snapshot_download(repo_id="org/model_name", revision="main")``
39+
- ``snapshot_download(repo_id="org/model_name", revision="refs/pr/1")``
40+
41+
42+
:Example:
43+
44+
.. code-block:: none
45+
46+
>> Issue: Unsafe Hugging Face Hub download without revision pinning
47+
Severity: Medium Confidence: High
48+
CWE: CWE-494 (https://cwe.mitre.org/data/definitions/494.html)
49+
Location: examples/huggingface_unsafe_download.py:8
50+
7 # Unsafe: no revision specified
51+
8 model = AutoModel.from_pretrained("org/model_name")
52+
9
53+
54+
.. seealso::
55+
56+
- https://cwe.mitre.org/data/definitions/494.html
57+
- https://huggingface.co/docs/huggingface_hub/en/guides/download
58+
59+
.. versionadded:: 1.8.6
60+
61+
"""
62+
import string
63+
64+
import bandit
65+
from bandit.core import issue
66+
from bandit.core import test_properties as test
67+
68+
69+
@test.checks("Call")
@test.test_id("B615")
def huggingface_unsafe_download(context):
    """Flag Hugging Face Hub downloads that are not pinned to a commit hash.

    The check only proceeds when the file imports ``transformers``,
    ``datasets`` or ``huggingface_hub`` and the call's qualified name ends
    in one of the watched download functions and contains the library that
    provides it. A call is accepted when its ``revision`` (or ``commit_id``)
    argument is a string of at least seven hexadecimal characters, or when
    its first positional argument is a string that looks like a local path.

    :param context: Bandit context for the call being inspected.
    :return: A ``bandit.Issue`` (medium severity, high confidence, CWE-494)
        for an unpinned download, otherwise ``None``.
    """
    # Bail out early unless a Hugging Face library is imported at all.
    hf_libraries = ("transformers", "datasets", "huggingface_hub")
    if not any(context.is_module_imported_like(lib) for lib in hf_libraries):
        return

    qualified = context.call_function_name_qual
    if not isinstance(qualified, str):
        return

    # Watched function name -> libraries, one of which must appear as a
    # component of the call's qualified name.
    # NOTE(review): "repository_id" does not look like a download function;
    # it is kept unchanged here - confirm whether it is intentional.
    watched_calls = {
        "from_pretrained": ["transformers"],
        "load_dataset": ["datasets"],
        "hf_hub_download": ["huggingface_hub"],
        "snapshot_download": ["huggingface_hub"],
        "repository_id": ["huggingface_hub"],
    }

    name_parts = qualified.split(".")
    func_name = name_parts[-1]
    owners = watched_calls.get(func_name)
    if owners is None:
        return
    if not any(owner in name_parts for owner in owners):
        return

    # "revision" wins when it is truthy; otherwise fall back to "commit_id".
    revision_arg = context.get_call_arg_value("revision")
    commit_arg = context.get_call_arg_value("commit_id")
    pinned = revision_arg or commit_arg

    if isinstance(pinned, str):
        # Drop any surrounding quote characters before inspecting the value.
        candidate = pinned.strip("\"'")
        # Seven or more hex digits is treated as a (possibly short) commit
        # hash; branch and tag names such as "main" or "v1.0.0" fail this.
        looks_like_sha = len(candidate) >= 7 and all(
            ch in string.hexdigits for ch in candidate
        )
        if looks_like_sha:
            return

    # A first positional argument that starts like a filesystem path is
    # taken to be a local load rather than a Hub download.
    target = context.get_call_arg_at_position(0)
    if isinstance(target, str) and target.startswith(("./", "/", "../")):
        return

    message = (
        f"Unsafe Hugging Face Hub download without revision pinning "
        f"in {func_name}()"
    )
    return bandit.Issue(
        severity=bandit.MEDIUM,
        confidence=bandit.HIGH,
        text=message,
        cwe=issue.Cwe.DOWNLOAD_OF_CODE_WITHOUT_INTEGRITY_CHECK,
        # NOTE(review): func_name is the called function's name, not a
        # keyword-argument name - confirm this lookup yields the intended
        # line number.
        lineno=context.get_lineno_for_call_arg(func_name),
    )
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---------------------------------
2+
B615: huggingface_unsafe_download
3+
---------------------------------
4+
5+
.. automodule:: bandit.plugins.huggingface_unsafe_download
6+
:no-index:
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
from datasets import load_dataset
2+
from huggingface_hub import hf_hub_download, snapshot_download
3+
from transformers import AutoModel, AutoTokenizer
4+
5+
# UNSAFE USAGE
6+
7+
# AutoModel (Model Loading)
8+
9+
# Example #1: No revision (defaults to floating 'main')
10+
unsafe_model_no_revision = AutoModel.from_pretrained("org/model_name")
11+
12+
# Example #2: Floating revision: 'main'
13+
unsafe_model_main = AutoModel.from_pretrained(
14+
"org/model_name",
15+
revision="main"
16+
)
17+
18+
# Example #3: Floating tag revision: 'v1.0.0'
19+
unsafe_model_tag = AutoModel.from_pretrained(
20+
"org/model_name",
21+
revision="v1.0.0"
22+
)
23+
24+
25+
# AutoTokenizer (Tokenizer Loading)
26+
27+
# Example #4: No revision
28+
unsafe_tokenizer_no_revision = AutoTokenizer.from_pretrained("org/model_name")
29+
30+
# Example #5: Floating revision: 'main'
31+
unsafe_tokenizer_main = AutoTokenizer.from_pretrained(
32+
"org/model_name",
33+
revision="main"
34+
)
35+
36+
# Example #6: Floating tag revision: 'v1.0.0'
37+
unsafe_tokenizer_tag = AutoTokenizer.from_pretrained(
38+
"org/model_name",
39+
revision="v1.0.0"
40+
)
41+
42+
43+
# Example #7: load_dataset (Dataset Loading)
44+
45+
# Example #8: No revision
46+
unsafe_dataset_no_revision = load_dataset("org_dataset")
47+
48+
# Example #9: Floating revision: 'main'
49+
unsafe_dataset_main = load_dataset("org_dataset", revision="main")
50+
51+
# Example #10: Floating tag revision: 'v1.0.0'
52+
unsafe_dataset_tag = load_dataset("org_dataset", revision="v1.0.0")
53+
54+
55+
# hf_hub_download (File Download)
56+
57+
# Example #11: No revision
58+
unsafe_file_no_revision = hf_hub_download(
59+
repo_id="org/model_name",
60+
filename="config.json"
61+
)
62+
63+
# Example #12: Floating revision: 'main'
64+
unsafe_file_main = hf_hub_download(
65+
repo_id="org/model_name",
66+
filename="config.json",
67+
revision="main"
68+
)
69+
70+
# Example #13: Floating tag revision: 'v1.0.0'
71+
unsafe_file_tag = hf_hub_download(
72+
repo_id="org/model_name",
73+
filename="config.json",
74+
revision="v1.0.0"
75+
)
76+
77+
78+
# snapshot_download (Repo Snapshot)
79+
80+
# Example #14: No revision
81+
unsafe_snapshot_no_revision = snapshot_download(repo_id="org/model_name")
82+
83+
# Example #15: Floating revision: 'main'
84+
unsafe_snapshot_main = snapshot_download(
85+
repo_id="org/model_name",
86+
revision="main"
87+
)
88+
89+
# Example #16: Floating tag revision: 'v1.0.0'
90+
unsafe_snapshot_tag = snapshot_download(
91+
repo_id="org/model_name",
92+
revision="v1.0.0"
93+
)
94+
95+
96+
# -------------------------------
97+
# SAFE USAGE
98+
# -------------------------------
99+
100+
# AutoModel
101+
102+
# Example #17: Pinned commit hash
103+
safe_model_commit = AutoModel.from_pretrained(
104+
"org/model_name",
105+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
106+
)
107+
108+
# Example #18: Local path
109+
safe_model_local = AutoModel.from_pretrained("./local_model")
110+
safe_model_local_abs = AutoModel.from_pretrained("/path/to/model")
111+
112+
# AutoTokenizer
113+
114+
# Example #19: Pinned commit hash
115+
safe_tokenizer_commit = AutoTokenizer.from_pretrained(
116+
"org/model_name",
117+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
118+
)
119+
120+
# Example #20: Local path
121+
safe_tokenizer_local = AutoTokenizer.from_pretrained("./local_tokenizer")
122+
123+
124+
# load_dataset
125+
126+
# Example #21: Pinned commit hash
127+
safe_dataset_commit = load_dataset(
128+
"org_dataset",
129+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
130+
)
131+
132+
133+
# hf_hub_download
134+
135+
# Example #22: Pinned commit hash
136+
safe_file_commit = hf_hub_download(
137+
repo_id="org/model_name",
138+
filename="config.json",
139+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
140+
)
141+
142+
143+
# snapshot_download
144+
145+
# Example #23: Pinned commit hash
146+
safe_snapshot_commit = snapshot_download(
147+
repo_id="org/model_name",
148+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
149+
)

setup.cfg

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ bandit.plugins =
163163
# bandit/plugins/markupsafe_markup_xss.py
164164
markupsafe_markup_xss = bandit.plugins.markupsafe_markup_xss:markupsafe_markup_xss
165165

166+
# bandit/plugins/huggingface_unsafe_download.py
167+
huggingface_unsafe_download = bandit.plugins.huggingface_unsafe_download:huggingface_unsafe_download
168+
166169
[build_sphinx]
167170
all_files = 1
168171
build-dir = doc/build

tests/functional/test_functional.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -926,3 +926,10 @@ def test_markupsafe_markup_xss_allowed_calls(self):
926926
self.check_example(
927927
"markupsafe_markup_xss_allowed_calls.py", expect
928928
)
929+
930+
def test_huggingface_unsafe_download(self):
    """Check the issue totals reported for the Hugging Face example file."""
    # Every finding is expected at MEDIUM severity and HIGH confidence.
    unsafe_calls = 15
    zeroed = {"UNDEFINED": 0, "LOW": 0, "MEDIUM": 0, "HIGH": 0}
    expect = {
        "SEVERITY": {**zeroed, "MEDIUM": unsafe_calls},
        "CONFIDENCE": {**zeroed, "HIGH": unsafe_calls},
    }
    self.check_example("huggingface_unsafe_download.py", expect)

0 commit comments

Comments
 (0)