Skip to content

Commit b111f8a

Browse files
fix(security): Add VLLM_MAX_N_SEQUENCES environment variable and enforce limit (#37952)
Signed-off-by: jperezde <jperezde@redhat.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Russell Bryant <rbryant@redhat.com>
1 parent 497e234 commit b111f8a

File tree

5 files changed

+193
-0
lines changed

5 files changed

+193
-0
lines changed

docs/usage/security.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,18 @@ The most effective approach is to deploy vLLM behind a reverse proxy (such as ng
231231
- Blocks all other endpoints, including the unauthenticated inference and operational control endpoints
232232
- Implements additional authentication, rate limiting, and logging at the proxy layer
233233

234+
## Request Parameter Resource Limits
235+
236+
Certain API request parameters can have a large impact on resource consumption and may be abused to exhaust server resources. The `n` parameter in the `/v1/completions` and `/v1/chat/completions` endpoints controls how many independent output sequences are generated per request. A very large value causes the engine to allocate memory, CPU, and GPU time proportional to `n`, which can lead to out-of-memory conditions on the host and block the server from processing other requests.
237+
238+
To mitigate this, vLLM enforces a configurable upper bound on the `n` parameter via the `VLLM_MAX_N_SEQUENCES` environment variable (default: **16384**). Requests exceeding this limit are rejected before reaching the engine.
239+
240+
### Recommendations
241+
242+
- **Public-facing deployments:** Consider setting `VLLM_MAX_N_SEQUENCES` to a value appropriate for your workload (e.g., `64` or `128`) to limit the blast radius of a single request.
243+
- **Reverse proxy layer:** In addition to vLLM's built-in limit, consider enforcing request body validation and rate limiting at your reverse proxy to further constrain abusive payloads.
244+
- **Monitoring:** Monitor per-request resource consumption to detect anomalous patterns that may indicate abuse.
245+
234246
## Tool Server and MCP Security
235247

236248
vLLM supports connecting to external tool servers via the `--tool-server` argument. This enables models to call tools through the Responses API (`/v1/responses`). Tool server support works with all models — it is not limited to specific model architectures.

tests/entrypoints/openai/chat_completion/test_chat.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,3 +1020,114 @@ def test_chat_completion_request_n_parameter_various_values():
10201020
assert sampling_params.n == n_value, (
10211021
f"Expected n={n_value}, got n={sampling_params.n}"
10221022
)
1023+
1024+
1025+
def test_chat_completion_request_n_parameter_exceeds_default_limit(
    monkeypatch: pytest.MonkeyPatch,
):
    """Test that n values exceeding the default limit are rejected."""
    import vllm.envs as envs

    # Make sure the built-in default limit applies: drop any override and
    # reset the cached env lookup so the next read re-evaluates it.
    monkeypatch.delenv("VLLM_MAX_N_SEQUENCES", raising=False)
    cache_clear = getattr(envs.__getattr__, "cache_clear", None)
    if cache_clear is not None:
        cache_clear()

    limit = envs.VLLM_MAX_N_SEQUENCES
    over_limit_request = ChatCompletionRequest(
        model="test-model",
        messages=[{"role": "user", "content": "Test"}],
        n=limit + 1,
        max_tokens=10,
    )

    # The limit is enforced when sampling params are built, not at
    # request construction time.
    with pytest.raises(ValueError, match="n must be at most"):
        over_limit_request.to_sampling_params(
            max_tokens=10,
            default_sampling_params={},
        )
1048+
1049+
1050+
def test_chat_completion_request_n_parameter_at_limit(
    monkeypatch: pytest.MonkeyPatch,
):
    """Test that n at exactly the limit is accepted."""
    import vllm.envs as envs

    # Use the built-in default limit: remove any env override and reset
    # the cached env lookup.
    monkeypatch.delenv("VLLM_MAX_N_SEQUENCES", raising=False)
    cache_clear = getattr(envs.__getattr__, "cache_clear", None)
    if cache_clear is not None:
        cache_clear()

    limit = envs.VLLM_MAX_N_SEQUENCES
    boundary_request = ChatCompletionRequest(
        model="test-model",
        messages=[{"role": "user", "content": "Test"}],
        n=limit,
        max_tokens=10,
    )

    # n == limit is the inclusive boundary and must validate cleanly.
    params = boundary_request.to_sampling_params(
        max_tokens=10,
        default_sampling_params={},
    )
    assert params.n == limit
1074+
1075+
def test_chat_completion_request_n_parameter_custom_limit(
    monkeypatch: pytest.MonkeyPatch,
):
    """Test that VLLM_MAX_N_SEQUENCES env var overrides the default limit."""
    import vllm.envs as envs

    # Override the limit to 128 and reset the cached env lookup so the
    # override is observed.
    monkeypatch.setenv("VLLM_MAX_N_SEQUENCES", "128")
    cache_clear = getattr(envs.__getattr__, "cache_clear", None)
    if cache_clear is not None:
        cache_clear()

    def _make_request(n_value: int) -> ChatCompletionRequest:
        # Requests are identical except for the n parameter under test.
        return ChatCompletionRequest(
            model="test-model",
            messages=[{"role": "user", "content": "Test"}],
            n=n_value,
            max_tokens=10,
        )

    # Exactly at the custom limit: accepted.
    params = _make_request(128).to_sampling_params(
        max_tokens=10,
        default_sampling_params={},
    )
    assert params.n == 128

    # One past the custom limit: rejected, with the limit in the message.
    with pytest.raises(ValueError, match="n must be at most 128"):
        _make_request(129).to_sampling_params(
            max_tokens=10,
            default_sampling_params={},
        )
1110+
1111+
1112+
def test_chat_completion_request_n_parameter_massive_value(
    monkeypatch: pytest.MonkeyPatch,
):
    """Test that astronomically large n values are rejected (CVE fix)."""
    import vllm.envs as envs

    # Fall back to the built-in default limit: drop any override and
    # reset the cached env lookup.
    monkeypatch.delenv("VLLM_MAX_N_SEQUENCES", raising=False)
    cache_clear = getattr(envs.__getattr__, "cache_clear", None)
    if cache_clear is not None:
        cache_clear()

    # A fan-out this large would exhaust server resources if accepted.
    huge_request = ChatCompletionRequest(
        model="test-model",
        messages=[{"role": "user", "content": "Test"}],
        n=100_000_000,
        max_tokens=1,
    )

    with pytest.raises(ValueError, match="n must be at most"):
        huge_request.to_sampling_params(
            max_tokens=1,
            default_sampling_params={},
        )

tests/test_envs.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,3 +454,55 @@ def test_configure_logging_with_invalid_value_raises_error(self):
454454

455455
with pytest.raises(ValueError, match="invalid literal for int"):
456456
_ = envs.VLLM_CONFIGURE_LOGGING
457+
458+
459+
class TestVllmMaxNSequences:
    """Tests for the VLLM_MAX_N_SEQUENCES limit on the `n` sampling parameter.

    Covers the env var's default value, its override behavior, and the
    enforcement of the limit in SamplingParams validation.
    """

    def test_default_value(self):
        # Fixed defect: the docstring previously claimed the default is 64,
        # but the implementation (and this assertion) use 16384.
        """Test that VLLM_MAX_N_SEQUENCES defaults to 16384."""
        # patch.dict restores os.environ on exit, so popping the variable
        # inside the context is safe and temporary.
        with patch.dict(os.environ, {}, clear=False):
            os.environ.pop("VLLM_MAX_N_SEQUENCES", None)
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()

            assert envs.VLLM_MAX_N_SEQUENCES == 16384

    def test_custom_value(self, monkeypatch: pytest.MonkeyPatch):
        """Test that VLLM_MAX_N_SEQUENCES can be overridden."""
        monkeypatch.setenv("VLLM_MAX_N_SEQUENCES", "128")
        if hasattr(envs.__getattr__, "cache_clear"):
            envs.__getattr__.cache_clear()

        assert envs.VLLM_MAX_N_SEQUENCES == 128

    def test_sampling_params_respects_limit(
        self,
        monkeypatch: pytest.MonkeyPatch,
    ):
        """Test that SamplingParams rejects n above the limit."""
        from vllm.sampling_params import SamplingParams

        monkeypatch.delenv("VLLM_MAX_N_SEQUENCES", raising=False)
        if hasattr(envs.__getattr__, "cache_clear"):
            envs.__getattr__.cache_clear()

        max_n = envs.VLLM_MAX_N_SEQUENCES
        # Exactly at the limit: must not raise.
        SamplingParams(n=max_n)

        # One past the limit: rejected.
        with pytest.raises(ValueError, match="n must be at most"):
            SamplingParams(n=max_n + 1)

    def test_sampling_params_respects_custom_limit(
        self,
        monkeypatch: pytest.MonkeyPatch,
    ):
        """Test that SamplingParams uses the overridden env var limit."""
        from vllm.sampling_params import SamplingParams

        monkeypatch.setenv("VLLM_MAX_N_SEQUENCES", "128")
        if hasattr(envs.__getattr__, "cache_clear"):
            envs.__getattr__.cache_clear()

        # Exactly at the custom limit: accepted.
        SamplingParams(n=128)

        # One past the custom limit: rejected, with the limit in the message.
        with pytest.raises(ValueError, match="n must be at most 128"):
            SamplingParams(n=129)

vllm/envs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
8787
VLLM_RPC_TIMEOUT: int = 10000 # ms
8888
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
89+
VLLM_MAX_N_SEQUENCES: int = 16384
8990
VLLM_PLUGINS: list[str] | None = None
9091
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
9192
VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
@@ -870,6 +871,12 @@ def _get_or_set_default() -> str:
870871
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
871872
os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
872873
),
874+
# Maximum allowed value for the `n` sampling parameter (number of output
875+
# sequences per request). Limits resource consumption to prevent
876+
# denial-of-service via excessively large fan-out. Default: 16384.
877+
"VLLM_MAX_N_SEQUENCES": lambda: int(
878+
os.environ.get("VLLM_MAX_N_SEQUENCES", "16384")
879+
),
873880
# a list of plugin names to load, separated by commas.
874881
# if this is not set, it means all plugins will be loaded
875882
# if this is set to an empty string, no plugins will be loaded

vllm/sampling_params.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import msgspec
1313
from pydantic.dataclasses import dataclass
1414

15+
import vllm.envs as envs
1516
from vllm.config import ModelConfig, SpeculativeConfig, StructuredOutputsConfig
1617
from vllm.exceptions import VLLMValidationError
1718
from vllm.logger import init_logger
@@ -169,6 +170,9 @@ class SamplingParams(
169170
n: int = 1
170171
"""Number of outputs to return for the given prompt request.
171172
173+
The maximum allowed value is controlled by the ``VLLM_MAX_N_SEQUENCES``
174+
environment variable (default: 16384).
175+
172176
NOTE:
173177
`AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs
174178
are generated and streamed cumulatively per request. To see all `n`
@@ -425,6 +429,13 @@ def _verify_args(self) -> None:
425429
raise ValueError(f"n must be an int, but is of type {type(self.n)}")
426430
if self.n < 1:
427431
raise ValueError(f"n must be at least 1, got {self.n}.")
432+
max_n = envs.VLLM_MAX_N_SEQUENCES
433+
if self.n > max_n:
434+
raise ValueError(
435+
f"n must be at most {max_n}, got {self.n}. "
436+
"To increase this limit, set the VLLM_MAX_N_SEQUENCES "
437+
"environment variable."
438+
)
428439
if not -2.0 <= self.presence_penalty <= 2.0:
429440
raise ValueError(
430441
f"presence_penalty must be in [-2, 2], got {self.presence_penalty}."

0 commit comments

Comments
 (0)