Skip to content

Commit edd80f6

Browse files
Ubospicatarinkk
authored and committed
upgrade xgrammar to 0.1.19 (sgl-project#6129)
1 parent c58a9d4 commit edd80f6

File tree

2 files changed

+12
-20
lines changed

2 files changed

+12
-20
lines changed

python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ runtime_common = [
4242
"transformers==4.51.1",
4343
"uvicorn",
4444
"uvloop",
45-
"xgrammar==0.1.17",
45+
"xgrammar==0.1.19",
4646
"blobfile==3.0.0"
4747
]
4848

python/sglang/srt/constrained/xgrammar_backend.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from typing import List, Optional, Tuple, Union
1919

2020
import torch
21+
import xgrammar
2122
from xgrammar import (
2223
CompiledGrammar,
2324
GrammarCompiler,
@@ -58,17 +59,11 @@ def __init__(
5859
self.override_stop_tokens = override_stop_tokens
5960
self.finished = False
6061

61-
# Fix (from vLLM team): postpone the import of apply_token_bitmask_inplace_kernels to the
62-
# class init site to avoid re-initializing CUDA in forked subprocess.
63-
from xgrammar.kernels import apply_token_bitmask_inplace_kernels
64-
65-
self.use_token_bitmask_triton = get_bool_env_var(
66-
"SGLANG_TOKEN_BITMASK_TRITON", "false"
67-
)
68-
self.apply_vocab_mask_cuda = apply_token_bitmask_inplace_kernels.get(
69-
"cuda", None
62+
from xgrammar.kernels.apply_token_bitmask_inplace_cpu import (
63+
apply_token_bitmask_inplace_cpu,
7064
)
71-
self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_kernels.get("cpu", None)
65+
66+
self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_cpu
7267

7368
def accept_token(self, token: int):
7469
assert self.matcher.accept_token(token)
@@ -113,15 +108,12 @@ def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
113108
return vocab_mask.to(device, non_blocking=True)
114109

115110
def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
116-
if (
117-
not self.use_token_bitmask_triton
118-
and logits.device.type == "cuda"
119-
and self.apply_vocab_mask_cuda
120-
):
121-
return self.apply_vocab_mask_cuda(logits, vocab_mask)
122-
if logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
123-
return self.apply_vocab_mask_cpu(logits, vocab_mask)
124-
apply_token_bitmask_inplace_triton(logits, vocab_mask)
111+
if logits.device.type == "cuda":
112+
apply_token_bitmask_inplace_triton(logits, vocab_mask)
113+
elif logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
114+
self.apply_vocab_mask_cpu(logits, vocab_mask)
115+
else:
116+
raise RuntimeError(f"Unsupported device: {logits.device.type}")
125117

126118
def copy(self):
127119
matcher = GrammarMatcher(

0 commit comments

Comments
 (0)