File tree Expand file tree Collapse file tree 1 file changed +4
-1
lines changed
python/sglang/srt/model_executor Expand file tree Collapse file tree 1 file changed +4
-1
lines changed Original file line number Diff line number Diff line change @@ -139,7 +139,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
139
139
gpu_mem = get_whatever_gpu_memory_capacity () / 1024
140
140
141
141
if gpu_mem > 120 :
142
- capture_bs += list (range (160 , 320 , 8 ))
142
+ capture_bs += list (range (160 , 256 , 8 ))
143
143
144
144
if max (capture_bs ) > model_runner .req_to_token_pool .size :
145
145
# In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
@@ -196,6 +196,9 @@ def __init__(self, model_runner: ModelRunner):
196
196
197
197
# Batch sizes to capture
198
198
self .capture_bs , self .compile_bs = get_batch_sizes_to_capture (model_runner )
199
+
200
+ print (f"\x1b [32mx)bs={ self .capture_bs } \x1b [0m" )
201
+
199
202
self .capture_forward_mode = ForwardMode .DECODE
200
203
self .capture_hidden_mode = CaptureHiddenMode .NULL
201
204
self .num_tokens_per_bs = 1
You can’t perform that action at this time.
0 commit comments