Commit eba4258

JustinTong0323 and tarinkk authored and committed

[CI] Add performance CI for VLM (sgl-project#6038)

Signed-off-by: Xinyuan Tong <[email protected]>

1 parent 9096e48 commit eba4258

File tree: 5 files changed, +219 −4 lines changed


.github/workflows/pr-test.yml

Lines changed: 12 additions & 0 deletions
@@ -162,6 +162,18 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
 
+      - name: Benchmark VLM offline throughput
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput
+
+      - name: Benchmark VLM online latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
+
   performance-test-2-gpu:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
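The two new steps mirror the existing FP8 benchmark step above. As a rough local-reproduction sketch (assuming an sglang checkout with test dependencies installed and a GPU comparable to the CI runner; the test IDs come from the diff):

    # Sketch: run the two new CI steps locally; each step cd's into test/srt first.
    import subprocess

    for test in (
        "test_bench_serving.TestBenchServing.test_vlm_offline_throughput",
        "test_bench_serving.TestBenchServing.test_vlm_online_latency",
    ):
        subprocess.run(["python3", "-m", "unittest", test], cwd="test/srt", check=True)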

python/sglang/bench_serving.py

Lines changed: 149 additions & 1 deletion
@@ -58,6 +58,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
+    image_data: str
     extra_request_body: Dict[str, Any]
@@ -347,6 +348,11 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
+
+    # Add image data if available
+    if request_func_input.image_data:
+        payload["image_data"] = request_func_input.image_data
+
     headers = get_auth_headers()
 
     output = RequestFuncOutput()
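The guard keeps text-only benchmarks unchanged: `image_data` is attached only when the sampled prompt carried an image. An illustrative request body after this change (values are made up for illustration; the base64 string is truncated):

    # Hypothetical payload shape; only 'image_data' is new in this commit.
    payload = {
        "text": "Question: What is shown in the figure?\n\nAnswer: ",
        "sampling_params": {"max_new_tokens": 256},
        "logprob_start_len": -1,
    }
    image_data = "data:image/jpeg;base64,/9j/4AAQ..."  # truncated, illustrative
    if image_data:
        payload["image_data"] = image_data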
@@ -510,6 +516,13 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             args=args,
         )
+    elif args.dataset_name == "mmmu":
+        input_requests = sample_mmmu_requests(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.random_output_len,
+            random_sample=True,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -597,6 +610,121 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename
 
 
+def sample_mmmu_requests(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    random_sample: bool = True,
+) -> List[Tuple[str, int, int]]:
+    """
+    Sample requests from the MMMU dataset using HuggingFace datasets.
+
+    Args:
+        num_requests: Number of requests to sample.
+        tokenizer: Tokenizer to use for token counting.
+        fixed_output_len: If provided, use this fixed output length for all requests.
+        random_sample: Whether to randomly sample or take the first N.
+
+    Returns:
+        List of tuples (prompt, prompt_token_len, output_token_len).
+    """
+    try:
+        import base64
+        import io
+
+        from datasets import load_dataset
+    except ImportError:
+        raise ImportError("Please install datasets: pip install datasets")
+
+    print("Loading MMMU dataset from HuggingFace...")
+
+    try:
+        print("Attempting to load MMMU Math dataset...")
+        mmmu_dataset = load_dataset("MMMU/MMMU", "Math", split="test")
+        print(
+            f"Successfully loaded MMMU Math dataset from HuggingFace with {len(mmmu_dataset)} examples"
+        )
+    except Exception as e:
+        print(f"Failed to load MMMU Math dataset: {e}")
+        raise ValueError(f"Failed to load MMMU dataset: {e}")
+
+    # Sample from the dataset
+    if len(mmmu_dataset) > num_requests:
+        if random_sample:
+            # Random sample
+            indices = random.sample(range(len(mmmu_dataset)), num_requests)
+            sample_dataset = mmmu_dataset.select(indices)
+        else:
+            # Take first N
+            sample_dataset = mmmu_dataset.select(
+                range(min(num_requests, len(mmmu_dataset)))
+            )
+    else:
+        print(f"Dataset has less than {num_requests} examples, using all examples")
+        sample_dataset = mmmu_dataset
+
+    print(f"Selected {len(sample_dataset)} examples for benchmarking")
+
+    # Create prompts
+    filtered_dataset = []
+
+    for i, example in enumerate(sample_dataset):
+        try:
+            # Extract image_1
+            image = example.get("image_1")
+
+            if image is not None:
+                if hasattr(image, "save"):
+                    # Convert RGBA images to RGB before encoding
+                    if image.mode == "RGBA":
+                        image = image.convert("RGB")
+
+                    # Encode image to base64
+                    buffered = io.BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    image_path = f"data:image/jpeg;base64,{img_str}"
+                else:
+                    continue
+
+                # Extract the question
+                question = example.get("question")
+
+                # Create the prompt with image, question
+                prompt = f"Question: {question}\n\nAnswer: "
+                prompt = tokenizer.apply_chat_template(
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "image_url", "image_url": {"url": image_path}},
+                                {"type": "text", "text": prompt},
+                            ],
+                        }
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                prompt = f"<image>{image_path}</image>{prompt}"
+
+                # Calculate token lengths
+                # Note: This is approximate since we're not rendering the actual image tokens
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = (
+                    len(prompt_token_ids) + 512
+                )  # Add estimate for image tokens
+
+                output_len = fixed_output_len if fixed_output_len is not None else 256
+
+                filtered_dataset.append((prompt, prompt_len, output_len))
+
+        except Exception as e:
+            print(f"Error processing example {i}: {e}")
+
+    print(f"\nCreated {len(filtered_dataset)} MMMU prompts")
+    return filtered_dataset
+
+
 def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
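A usage sketch for the new sampler (assumes `datasets` and `transformers` are installed and the MMMU/MMMU dataset is reachable on the HuggingFace Hub; the tokenizer name is an example, not mandated by the diff):

    from transformers import AutoTokenizer

    from sglang.bench_serving import sample_mmmu_requests

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
    reqs = sample_mmmu_requests(num_requests=4, tokenizer=tokenizer, fixed_output_len=128)
    for prompt, prompt_len, output_len in reqs:
        # prompt_len includes the flat +512 estimate for image tokens.
        print(prompt_len, output_len)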
@@ -1004,6 +1132,15 @@ async def limited_request_func(request_func_input, pbar):
     else:
         lora_name = None
 
+    if "<image>" in test_prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
+        image_data = image_match.group(1) if image_match else None
+        test_prompt = image_match.group(2) if image_match else test_prompt
+    else:
+        image_data = None
+
     # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
@@ -1012,6 +1149,7 @@ async def limited_request_func(request_func_input, pbar):
         prompt_len=test_prompt_len,
         output_len=min(test_output_len, 32),
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
@@ -1063,13 +1201,23 @@ async def limited_request_func(request_func_input, pbar):
         else:
             lora_name = None
 
+        if "<image>" in prompt:
+            import re
+
+            image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
+            image_data = image_match.group(1) if image_match else None
+            prompt = image_match.group(2) if image_match else prompt
+        else:
+            image_data = None
+
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
             lora_name=lora_name,
+            image_data=image_data,
             extra_request_body=extra_request_body,
         )
         tasks.append(
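Both call sites rely on the `<image>...</image>` convention introduced by `sample_mmmu_requests`, so the round trip can be checked in isolation. A minimal sketch (note that `.` does not match newlines without `re.DOTALL`, so group 2 captures only up to the first line break of a multi-line prompt):

    # Minimal check of the <image>...</image> extraction used above.
    import re

    tagged = "<image>data:image/jpeg;base64,AAAA</image>Describe the image."
    m = re.search(r"<image>(.*?)</image>(.*)", tagged)
    assert m is not None
    image_data, prompt = m.group(1), m.group(2)
    assert image_data.startswith("data:image/jpeg;base64,")
    assert prompt == "Describe the image."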
@@ -1444,7 +1592,7 @@ def __call__(self, parser, namespace, values, option_string=None):
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
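With `mmmu` registered as a choice, a benchmark run against an already-launched VLM server might look like the following (a sketch; the host, port, and prompt count are assumptions, not values from the diff):

    # Sketch: drive bench_serving with the new mmmu dataset.
    import subprocess

    subprocess.run(
        [
            "python3", "-m", "sglang.bench_serving",
            "--backend", "sglang",
            "--dataset-name", "mmmu",
            "--num-prompts", "200",
            "--host", "127.0.0.1",
            "--port", "30000",
        ],
        check=True,
    )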

python/sglang/test/test_utils.py

Lines changed: 2 additions & 1 deletion
@@ -79,7 +79,8 @@
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
 
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"

test/srt/test_bench_serving.py

Lines changed: 54 additions & 0 deletions
@@ -7,6 +7,8 @@
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST_FP8,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+    DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST,
     CustomTestCase,
     is_in_ci,
     run_bench_serving,
@@ -148,6 +150,58 @@ def test_online_latency_default(self):
             self.assertLess(res["median_ttft_ms"], 86)
             self.assertLess(res["median_itl_ms"], 10)
 
+    def test_vlm_offline_throughput(self):
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--chat-template",
+                DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST,
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            dataset_name="mmmu",
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_vlm_offline_throughput\n"
+                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+            )
+            if os.getenv("SGLANG_AMD_CI") == "1":
+                self.assertGreater(res["output_throughput"], 2000)
+                # TODO: not set yet, need AMD machine
+            else:
+                self.assertGreater(res["output_throughput"], 2500)
+
+    def test_vlm_online_latency(self):
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=50,
+            request_rate=1,
+            other_server_args=[
+                "--chat-template",
+                DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST,
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            dataset_name="mmmu",
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_vlm_online_latency\n"
+                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 16000)
+            if os.getenv("SGLANG_AMD_CI") == "1":
+                self.assertLess(res["median_ttft_ms"], 150)
+                # TODO: not set yet, need AMD machine
+            else:
+                self.assertLess(res["median_ttft_ms"], 90)
+                self.assertLess(res["median_itl_ms"], 8)
+
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,

test/srt/test_skip_tokenizer_init.py

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@
 from sglang.test.test_utils import (
     DEFAULT_IMAGE_URL,
     DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
-    DEFAULT_SMALL_VLM_MODEL_NAME,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
@@ -195,7 +195,7 @@ def setUpClass(cls):
         cls.image_url = DEFAULT_IMAGE_URL
         response = requests.get(cls.image_url)
         cls.image = Image.open(BytesIO(response.content))
-        cls.model = DEFAULT_SMALL_VLM_MODEL_NAME
+        cls.model = DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model, use_fast=False)
         cls.processor = AutoProcessor.from_pretrained(cls.model, trust_remote_code=True)
         cls.base_url = DEFAULT_URL_FOR_TEST
