Skip to content

Commit daed453

Browse files
authored
[CI] Improve github summary & enable fa3 for more models (#5796)
1 parent ded04b2 commit daed453

File tree

6 files changed

+26 additions, -20 deletions

.github/workflows/pr-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ jobs:
 timeout-minutes: 10
 run: |
 cd test/srt
-python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
+python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
 
 - name: Benchmark online latency
 timeout-minutes: 10

python/sglang/srt/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1970,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
 "Llama4ForConditionalGeneration",
 "LlamaForCausalLM",
 "MistralForCausalLM",
+"MixtralForCausalLM",
 "Gemma2ForCausalLM",
 "Gemma3ForConditionalGeneration",
 }

test/srt/run_suite.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ class TestFile:
 TestFile("test_retract_decode.py", 54),
 TestFile("test_server_args.py", 1),
 TestFile("test_skip_tokenizer_init.py", 117),
-TestFile("test_srt_engine.py", 237),
+TestFile("test_srt_engine.py", 261),
 TestFile("test_srt_endpoint.py", 130),
 TestFile("test_torch_compile.py", 76),
 TestFile("test_torch_compile_moe.py", 172),

test/srt/test_bench_one_batch.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,22 @@
44
DEFAULT_MODEL_NAME_FOR_TEST,
55
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
66
CustomTestCase,
7-
get_bool_env_var,
87
is_in_ci,
98
run_bench_one_batch,
109
write_github_step_summary,
1110
)
1211

1312

1413
class TestBenchOneBatch(CustomTestCase):
15-
def test_bs1(self):
14+
def test_bs1_default(self):
1615
output_throughput = run_bench_one_batch(
1716
DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
1817
)
1918

2019
if is_in_ci():
2120
write_github_step_summary(
22-
f"### test_bs1\n"
23-
f"output_throughput : {output_throughput:.2f} token/s\n"
21+
f"### test_bs1_default (llama-3.1-8b)\n"
22+
f"output_throughput: {output_throughput:.2f} token/s\n"
2423
)
2524
self.assertGreater(output_throughput, 135)
2625

@@ -32,9 +31,9 @@ def test_moe_tp2_bs1(self):
3231
if is_in_ci():
3332
write_github_step_summary(
3433
f"### test_moe_tp2_bs1\n"
35-
f"output_throughput : {output_throughput:.2f} token/s\n"
34+
f"output_throughput: {output_throughput:.2f} token/s\n"
3635
)
37-
self.assertGreater(output_throughput, 124)
36+
self.assertGreater(output_throughput, 125)
3837

3938
def test_torch_compile_tp2_bs1(self):
4039
output_throughput = run_bench_one_batch(
@@ -45,9 +44,9 @@ def test_torch_compile_tp2_bs1(self):
4544
if is_in_ci():
4645
write_github_step_summary(
4746
f"### test_torch_compile_tp2_bs1\n"
48-
f"output_throughput : {output_throughput:.2f} token/s\n"
47+
f"output_throughput: {output_throughput:.2f} token/s\n"
4948
)
50-
self.assertGreater(output_throughput, 225)
49+
self.assertGreater(output_throughput, 220)
5150

5251

5352
if __name__ == "__main__":

test/srt/test_bench_serving.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def test_offline_throughput_with_triton_attention_backend(self):
9898
f"### test_offline_throughput_with_triton_attention_backend\n"
9999
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
100100
)
101-
self.assertGreater(res["output_throughput"], 3600)
101+
self.assertGreater(res["output_throughput"], 3700)
102102

103103
def test_offline_throughput_default_fp8(self):
104104
res = run_bench_serving(
@@ -113,7 +113,7 @@ def test_offline_throughput_default_fp8(self):
113113
f"### test_offline_throughput_default_fp8\n"
114114
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
115115
)
116-
self.assertGreater(res["output_throughput"], 4200)
116+
self.assertGreater(res["output_throughput"], 4300)
117117

118118
def test_online_latency_default(self):
119119
res = run_bench_serving(
@@ -126,7 +126,7 @@ def test_online_latency_default(self):
126126
if is_in_ci():
127127
write_github_step_summary(
128128
f"### test_online_latency_default\n"
129-
f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
129+
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
130130
)
131131
self.assertLess(res["median_e2e_latency_ms"], 11000)
132132
self.assertLess(res["median_ttft_ms"], 86)
@@ -161,8 +161,8 @@ def test_online_latency_eagle(self):
161161
if is_in_ci():
162162
write_github_step_summary(
163163
f"### test_online_latency_eagle\n"
164-
f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
165-
f'accept_length : {res["accept_length"]:.2f} \n'
164+
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
165+
f'accept_length: {res["accept_length"]:.2f} \n'
166166
)
167167
self.assertLess(res["median_e2e_latency_ms"], 900)
168168
self.assertGreater(res["accept_length"], 3.0)

test/srt/test_full_deepseek_v3.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from types import SimpleNamespace
33

44
import requests
5-
import torch
65

76
from sglang.srt.utils import kill_process_tree
87
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -49,7 +48,7 @@ def test_gsm8k(self):
4948
metrics = run_eval_few_shot_gsm8k(args)
5049
print(f"{metrics=}")
5150

52-
self.assertGreater(metrics["accuracy"], 0.94)
51+
self.assertGreater(metrics["accuracy"], 0.935)
5352

5453

5554
class TestBenchOneBatch(CustomTestCase):
@@ -58,11 +57,11 @@ def test_bs1(self):
5857
FULL_DEEPSEEK_V3_MODEL_PATH,
5958
["--trust-remote-code", "--tp", "8", "--cuda-graph-max-bs", "2"],
6059
)
61-
print(f"output_throughput : {output_throughput:.2f} token/s")
60+
print(f"{output_throughput=:.2f} token/s")
61+
6262
if is_in_ci():
6363
write_github_step_summary(
64-
f"### test_bs1\n"
65-
f"output_throughput : {output_throughput:.2f} token/s\n"
64+
f"### test_bs1 (deepseek-v3)\n" f"{output_throughput=:.2f} token/s\n"
6665
)
6766
self.assertGreater(output_throughput, 70)
6867

@@ -121,6 +120,13 @@ def test_gsm8k(self):
121120
print(f"{avg_spec_accept_length=}")
122121
self.assertGreater(avg_spec_accept_length, 3.2)
123122

123+
if is_in_ci():
124+
write_github_step_summary(
125+
f"### test_gsm8k (deepseek-v3)\n"
126+
f'{metrics["accuracy"]=:.3f}\n'
127+
f"{avg_spec_accept_length=:.2f}\n"
128+
)
129+
124130

125131
if __name__ == "__main__":
126132
unittest.main()

0 commit comments

Comments
 (0)