Skip to content

Commit a651e0b

Browse files
ispobock authored and
Layssy committed
Update ci test and doc for MTP api change (sgl-project#5952)
1 parent 0c31c67 commit a651e0b

File tree

6 files changed

+66
-14
lines changed

6 files changed

+66
-14
lines changed

docs/references/deepseek.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,10 @@ The precompilation process typically takes around 10 minutes to complete.
153153
**Description**: SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](https://docs.sglang.ai/backend/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved by **1.8x** for batch size 1 and **1.5x** for batch size 32, respectively, on the H200 TP8 setting.
154154

155155
**Usage**:
156-
Add arguments `--speculative-algorithm`, `--speculative-draft-model-path`,
157-
`--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
156+
Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
158157
```
159-
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --speculative-algorithm EAGLE --speculative-draft-model-path lmsys/DeepSeek-V3-0324-NextN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --trust-remote-code --tp 8
158+
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --trust-remote-code --tp 8
160159
```
161-
- The draft model are available at huggingface: [lmsys/DeepSeek-V3-0324-NextN](https://huggingface.co/lmsys/DeepSeek-V3-0324-NextN), [lmsys/DeepSeek-R1-NextN](https://huggingface.co/lmsys/DeepSeek-R1-NextN). It can also be exported from original DeepSeek-V3/R1 model with [export_deepseek_nextn.py](https://github.com/sgl-project/sglang/blob/main/scripts/export_deepseek_nextn.py) script.
162160
- The best configuration for `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` can be searched with the [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py) script for a given batch size. The minimum configuration is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`, which can achieve speedup for larger batch sizes.
163161
When using FlashInfer MLA wrapper (`--attention-backend flashinfer`) with speculative decoding, set the `--speculative-eagle-topk` parameter to `1`. The FlashAttention 3 backend also only supports `--speculative-eagle-topk 1`.
164162
- To enable DeepSeek MTP for large batch sizes (>32), some parameters should be changed (see [this discussion](https://github.com/sgl-project/sglang/issues/4543#issuecomment-2737413756)):

python/sglang/srt/server_args.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -351,10 +351,13 @@ def __post_init__(self):
351351
model_arch = get_model_arch(self)
352352

353353
# Auto set draft_model_path for DeepSeek-V3/R1
354-
if self.speculative_draft_model_path is None and model_arch in [
355-
"DeepseekV3ForCausalLM"
356-
]:
357-
self.speculative_draft_model_path = self.model_path
354+
if model_arch == "DeepseekV3ForCausalLM":
355+
if self.speculative_draft_model_path is None:
356+
self.speculative_draft_model_path = self.model_path
357+
else:
358+
logger.warning(
359+
"DeepSeek MTP does not require setting speculative_draft_model_path."
360+
)
358361

359362
# Auto choose parameters
360363
if self.speculative_num_steps is None:

test/srt/test_full_deepseek_v3.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,6 @@ def setUpClass(cls):
8080
"--trust-remote-code",
8181
"--speculative-algorithm",
8282
"EAGLE",
83-
"--speculative-draft",
84-
"lmsys/DeepSeek-V3-0324-NextN",
8583
"--speculative-num-steps",
8684
"3",
8785
"--speculative-eagle-topk",

test/srt/test_mla_deepseek_v3.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,63 @@ def test_gsm8k(self):
5050

5151

5252
class TestDeepseekV3MTP(CustomTestCase):
53+
@classmethod
54+
def setUpClass(cls):
55+
cls.model = "lmsys/sglang-ci-dsv3-test"
56+
cls.base_url = DEFAULT_URL_FOR_TEST
57+
other_args = [
58+
"--trust-remote-code",
59+
"--cuda-graph-max-bs",
60+
"2",
61+
"--disable-radix",
62+
"--enable-torch-compile",
63+
"--torch-compile-max-bs",
64+
"1",
65+
"--speculative-algorithm",
66+
"EAGLE",
67+
"--speculative-num-steps",
68+
"2",
69+
"--speculative-eagle-topk",
70+
"4",
71+
"--speculative-num-draft-tokens",
72+
"4",
73+
]
74+
cls.process = popen_launch_server(
75+
cls.model,
76+
cls.base_url,
77+
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
78+
other_args=other_args,
79+
)
80+
81+
@classmethod
82+
def tearDownClass(cls):
83+
kill_process_tree(cls.process.pid)
84+
85+
def test_gsm8k(self):
86+
requests.get(self.base_url + "/flush_cache")
87+
88+
args = SimpleNamespace(
89+
num_shots=5,
90+
data_path=None,
91+
num_questions=200,
92+
max_new_tokens=512,
93+
parallel=128,
94+
host="http://127.0.0.1",
95+
port=int(self.base_url.split(":")[-1]),
96+
)
97+
metrics = run_eval_few_shot_gsm8k(args)
98+
print(metrics)
99+
100+
self.assertGreater(metrics["accuracy"], 0.60)
101+
102+
server_info = requests.get(self.base_url + "/get_server_info")
103+
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
104+
print(f"{avg_spec_accept_length=}")
105+
self.assertGreater(avg_spec_accept_length, 2.5)
106+
107+
108+
# compatible with old APIs
109+
class TestDeepseekV3MTPWithDraft(CustomTestCase):
53110
@classmethod
54111
def setUpClass(cls):
55112
cls.model = "lmsys/sglang-ci-dsv3-test"

test/srt/test_mla_flashinfer.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,6 @@ def setUpClass(cls):
118118
"1",
119119
"--speculative-algorithm",
120120
"EAGLE",
121-
"--speculative-draft",
122-
"lmsys/sglang-ci-dsv3-test-NextN",
123121
"--speculative-num-steps",
124122
"3",
125123
"--speculative-eagle-topk",

test/srt/test_mla_int8_deepseek_v3.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,6 @@ def setUpClass(cls):
162162
"1",
163163
"--speculative-algorithm",
164164
"EAGLE",
165-
"--speculative-draft",
166-
"sgl-project/sglang-ci-dsv3-block-int8-test-NextN",
167165
"--speculative-num-steps",
168166
"2",
169167
"--speculative-eagle-topk",

0 commit comments

Comments
 (0)