
Commit 93b5fa7

[Model] Adding Qwen3 and Qwen3MoE (sgl-project#4693)

yhyang201 authored and DiweiSun committed
1 parent abbd652 · commit 93b5fa7

5 files changed: +780, -14 lines

python/sglang/srt/layers/attention/flashinfer_backend.py

Lines changed: 5 additions & 2 deletions

@@ -100,8 +100,11 @@ def __init__(
         self.num_wrappers = 1
         self.dispatch_reason = None
 
-        # Qwen2 models require higher flashinfer workspace size
-        if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
+        # Qwen2/Qwen3 models require higher flashinfer workspace size
+        if (
+            "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures
+            or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
+        ):
             global_config.flashinfer_workspace_size = 512 * 1024 * 1024
 
         # Allocate buffers
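
The guard above keys off the architectures list of the model's Hugging Face config. As a rough illustration (not part of this commit; the checkpoint name is only an example), that list can be inspected directly:

    # Illustration only: where the architectures list checked above comes from.
    # "Qwen/Qwen3-8B" is an example checkpoint name, not something this commit references.
    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("Qwen/Qwen3-8B")
    print(cfg.architectures)
    # Expected to include "Qwen3ForCausalLM", which now also triggers the larger
    # flashinfer workspace (512 * 1024 * 1024 bytes), just like "Qwen2ForCausalLM".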

python/sglang/srt/models/qwen2.py

Lines changed: 4 additions & 1 deletion

@@ -239,6 +239,7 @@ def __init__(
         config: Qwen2Config,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer,
     ) -> None:
         super().__init__()
         self.config = config
@@ -250,9 +251,11 @@ def __init__(
             quant_config=quant_config,
             prefix=add_prefix("embed_tokens", prefix),
         )
+        # Use the provided decoder layer type or default to Qwen2DecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
         self.layers = make_layers(
             config.num_hidden_layers,
-            lambda idx, prefix: Qwen2DecoderLayer(
+            lambda idx, prefix: decoder_layer_type(
                 layer_id=idx,
                 config=config,
                 quant_config=quant_config,
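
The decoder_layer_type parameter added above is the hook that lets a Qwen3 dense model reuse Qwen2Model wholesale. The commit's new qwen3.py is not part of this excerpt, so the following is only a rough sketch of how the hook could be consumed; the Qwen3* class names and the placeholder layer are assumptions, not the commit's actual code.

    # Rough sketch, not the commit's qwen3.py: reuse Qwen2Model and swap in a
    # custom decoder layer through the new decoder_layer_type parameter.
    from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model


    class Qwen3DecoderLayer(Qwen2DecoderLayer):
        # Placeholder: the real Qwen3 layer lives in the commit's qwen3.py,
        # which this diff excerpt does not show.
        pass


    class Qwen3Model(Qwen2Model):
        def __init__(self, config, quant_config=None, prefix=""):
            super().__init__(
                config,
                quant_config=quant_config,
                prefix=prefix,
                decoder_layer_type=Qwen3DecoderLayer,
            )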

python/sglang/srt/models/qwen2_moe.py

Lines changed: 13 additions & 11 deletions

@@ -47,7 +47,7 @@
 from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix
+from sglang.srt.utils import add_prefix, make_layers
 
 expert_distribution_recorder = ExpertDistributionRecorder()
 
@@ -334,6 +334,7 @@ def __init__(
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2MoeDecoderLayer,
     ) -> None:
         super().__init__()
         self.padding_idx = config.pad_token_id
@@ -344,16 +345,17 @@ def __init__(
             config.hidden_size,
             prefix=add_prefix("embed_tokens", prefix),
         )
-        self.layers = nn.ModuleList(
-            [
-                Qwen2MoeDecoderLayer(
-                    config,
-                    layer_id,
-                    quant_config=quant_config,
-                    prefix=add_prefix(f"layers.{layer_id}", prefix),
-                )
-                for layer_id in range(config.num_hidden_layers)
-            ]
+        # Use the provided decoder layer type or default to Qwen2MoeDecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2MoeDecoderLayer
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
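
Mirroring the dense path, the same decoder_layer_type hook on Qwen2MoeModel is what the new Qwen3MoE model can build on. The commit's qwen3_moe.py is likewise not shown in this excerpt; below is a minimal sketch of the intended reuse pattern, with the Qwen3Moe* names and placeholder layer being assumptions rather than the commit's actual code.

    # Minimal sketch, not the commit's qwen3_moe.py: reuse Qwen2MoeModel with a
    # custom decoder layer passed through the new decoder_layer_type parameter.
    from sglang.srt.models.qwen2_moe import Qwen2MoeDecoderLayer, Qwen2MoeModel


    class Qwen3MoeDecoderLayer(Qwen2MoeDecoderLayer):
        # Placeholder: the real Qwen3MoE layer is defined in the commit's
        # qwen3_moe.py, which this diff excerpt does not include.
        pass


    class Qwen3MoeModel(Qwen2MoeModel):
        def __init__(self, config, quant_config=None, prefix=""):
            super().__init__(
                config,
                quant_config=quant_config,
                prefix=prefix,
                decoder_layer_type=Qwen3MoeDecoderLayer,
            )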
