
Commit bbb5f05

HandH1998, laixinn, sleepcoo, and zhyncs authored and committed
Support Llama4 fp8 inference (sgl-project#5194)
Co-authored-by: laixinn <[email protected]>
Co-authored-by: sleepcoo <[email protected]>
Co-authored-by: zhyncs <[email protected]>
1 parent 72e66fd commit bbb5f05

File tree

14 files changed (+537, -106 lines)


python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

Lines changed: 33 additions & 18 deletions
@@ -342,6 +342,7 @@ def fused_moe_kernel(
     use_fp8_w8a8: tl.constexpr,
     use_int8_w8a8: tl.constexpr,
     use_int8_w8a16: tl.constexpr,
+    per_channel_quant: tl.constexpr,
     even_Ks: tl.constexpr,
 ):
     """
@@ -416,20 +417,7 @@ def fused_moe_kernel(
         )
         b_scale = tl.load(b_scale_ptrs)

-    if use_fp8_w8a8:
-        # block-wise
-        if group_k > 0 and group_n > 0:
-            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
-            offs_bsn = offs_bn // group_n
-            b_scale_ptrs = (
-                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
-            )
-        # tensor-wise
-        else:
-            a_scale = tl.load(a_scale_ptr)
-            b_scale = tl.load(b_scale_ptr + off_experts)
-
-    if use_int8_w8a8:
+    if use_fp8_w8a8 or use_int8_w8a8:
         # block-wise
         if group_k > 0 and group_n > 0:
             a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
@@ -438,15 +426,18 @@ def fused_moe_kernel(
                 b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
             )
         # channel-wise
-        else:
-            # Load per-column scale for weights
+        elif per_channel_quant:
             b_scale_ptrs = (
                 b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
             )
             b_scale = tl.load(b_scale_ptrs)
             # Load per-token scale for activations
             a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
             a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, None]
+        # tensor-wise
+        else:
+            a_scale = tl.load(a_scale_ptr)
+            b_scale = tl.load(b_scale_ptr + off_experts)

     # -----------------------------------------------------------
     # Iterate to compute a block of the C matrix.
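Outside Triton, the branches above amount to choosing how dequantization scales are broadcast over the output block. A minimal PyTorch sketch of the per-channel/per-token case versus the tensor-wise case, with a hypothetical function name and shapes (this is not the kernel, only the per-expert arithmetic it implements; the block-wise group_k/group_n tiling is omitted):

import torch

def moe_dequant_matmul_sketch(a_q, b_q, a_scale, b_scale, per_channel_quant):
    """Illustrative only. a_q: [T, K] quantized activations routed to one expert,
    b_q: [N, K] that expert's quantized weights, both cast to float here."""
    a, b = a_q.float(), b_q.float()
    if per_channel_quant:
        # per-token activation scales [T, 1] and per-channel weight scales [N, 1]
        return (a * a_scale) @ (b * b_scale).t()
    else:
        # tensor-wise: scalar activation scale, one scalar weight scale per expert
        return (a @ b.t()) * (a_scale * b_scale)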
@@ -753,6 +744,7 @@ def invoke_fused_moe_kernel(
     use_int8_w8a8: bool,
     use_int8_w8a16: bool,
     use_int4_w4a16: bool,
+    per_channel_quant: bool,
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
 ) -> None:
@@ -777,10 +769,15 @@ def invoke_fused_moe_kernel(
         if block_shape is None:
             # activation tensor-wise fp8 quantization, dynamic or static
             padded_size = padding_size
+            # activations apply per-token quantization when weights apply per-channel quantization by default
             if _is_cuda:
-                A, A_scale = sgl_scaled_fp8_quant(A, A_scale)
+                A, A_scale = sgl_scaled_fp8_quant(
+                    A, A_scale, use_per_token_if_dynamic=per_channel_quant
+                )
             else:
-                A, A_scale = vllm_ops.scaled_fp8_quant(A, A_scale)
+                A, A_scale = vllm_ops.scaled_fp8_quant(
+                    A, A_scale, use_per_token_if_dynamic=per_channel_quant
+                )
         else:
             # activation block-wise fp8 quantization
             assert len(block_shape) == 2
@@ -796,6 +793,9 @@ def invoke_fused_moe_kernel(
         assert B_scale is not None
         if block_shape is None:
             # activation channel-wise int8 quantization
+            assert (
+                per_channel_quant
+            ), "int8 quantization only supports channel-wise quantization except for block-wise quantization"
             A, A_scale = per_token_quant_int8(A)
         else:
             # activation block-wise int8 quantization
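The `use_per_token_if_dynamic=per_channel_quant` flag asks the quantization op to compute one scale per token instead of a single tensor-wide scale when no static scale is supplied. A rough sketch of the difference in plain PyTorch (not the sgl/vllm kernels themselves; `FP8_MAX` and the function names are assumptions made for illustration):

import torch

FP8_MAX = 448.0  # max of float8_e4m3fn; assumed here for illustration

def per_token_fp8_quant_sketch(a: torch.Tensor):
    """Dynamic per-token fp8 quantization: one scale per row of `a` ([T, K])."""
    amax = a.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax / FP8_MAX  # [T, 1] dequantization scale
    a_q = (a / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return a_q, scale

def per_tensor_fp8_quant_sketch(a: torch.Tensor):
    """Dynamic tensor-wise fp8 quantization: a single scalar scale."""
    scale = a.abs().amax().clamp(min=1e-12) / FP8_MAX
    a_q = (a / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return a_q, scale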
@@ -904,6 +904,7 @@ def invoke_fused_moe_kernel(
             use_fp8_w8a8=use_fp8_w8a8,
             use_int8_w8a8=use_int8_w8a8,
             use_int8_w8a16=use_int8_w8a16,
+            per_channel_quant=per_channel_quant,
             even_Ks=even_Ks,
             **config,
         )
@@ -1086,6 +1087,7 @@ def inplace_fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1107,6 +1109,7 @@ def inplace_fused_experts(
         use_int8_w8a8,
         use_int8_w8a16,
         use_int4_w4a16,
+        per_channel_quant,
         w1_scale,
         w2_scale,
         w1_zp,
@@ -1129,6 +1132,7 @@ def inplace_fused_experts_fake(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1160,6 +1164,7 @@ def outplace_fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1182,6 +1187,7 @@ def outplace_fused_experts(
         use_int8_w8a8,
         use_int8_w8a16,
         use_int4_w4a16,
+        per_channel_quant,
         w1_scale,
         w2_scale,
         w1_zp,
@@ -1205,6 +1211,7 @@ def outplace_fused_experts_fake(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1238,6 +1245,7 @@ def fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1261,6 +1269,7 @@ def fused_experts(
         use_int8_w8a8,
         use_int8_w8a16,
         use_int4_w4a16,
+        per_channel_quant,
         w1_scale,
         w2_scale,
         w1_zp,
@@ -1283,6 +1292,7 @@ def fused_experts(
         use_int8_w8a8,
         use_int8_w8a16,
         use_int4_w4a16,
+        per_channel_quant,
         w1_scale,
         w2_scale,
         w1_zp,
@@ -1307,6 +1317,7 @@ def fused_experts_impl(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1443,6 +1454,7 @@ def fused_experts_impl(
             use_int8_w8a8=use_int8_w8a8,
             use_int8_w8a16=use_int8_w8a16,
             use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
             block_shape=block_shape,
         )
         if activation == "silu":
@@ -1486,6 +1498,7 @@ def fused_experts_impl(
             use_int8_w8a8=use_int8_w8a8,
             use_int8_w8a16=use_int8_w8a16,
             use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
            block_shape=block_shape,
         )

@@ -1532,6 +1545,7 @@ def fused_moe(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     w1_zp: Optional[torch.Tensor] = None,
@@ -1608,6 +1622,7 @@ def fused_moe(
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
+        per_channel_quant=per_channel_quant,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
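The remaining hunks in this file are mechanical: `per_channel_quant: bool = False` is appended to every signature in the chain fused_moe -> fused_experts -> fused_experts_impl -> invoke_fused_moe_kernel and forwarded unchanged, so callers that never pass the flag keep the previous tensor-wise behaviour. The pattern in miniature (hypothetical function names, same idea):

def _kernel(per_channel_quant: bool) -> str:
    return "per-channel/per-token scales" if per_channel_quant else "tensor-wise scales"

def _invoke(per_channel_quant: bool = False) -> str:
    # wrapper forwards the flag verbatim, mirroring invoke_fused_moe_kernel -> fused_moe_kernel
    return _kernel(per_channel_quant)

def public_entry(per_channel_quant: bool = False) -> str:
    # mirrors the fused_moe -> fused_experts -> fused_experts_impl threading
    return _invoke(per_channel_quant=per_channel_quant)

assert public_entry() == "tensor-wise scales"  # existing callers are unaffected
assert public_entry(per_channel_quant=True) == "per-channel/per-token scales"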

python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 4 additions & 0 deletions
@@ -77,6 +77,7 @@ def __init__(
         sparsity_ignore_list: List[str],
         kv_cache_scheme: Optional[Dict[str, Any]] = None,
         config: Optional[Dict[str, Any]] = None,
+        packed_modules_mapping: Dict[str, List[str]] = {},
     ):
         super().__init__()
         self.ignore = ignore
@@ -87,6 +88,7 @@ def __init__(
         self.sparsity_scheme_map = sparsity_scheme_map
         self.sparsity_ignore_list = sparsity_ignore_list
         self.config = config
+        self.packed_modules_mapping = packed_modules_mapping

     def get_linear_method(self) -> "CompressedTensorsLinearMethod":
         return CompressedTensorsLinearMethod(self)
@@ -136,6 +138,7 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
         sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
             config=config
         )
+        packed_modules_mapping = config.get("packed_modules_mapping", {})

         return cls(
             target_scheme_map=target_scheme_map,
@@ -144,6 +147,7 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
             sparsity_scheme_map=sparsity_scheme_map,
             sparsity_ignore_list=sparsity_ignore_list,
             config=config,
+            packed_modules_mapping=packed_modules_mapping,
         )

     @classmethod
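The new `packed_modules_mapping` entry is read straight from the checkpoint's quantization config when present, and defaults to an empty dict otherwise. A hedged sketch of what such a config fragment might look like and how `from_config` picks it up (the mapping values below are illustrative conventions, not taken from a real Llama4 checkpoint):

# Hypothetical quantization_config fragment from a compressed-tensors checkpoint.
quant_config_dict = {
    "config_groups": {},  # usual compressed-tensors scheme definitions, omitted here
    "ignore": ["lm_head"],
    "packed_modules_mapping": {
        # fused module -> original sub-modules whose weights are packed into it
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    },
}

packed_modules_mapping = quant_config_dict.get("packed_modules_mapping", {})
# CompressedTensorsConfig.from_config forwards this dict into the constructor,
# where it is stored as self.packed_modules_mapping.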

python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 66 additions & 45 deletions
@@ -103,16 +103,6 @@ def __init__(
             "input_activations"
         )

-        if not (
-            self.weight_quant.strategy == QuantizationStrategy.TENSOR
-            and self.input_quant.strategy == QuantizationStrategy.TENSOR
-        ):
-            raise ValueError(
-                "For FP8 Fused MoE layers, only per-tensor scales "
-                "for weights and activations are supported. Found "
-                f"{self.weight_quant}, {self.input_quant}"
-            )
-
         self.static_input_scales = not self.input_quant.dynamic

     def create_weights(
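Dropping this guard is what lets channel-quantized compressed-tensors MoE configs (per-channel fp8 weight scales with dynamic per-token activation quantization, presumably the scheme the Llama4 fp8 checkpoints use) reach the rest of the method instead of raising. A hedged summary of the strategy combinations involved, assuming the `QuantizationStrategy` enum from the compressed-tensors library:

from compressed_tensors.quantization import QuantizationStrategy

# Combinations this method now handles (illustrative, not an exhaustive matrix):
old_path = (QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR)   # pre-existing per-tensor path
new_path = (QuantizationStrategy.CHANNEL, QuantizationStrategy.TOKEN)   # new per-channel weights, per-token activations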
@@ -154,27 +144,50 @@ def create_weights(
         set_weight_attrs(w2_weight, extra_weight_attrs)

         # WEIGHT_SCALES
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        # per-tensor quantization
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.TENSOR.value
+        elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    1,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+                requires_grad=False,
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+        else:
+            raise ValueError(
+                f"Unsupported weight quantization strategy: {self.weight_quant.strategy}"
+            )

-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-        )
+        extra_weight_attrs.update({"quant_method": weight_quant_method})
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

         # INPUT_SCALES
         if self.static_input_scales:
+            assert (
+                self.input_quant.strategy == QuantizationStrategy.TENSOR
+            ), "Only per-tensor quantization is supported for static input scales"
             w13_input_scale = torch.nn.Parameter(
                 torch.ones(num_experts, dtype=torch.float32), requires_grad=False
             )
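The only difference between the two branches is the shape of the scale parameters. With hypothetical sizes (4 experts, intermediate_size_per_partition=256, hidden_size=128) the allocated buffers would be:

import torch

num_experts, intermediate_size_per_partition, hidden_size = 4, 256, 128

# TENSOR strategy: one scale per expert (w13 temporarily holds two, for w1 and w3).
w13_scale_tensor = torch.ones(num_experts, 2, dtype=torch.float32)  # [4, 2]
w2_scale_tensor = torch.ones(num_experts, dtype=torch.float32)      # [4]

# CHANNEL strategy: one scale per output channel of each expert weight.
w13_scale_channel = torch.ones(
    num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
)                                                                    # [4, 512, 1]
w2_scale_channel = torch.ones(
    num_experts, hidden_size, 1, dtype=torch.float32
)                                                                    # [4, 128, 1]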
@@ -241,31 +254,37 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer.w2_input_scale = torch.nn.Parameter(
                 w2_input_scale, requires_grad=False
             )
-
-        # Fp8 moe kernel needs single weight scale for w13 per expert.
-        # We take the max then dequant and requant each expert.
-        assert layer.w13_weight_scale is not None
-        shard_size = layer.intermediate_size_per_partition
-        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-        for expert_id in range(layer.local_num_experts):
-            start = 0
-            for shard_id in range(2):
-                dq_weight = per_tensor_dequantize(
-                    layer.w13_weight[expert_id][start : start + shard_size, :],
-                    layer.w13_weight_scale[expert_id][shard_id],
-                )
-
-                if _is_cuda:
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-                    )
-                else:
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        vllm_ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_weight_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+            for expert_id in range(layer.local_num_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
                     )
-                start += shard_size

-        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+                    if _is_cuda:
+                        (
+                            layer.w13_weight[expert_id][start : start + shard_size, :],
+                            _,
+                        ) = sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    else:
+                        (
+                            layer.w13_weight[expert_id][start : start + shard_size, :],
+                            _,
+                        ) = vllm_ops.scaled_fp8_quant(
+                            dq_weight, max_w13_scales[expert_id]
+                        )
+                    start += shard_size
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )

     def apply(
         self,
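The per-tensor branch keeps the pre-existing requantization trick: w1 and w3 are loaded with separate scales, but the fp8 MoE kernel wants one scale per expert, so each half is dequantized with its own scale and requantized against the per-expert max. A standalone sketch of that arithmetic in plain PyTorch (not the sgl/vllm fp8 ops, and ignoring fp8 rounding and clamping):

import torch

def requant_w13_sketch(w13_q: torch.Tensor, w13_scales: torch.Tensor):
    """w13_q: [2 * shard, K] fp8-as-float weights for one expert,
    w13_scales: [2] separate scales for the w1 and w3 shards.
    Returns the weights requantized to a single per-expert scale (the max)."""
    shard = w13_q.shape[0] // 2
    max_scale = w13_scales.max()
    out = w13_q.clone()
    for shard_id in range(2):
        block = w13_q[shard_id * shard : (shard_id + 1) * shard]
        dq = block.float() * w13_scales[shard_id]                        # dequantize with the old scale
        out[shard_id * shard : (shard_id + 1) * shard] = dq / max_scale  # requantize with the shared scale
    return out, max_scale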
@@ -311,6 +330,8 @@ def apply(
             inplace=inplace,
             activation=activation,
             use_fp8_w8a8=True,
+            per_channel_quant=self.weight_quant.strategy
+            == QuantizationStrategy.CHANNEL,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             a1_scale=layer.w13_input_scale,

0 commit comments
