sgl-project · yuleil · Apr 19, 2025 · Apr 23, 2025 · Apr 23, 2025
@@ -188,6 +188,11 @@ def __init__(self, model_runner: ModelRunner):
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
+        self.enable_deepep_moe = model_runner.server_args.enable_deepep_moe
+        self.moe_dense_fully_dp = (
+            model_runner.server_args.enable_deepep_moe
+            and model_runner.server_args.moe_dense_tp_size == 1
+        )
 
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
@@ -301,7 +306,9 @@ def model_capture_mode(self):
 
     def can_run(self, forward_batch: ForwardBatch):
         if self.enable_dp_attention or self.enable_sp_layernorm:
-            total_global_tokens = sum(forward_batch.global_num_tokens_cpu)
+            reducer = max if self.moe_dense_fully_dp else sum
+            # DeepEP MoE layers uses a fixed shape with masking instead of gather tokens from DP ranks.
+            total_global_tokens = reducer(forward_batch.global_num_tokens_cpu)
 
             is_bs_supported = forward_batch.can_run_dp_cuda_graph and (
                 total_global_tokens in self.graphs
@@ -484,8 +491,10 @@ def replay_prepare(self, forward_batch: ForwardBatch):
 
         # Pad
         if self.enable_dp_attention or self.enable_sp_layernorm:
+            reducer = max if self.moe_dense_fully_dp else sum
+            # DeepEP MoE layers uses a fixed shape with masking instead of gather tokens from DP ranks.
             index = bisect.bisect_left(
-                self.capture_bs, sum(forward_batch.global_num_tokens_cpu)
+                self.capture_bs, reducer(forward_batch.global_num_tokens_cpu)
             )
         else:
             index = bisect.bisect_left(self.capture_bs, raw_bs)