
Commit fdf58ea

use local attn dp size
1 parent 494ec47 commit fdf58ea

File tree

.gitignore
python/sglang/srt/layers/dp_attention.py
python/sglang/srt/layers/logits_processor.py
python/sglang/srt/managers/data_parallel_controller.py
python/sglang/srt/models/deepseek_v2.py
python/sglang/srt/models/llama4.py

6 files changed: +51 -28 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -175,6 +175,7 @@ benchmark/llava_bench/images
 benchmark/llava_bench/mme_pack
 *.jsonl
 tmp*.txt
+core.*
 
 # Plots
 *.png

python/sglang/srt/layers/dp_attention.py

Lines changed: 36 additions & 8 deletions

@@ -26,21 +26,33 @@
 _ATTN_TP_SIZE = None
 _ATTN_DP_RANK = None
 _ATTN_DP_SIZE = None
+_LOCAL_ATTN_DP_SIZE = None
+_LOCAL_ATTN_DP_RANK = None
+
+def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+
+    attn_tp_size = tp_size // dp_size
+    attn_dp_rank = tp_rank // attn_tp_size
+    attn_tp_rank = tp_rank % attn_tp_size
+
+    return attn_tp_rank, attn_tp_size, attn_dp_rank
 
 
-def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size):
+def compute_dp_attention_local_info(enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size):
     if not enable_dp_attention:
         return tp_rank, tp_size, 0
 
     local_tp_size = moe_dense_tp_size if moe_dense_tp_size else tp_size
     local_tp_rank = tp_rank % local_tp_size
     local_dp_size = dp_size // (tp_size // local_tp_size)
 
-    attn_tp_size = local_tp_size // local_dp_size
-    attn_dp_rank = local_tp_rank // attn_tp_size
-    attn_tp_rank = local_tp_rank % attn_tp_size
+    local_attn_tp_size = local_tp_size // local_dp_size
+    local_attn_dp_rank = local_tp_rank // local_attn_tp_size
+    local_attn_tp_rank = local_tp_rank % local_attn_tp_size
 
-    return attn_tp_rank, attn_tp_size, attn_dp_rank
+    return local_attn_tp_rank, local_attn_tp_size, local_attn_dp_rank
 
 
 def initialize_dp_attention(

@@ -51,20 +63,26 @@ def initialize_dp_attention(
     moe_dense_tp_size: Optional[int],
 ):
     global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK, _ATTN_DP_SIZE
+    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK
 
     from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
 
     _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK = compute_dp_attention_world_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size
+    )
+    _, _, _LOCAL_ATTN_DP_RANK = compute_dp_attention_local_info(
         enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
     )
 
     if enable_dp_attention:
+        _ATTN_DP_SIZE = dp_size
         if moe_dense_tp_size is None:
-            _ATTN_DP_SIZE = dp_size
+            _LOCAL_ATTN_DP_SIZE = _ATTN_DP_SIZE
        else:
-            _ATTN_DP_SIZE = dp_size // (tp_size // moe_dense_tp_size)
+            _LOCAL_ATTN_DP_SIZE = dp_size // (tp_size // moe_dense_tp_size)
     else:
         _ATTN_DP_SIZE = 1
+        _LOCAL_ATTN_DP_SIZE = 1
 
     logger.info(f"{(_ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK, _ATTN_DP_SIZE)=}")
 

@@ -110,6 +128,16 @@ def get_attention_dp_size():
     return _ATTN_DP_SIZE
 
 
+def get_local_attention_dp_rank():
+    assert _LOCAL_ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_RANK
+
+
+def get_local_attention_dp_size():
+    assert _LOCAL_ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_SIZE
+
+
 @contextmanager
 def disable_dp_size():
     """Patch the tp group temporarily until this function ends.

@@ -132,7 +160,7 @@ def disable_dp_size():
 
 
 def get_dp_local_info(forward_batch: ForwardBatch):
-    dp_rank = get_attention_dp_rank()
+    dp_rank = get_local_attention_dp_rank()
 
     if forward_batch.dp_local_start_pos is None:
         cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
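
To see how the world-level and local helpers differ, here is a minimal sketch that calls them directly. The sizes (tp_size=8, dp_size=4, moe_dense_tp_size=2) are made up for illustration; the only assumption is that sglang with this commit is importable.

# Compare world-level vs. local DP-attention info for a hypothetical layout.
from sglang.srt.layers.dp_attention import (
    compute_dp_attention_local_info,
    compute_dp_attention_world_info,
)

tp_size, dp_size, moe_dense_tp_size = 8, 4, 2  # hypothetical configuration

for tp_rank in range(tp_size):
    world = compute_dp_attention_world_info(True, tp_rank, tp_size, dp_size)
    local = compute_dp_attention_local_info(
        True, tp_rank, tp_size, dp_size, moe_dense_tp_size
    )
    # world -> (attn_tp_rank, attn_tp_size=2, attn_dp_rank in 0..3)
    # local -> (local_attn_tp_rank, local_attn_tp_size=2, local_attn_dp_rank=0),
    # because local_dp_size = dp_size // (tp_size // moe_dense_tp_size) = 1 here.
    print(tp_rank, world, local)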

python/sglang/srt/layers/logits_processor.py

Lines changed: 5 additions & 5 deletions

@@ -30,8 +30,8 @@
 from sglang.srt.layers.dp_attention import (
     dp_gather_replicate,
     dp_scatter,
-    get_attention_dp_rank,
-    get_attention_dp_size,
+    get_local_attention_dp_rank,
+    get_local_attention_dp_size,
 )
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.managers.schedule_batch import global_server_args_dict

@@ -169,7 +169,7 @@ def compute_dp_attention_metadata(self, hidden_states: torch.Tensor):
             return
 
         cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
-        dp_rank = get_attention_dp_rank()
+        dp_rank = get_local_attention_dp_rank()
         if dp_rank == 0:
             dp_local_start_pos = torch.zeros_like(
                 self.global_num_tokens_for_logprob_gpu[0]

@@ -202,7 +202,7 @@ def __init__(
             not skip_all_gather and get_tensor_model_parallel_world_size() > 1
         )
         self.do_tensor_parallel_all_gather_dp_attn = (
-            self.do_tensor_parallel_all_gather and get_attention_dp_size() != 1
+            self.do_tensor_parallel_all_gather and get_local_attention_dp_size() != 1
         )
         self.final_logit_softcapping = getattr(
             self.config, "final_logit_softcapping", None

@@ -315,7 +315,7 @@ def forward(
 
         if self.debug_tensor_dump_output_folder:
             assert (
-                not self.do_tensor_parallel_all_gather or get_attention_dp_size() == 1
+                not self.do_tensor_parallel_all_gather or get_local_attention_dp_size() == 1
             ), "dp attention + sharded lm_head doesn't support full logits"
             full_logits = self._get_logits(hidden_states, lm_head, logits_metadata)
             dump_to_file(self.debug_tensor_dump_output_folder, "logits", full_logits)
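
A small arithmetic sketch of why the gate changed. The sizes are made up (tp_size=8, dp_size=8, moe_dense_tp_size=1, i.e. dense layers fully data parallel), and the formulas simply mirror initialize_dp_attention above.

# Hypothetical launch: tp_size=8, dp_size=8, moe_dense_tp_size=1.
tp_size, dp_size, moe_dense_tp_size = 8, 8, 1

attn_dp_size = dp_size                                          # _ATTN_DP_SIZE == 8
local_attn_dp_size = dp_size // (tp_size // moe_dense_tp_size)  # _LOCAL_ATTN_DP_SIZE == 1

do_tensor_parallel_all_gather = True  # assume TP world size > 1 and no skip_all_gather

old_gate = do_tensor_parallel_all_gather and attn_dp_size != 1        # True
new_gate = do_tensor_parallel_all_gather and local_attn_dp_size != 1  # False
# With the new gate, do_tensor_parallel_all_gather_dp_attn is False for this
# layout, so the DP-attention gather path in the logits processor is skipped;
# previously the global dp_size (8) kept it enabled.
print(old_gate, new_gate)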

python/sglang/srt/managers/data_parallel_controller.py

Lines changed: 0 additions & 1 deletion

@@ -192,7 +192,6 @@ def launch_tensor_parallel_group(
                     tp_rank,
                     server_args.tp_size,
                     server_args.dp_size,
-                    server_args.moe_dense_tp_size,
                 )
                 # compute zmq ports for this dp rank
                 rank_port_args = PortArgs.init_new(server_args, dp_rank)

python/sglang/srt/models/deepseek_v2.py

Lines changed: 5 additions & 9 deletions

@@ -38,7 +38,7 @@
 from sglang.srt.layers.dp_attention import (
     dp_gather_partial,
     dp_scatter,
-    get_attention_dp_size,
+    get_local_attention_dp_size,
     get_attention_tp_rank,
     get_attention_tp_size,
     tp_all_gather,

@@ -420,7 +420,6 @@ def __init__(
         self.v_head_dim = v_head_dim
         self.q_lora_rank = q_lora_rank
         self.kv_lora_rank = kv_lora_rank
-        self.dp_size = get_attention_dp_size()
         attn_tp_rank = get_attention_tp_rank()
         attn_tp_size = get_attention_tp_size()
 

@@ -1034,7 +1033,7 @@ def __init__(
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.enable_dp_attention = global_server_args_dict["enable_dp_attention"]
         self.layer_id = layer_id
-        self.dp_size = get_attention_dp_size()
+        self.local_dp_size = get_local_attention_dp_size()
         self.attn_tp_size = get_attention_tp_size()
         self.attn_tp_rank = get_attention_tp_rank()
         self.self_attn = DeepseekV2AttentionMLA(

@@ -1166,7 +1165,7 @@ def forward_ffn_with_full_input(
         # Gather
         if get_tensor_model_parallel_world_size() > 1:
             # all gather and all reduce
-            if self.dp_size != 1:
+            if self.local_dp_size != 1:
                 if self.attn_tp_rank == 0:
                     hidden_states += residual
                 hidden_states, local_hidden_states = (

@@ -1197,7 +1196,7 @@ def forward_ffn_with_full_input(
 
         # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
         # Scatter
-        if self.dp_size != 1:
+        if self.local_dp_size != 1:
             # important: forward batch.gathered_buffer is used both after scatter and after gather.
             # be careful about this!
             hidden_states, global_hidden_states = (

@@ -1341,8 +1340,6 @@ def __init__(
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-        self.dp_size = get_attention_dp_size()
-
     def forward(
         self,
         input_ids: torch.Tensor,

@@ -1411,10 +1408,9 @@ def __init__(
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
-            enable_tp=not _enable_moe_dense_fully_dp(),
+            enable_tp=not _enable_moe_dense_fully_dp(),  # TODO: replace it with DP attention
         )
         self.logits_processor = LogitsProcessor(config)
-        self.dp_size = get_attention_dp_size()
 
     def get_input_embeddings(self) -> nn.Embedding:
         return self.model.embed_tokens
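
For the decoder layer, the practical difference between the old self.dp_size and the new self.local_dp_size only shows up once moe_dense_tp_size is set. A short sketch with hypothetical DeepSeek-style sizes (tp_size=16, dp_size=16), using only the formulas from dp_attention.py:

# Hypothetical sizes, chosen for illustration only.
tp_size, dp_size = 16, 16

for moe_dense_tp_size in (None, 2, 1):
    local_tp_size = moe_dense_tp_size if moe_dense_tp_size else tp_size
    local_dp_size = dp_size // (tp_size // local_tp_size)
    # Old gather/scatter gate: dp_size != 1 (always True here, 16 != 1).
    # New gate: local_dp_size != 1, still True for None (16) and 2 (2),
    # but False for moe_dense_tp_size=1, so that layout would skip the
    # DP gather/scatter branch in forward_ffn_with_full_input.
    print(moe_dense_tp_size, dp_size != 1, local_dp_size != 1)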

python/sglang/srt/models/llama4.py

Lines changed: 4 additions & 5 deletions

@@ -30,7 +30,7 @@
 from sglang.srt.layers.dp_attention import (
     dp_gather_partial,
     dp_scatter,
-    get_attention_dp_size,
+    get_local_attention_dp_size,
     get_attention_tp_rank,
     get_attention_tp_size,
 )

@@ -152,7 +152,6 @@ def __init__(
         self.use_rope = int((layer_id + 1) % 4 != 0)
         self.use_qk_norm = config.use_qk_norm and self.use_rope
 
-        self.dp_size = get_attention_dp_size()
         attn_tp_rank = get_attention_tp_rank()
         attn_tp_size = get_attention_tp_size()
 

@@ -297,7 +296,7 @@ def __init__(
         rope_theta = config.rope_theta
         rope_scaling = config.rope_scaling
         max_position_embeddings = config.max_position_embeddings
-        self.dp_size = get_attention_dp_size()
+        self.local_dp_size = get_local_attention_dp_size()
         self.attn_tp_size = get_attention_tp_size()
         self.attn_tp_rank = get_attention_tp_rank()
 

@@ -360,7 +359,7 @@ def forward(
         # Gather
         if get_tensor_model_parallel_world_size() > 1:
             # all gather and all reduce
-            if self.dp_size != 1:
+            if self.local_dp_size != 1:
                 if self.attn_tp_rank == 0:
                     hidden_states += residual
                 hidden_states, local_hidden_states = (

@@ -385,7 +384,7 @@ def forward(
 
         # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
         # Scatter
-        if self.dp_size != 1:
+        if self.local_dp_size != 1:
             # important: forward batch.gathered_buffer is used both after scatter and after gather.
             # be careful about this!
             hidden_states, global_hidden_states = (
