@@ -411,7 +411,13 @@ def init_cuda_graph_state(self, max_bs: int):
         to avoid memory allocations.
         """
         self.decode_cuda_graph_metadata = {
-            # Page table for token mapping (batch_size, max_context_len)
+            "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "cu_seqlens_q": torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "cu_seqlens_k": torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            ),
             "page_table": torch.zeros(
                 max_bs,
                 (self.max_context_len + self.page_size - 1) // self.page_size,
@@ -427,13 +433,6 @@ def init_cuda_graph_state(self, max_bs: int):
             "strided_indices": torch.arange(
                 0, self.max_context_len, self.page_size, device=self.device
             ),
-            "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
-            "cu_seqlens_q": torch.arange(
-                0, max_bs + 128, dtype=torch.int32, device=self.device
-            ),
-            "cu_seqlens_k": torch.zeros(
-                max_bs + 128, dtype=torch.int32, device=self.device
-            ),
         }

         self.target_verify_metadata = {
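The sizing change above (from `max_bs + 128` down to `max_bs + 1`) works because decode issues exactly one query token per request, so the cumulative query offsets for a batch of size `bs` are exactly `0, 1, ..., bs` and can be served by a slice of one preallocated `arange`. A minimal sketch of that invariant (plain torch; the CPU device and sizes are illustrative, not the backend's real configuration):

```python
import torch

max_bs = 8  # illustrative capture-time maximum batch size

# One query token per request in decode mode, so cumulative query
# lengths are 0, 1, ..., bs for every batch size up to max_bs;
# a single preallocated arange of max_bs + 1 entries covers them all.
cu_seqlens_q = torch.arange(0, max_bs + 1, dtype=torch.int32)

bs = 3
assert cu_seqlens_q[: bs + 1].tolist() == [0, 1, 2, 3]
```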
@@ -471,26 +470,21 @@ def init_forward_metadata_capture_cuda_graph(
         if forward_mode.is_decode():
             if spec_info is not None:
                 # Draft Decode
-                metadata.cu_seqlens_q = torch.arange(
-                    0, bs + 1, dtype=torch.int32, device=device
-                )
                 metadata.cache_seqlens_int32 = self.decode_cuda_graph_metadata[
                     "cache_seqlens"
                 ][:bs]
-
+                metadata.max_seq_len_k = seq_lens.max().item() + (
+                    self.speculative_step_id + 1
+                )
                 metadata.cu_seqlens_q = self.decode_cuda_graph_metadata["cu_seqlens_q"][
                     : bs + 1
                 ]
-
                 metadata.cu_seqlens_k = torch.nn.functional.pad(
                     torch.cumsum(
                         metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
                     ),
                     (1, 0),
                 )
-                metadata.max_seq_len_k = seq_lens.max().item() + (
-                    self.speculative_step_id + 1
-                )
                 metadata.page_table = self.decode_cuda_graph_metadata[
                     "page_table_draft_decode"
                 ][req_pool_indices, :]
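The capture-side change replaces the per-capture `torch.arange` allocation with a slice of the preallocated buffer. Slicing matters here: a slice is a view sharing storage with its parent, so the pointer baked into the captured graph stays valid and replay can refresh the values in place. A small sketch of the pattern (the buffer names mirror the dict above, but this is illustrative, not the backend's real metadata class):

```python
import torch

max_bs, bs = 8, 3
buffers = {
    "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32),
    "cu_seqlens_q": torch.arange(0, max_bs + 1, dtype=torch.int32),
}

# Capture-time: take views into the preallocated buffers instead of
# allocating fresh tensors, so the captured kernels read stable storage.
cache_seqlens = buffers["cache_seqlens"][:bs]
cu_seqlens_q = buffers["cu_seqlens_q"][: bs + 1]

# The views alias the parent buffers: in-place writes to a buffer
# are visible through the sliced tensors the graph captured.
assert cache_seqlens.data_ptr() == buffers["cache_seqlens"].data_ptr()
```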
@@ -560,26 +554,21 @@ def init_forward_metadata_replay_cuda_graph(
         out_cache_loc: torch.Tensor = None,
     ):
         # """Initialize forward metadata for replaying CUDA graph."""
-        device = seq_lens.device
         seq_lens = seq_lens[:bs]
-        req_pool_indices = req_pool_indices[:bs]
         seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
         if forward_mode.is_decode():
             metadata = self.decode_cuda_graph_metadata[bs]

             if spec_info is not None:
                 # Draft Decode
-                max_len = seq_lens_cpu.max().item()
-                metadata.max_seq_len_k = max_len + (self.speculative_step_id + 1)
-
                 metadata.cache_seqlens_int32.copy_(
                     (seq_lens + (self.speculative_step_id + 1)).to(torch.int32)
                 )

                 metadata.max_seq_len_k = seq_lens_cpu.max().item() + (
                     self.speculative_step_id + 1
                 )
-
                 metadata.cu_seqlens_k.copy_(
                     torch.nn.functional.pad(
                         torch.cumsum(
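On the replay side the rule is the inverse: the graph already holds references to the tensors it captured, so new values must be written with `copy_()` rather than by rebinding the metadata attribute to a fresh tensor, which the replayed kernels would never see. Host-side scalars such as `max_seq_len_k` are the exception and are read from `seq_lens_cpu` outside the graph. A minimal sketch of the in-place update, assuming a captured `cu_seqlens_k` for a batch of 3:

```python
import torch

# Stand-in for metadata.cu_seqlens_k as captured by the graph.
captured_cu_seqlens_k = torch.zeros(4, dtype=torch.int32)

# Replay-time: write the new cumulative key lengths in place; rebinding
# the attribute would leave the graph reading the old storage.
new_cache_seqlens = torch.tensor([5, 7, 9], dtype=torch.int32)
captured_cu_seqlens_k.copy_(
    torch.nn.functional.pad(
        torch.cumsum(new_cache_seqlens, dim=0, dtype=torch.int32), (1, 0)
    )
)
assert captured_cu_seqlens_k.tolist() == [0, 5, 12, 21]
```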