Commit 3c51248

Author: niushengxiao (committed)
feat: add flashinfer decode mla operator in the attention module
1 parent a07dabe commit 3c51248

File tree: 5 files changed, +193 -11 lines changed
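Note (not part of the commit): the new decode path is opt-in, gated by the ENABLE_FLASHINFER_DECODE_MLA environment variable read in the files below. A minimal way to turn it on before the process imports these modules:

import os

# Accepted truthy spellings, per the checks in the diff: "ON", "TRUE", "1".
os.environ["ENABLE_FLASHINFER_DECODE_MLA"] = "1"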


lightllm/models/deepseek2/infer_struct.py

Lines changed: 69 additions & 0 deletions
@@ -3,20 +3,70 @@
 import numpy as np
 import torch.distributed as dist
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
+from lightllm.models.deepseek2.triton_kernel.repack_kv_index import repack_kv_index
+import flashinfer


 class Deepseek2InferStateInfo(LlamaInferStateInfo):
     def __init__(self):
         super().__init__()
         self.kv_starts = None
         self.enable_dp = os.getenv("ENABLE_DP", "0").upper() in ["ON", "TRUE", "1"]
+        self.enable_flashinfer_decode_mla = os.getenv("ENABLE_FLASHINFER_DECODE_MLA", "False").upper() in [
+            "ON",
+            "TRUE",
+            "1",
+        ]

     def init_some_extra_state(self, model, input_ids: torch.Tensor):
         super().init_some_extra_state(model, input_ids)
         # This management variable only exists when the decode stage uses the ppl optimized operator
         if not self.is_prefill:
             self.kv_starts = torch.cat([self.b_start_loc, self.b_start_loc[-1:] + self.b_seq_len[-1:]], dim=0)
             self.total_token_num_tensor = torch.sum(self.b_seq_len)
+            if self.enable_flashinfer_decode_mla:
+                self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(input_ids.device)
+                self.q_indptr = torch.arange(self.batch_size + 1, dtype=torch.int32).to(input_ids.device)
+                self.kv_indices = torch.empty(self.batch_size * model.max_seq_length, dtype=torch.int32).to(
+                    input_ids.device
+                )
+                repack_kv_index(
+                    self.req_manager.req_to_token_indexs,
+                    self.b_req_idx,
+                    self.b_seq_len,
+                    self.b_start_loc,
+                    self.max_len_in_batch,
+                    self.kv_indices,
+                )
+                self.wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
+                    self.workspace_buffer,
+                    backend="fa2",
+                    use_cuda_graph=True,
+                    qo_indptr=self.q_indptr,
+                    kv_indices=self.kv_indices,
+                    kv_indptr=self.kv_starts,
+                    kv_len_arr=self.b_seq_len,
+                )
+                self.head_num = model.tp_q_head_num_ * model.world_size_ if self.enable_dp else model.tp_q_head_num_
+                self.kv_lora_rank = model.kv_lora_rank
+                self.qk_rope_head_dim = model.qk_rope_head_dim
+                self.softmax_scale = model.softmax_scale
+                self.q_data_type = model.data_type
+                self.kv_data_type = model.data_type
+                self.wrapper.plan(
+                    self.q_indptr,
+                    self.kv_starts,
+                    self.kv_indices,
+                    self.b_seq_len,
+                    self.head_num,
+                    self.kv_lora_rank,
+                    self.qk_rope_head_dim,
+                    1,
+                    False,  # causal
+                    self.softmax_scale,
+                    self.q_data_type,
+                    self.kv_data_type,
+                )

         if self.is_prefill:
             self.b_kv_start_loc = self.b_seq_len.cumsum(dim=0) - self.b_seq_len
@@ -36,3 +86,22 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
             self.end_idx = self.all_end_idx[rank]

         return
+
+    def copy_for_cuda_graph(self, new_infer_state):
+        super().copy_for_cuda_graph(new_infer_state)
+        if self.enable_flashinfer_decode_mla:
+            self.wrapper.plan(
+                self.q_indptr,
+                self.kv_starts,
+                self.kv_indices,
+                self.b_seq_len,
+                self.head_num,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,  # causal
+                self.softmax_scale,
+                self.q_data_type,
+                self.kv_data_type,
+            )
+        return
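Note (not part of the commit): a minimal sketch of the index buffers that wrapper.plan() consumes during decode, using an assumed toy batch. q_indptr advances by one per request (one new query token each step), kv_starts is effectively the exclusive prefix sum of the per-request kv lengths, and kv_indices holds each request's cache-slot ids laid out contiguously (what repack_kv_index fills from req_to_token_indexs).

import torch

# Toy decode batch of 3 requests with kv lengths 3, 5 and 2 (assumed values).
b_seq_len = torch.tensor([3, 5, 2], dtype=torch.int32)
batch_size = b_seq_len.numel()

# One query token per request during decode.
q_indptr = torch.arange(batch_size + 1, dtype=torch.int32)         # tensor([0, 1, 2, 3])

# Exclusive prefix sum of kv lengths, mirroring self.kv_starts above.
kv_starts = torch.zeros(batch_size + 1, dtype=torch.int32)
kv_starts[1:] = torch.cumsum(b_seq_len, dim=0)                      # tensor([0, 3, 8, 10])

# Per-token cache-slot ids, one contiguous run per request; repack_kv_index
# fills this on the GPU from req_to_token_indexs.
kv_indices = torch.empty(int(b_seq_len.sum()), dtype=torch.int32)   # 10 slots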

lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py

Lines changed: 16 additions & 1 deletion
@@ -69,6 +69,11 @@ def __init__(self, layer_num, tp_rank, world_size, network_config, mode=[]):
         self.num_heads = network_config["num_attention_heads"]
         self.num_kv_heads = network_config["num_key_value_heads"]
         self.enable_opt_decoding_mha = os.getenv("ENABLE_OPT_DECODE_MHA", "False").upper() in ["ON", "TRUE", "1"]
+        self.enable_flashinfer_decode_mla = os.getenv("ENABLE_FLASHINFER_DECODE_MLA", "False").upper() in [
+            "ON",
+            "TRUE",
+            "1",
+        ]
         return

     def _bind_func(self):
@@ -369,7 +374,17 @@ def _token_gqa_decode_attention_flashdecoding(
                 infer_state.b_req_idx,
                 self.softmax_scale,
                 q.shape[-1],
-                q_nope.shape[-1],
+                self.kv_lora_rank,
+            )
+            return o_tensor
+        elif self.enable_flashinfer_decode_mla:
+            infer_state.wrapper.run(
+                q_nope,
+                q_rope,
+                kv[:, :, : -self.qk_rope_head_dim],
+                kv[:, :, -self.qk_rope_head_dim :],
+                out=o_tensor,
+                return_lse=False,
             )
             return o_tensor
         else:
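Note (not part of the commit): the two slices passed to infer_state.wrapper.run() simply split the last dimension of the paged kv buffer into the compressed-latent part and the rope part. A minimal sketch using the sizes that appear in the benchmark later in this diff (512 and 64):

import torch

kv_lora_rank, qk_rope_head_dim = 512, 64
kv = torch.randn(10, 1, kv_lora_rank + qk_rope_head_dim)   # (tokens, kv heads, ckv + kpe)

kv_nope = kv[:, :, :-qk_rope_head_dim]   # compressed latent ("ckv"), width kv_lora_rank
kv_rope = kv[:, :, -qk_rope_head_dim:]   # rotary positional part ("kpe"), width qk_rope_head_dim
assert kv_nope.shape[-1] == kv_lora_rank and kv_rope.shape[-1] == qk_rope_head_dim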

lightllm/models/deepseek2/model.py

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,7 @@
 from lightllm.common.deepseek2_mem_manager import Deepseek2MemoryManager
 from lightllm.common.deepseek2_fp8kv_mem_manager import Deepseek2FP8KVMemoryManager
 from lightllm.utils.log_utils import init_logger
+from lightllm.models.llama.yarn_rotary_utils import get_deepseek_mscale


 logger = init_logger(__name__)
@@ -37,6 +38,15 @@ def _init_some_value(self):
         self.q_lora_rank = self.config["q_lora_rank"]
         self.kv_lora_rank = self.config["kv_lora_rank"]
         self.head_dim_ = self.kv_lora_rank + self.qk_rope_head_dim
+        self.tp_q_head_num_ = self.config["num_attention_heads"] // self.world_size_
+        self.softmax_scale = (self.qk_nope_head_dim + self.qk_rope_head_dim) ** (-0.5)
+        if self.config["rope_scaling"] is not None:
+            rope_scaling = self.config["rope_scaling"]
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", 0)
+            scaling_factor = rope_scaling["factor"]
+            if mscale_all_dim:
+                mscale = get_deepseek_mscale(scaling_factor, mscale_all_dim)
+                self.softmax_scale = self.softmax_scale * mscale * mscale

     def _init_custom(self):
         self._init_to_get_yarn_rotary()
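Note (not part of the commit): a worked sketch of the new softmax_scale logic. The get_deepseek_mscale body below is an assumption based on the usual yarn mscale rule (0.1 * mscale_all_dim * ln(factor) + 1.0); the commit itself imports the real helper from lightllm.models.llama.yarn_rotary_utils.

import math

def get_deepseek_mscale(scaling_factor, mscale_all_dim):
    # Assumed yarn-style mscale; stand-in for the imported helper.
    if scaling_factor <= 1:
        return 1.0
    return 0.1 * mscale_all_dim * math.log(scaling_factor) + 1.0

qk_nope_head_dim, qk_rope_head_dim = 128, 64                      # DeepSeek-V2 head dims (assumed)
softmax_scale = (qk_nope_head_dim + qk_rope_head_dim) ** (-0.5)   # 1 / sqrt(192) ~= 0.0722

rope_scaling = {"factor": 40, "mscale_all_dim": 1.0}              # example yarn config (assumed)
mscale = get_deepseek_mscale(rope_scaling["factor"], rope_scaling["mscale_all_dim"])
softmax_scale = softmax_scale * mscale * mscale                   # ~= 0.0722 * 1.369 ** 2 ~= 0.135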

lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding.py

Lines changed: 10 additions & 10 deletions
@@ -165,7 +165,7 @@ def _fwd_kernel_calcu_index_and_block_seq(
     import flashinfer
     import lightllm_ppl_mla

-    Z, N_CTX, H, D_HEAD, ROPE_HEAD = 200, 16384, 16, 512, 64
+    Z, N_CTX, H, D_HEAD, ROPE_HEAD = 10, 1024, 16, 512, 64
     dtype = torch.bfloat16
     sm_scale = 1.0 / ((D_HEAD + ROPE_HEAD) ** 0.5)
     q_nope = torch.randn((Z, H, D_HEAD), dtype=dtype, device="cuda")
@@ -181,11 +181,11 @@ def _fwd_kernel_calcu_index_and_block_seq(
     b_start_loc = torch.arange(Z).cuda().int() * N_CTX
     b_start_loc[0] = 0
     b_req_idx = torch.arange(Z).cuda().int()
-    req_to_token_indexs = torch.arange(Z * N_CTX, dtype=torch.int32).cuda().view(req_to_token_indexs.shape)
     kv_starts = torch.cat([b_start_loc, b_start_loc[-1:] + b_seq_len[-1:]], dim=0)

     o = torch.zeros((Z, H, D_HEAD), dtype=dtype, device="cuda")
     o1 = torch.zeros((Z, H, D_HEAD), dtype=dtype, device="cuda")
+    o2 = torch.zeros((Z, H, D_HEAD), dtype=dtype, device="cuda")

     infer_state = Deepseek2InferStateInfo()
     infer_state.batch_size = Z
@@ -212,7 +212,6 @@ def _fwd_kernel_calcu_index_and_block_seq(
         sm_scale,
         o,
     )
-    fn1()

     q = torch.cat([q_nope, q_rope], dim=-1)
     fn2 = lambda: lightllm_ppl_mla.decode_mla(
@@ -226,17 +225,18 @@ def _fwd_kernel_calcu_index_and_block_seq(
         D_HEAD + ROPE_HEAD,
         D_HEAD,
     )
-    fn2()

     batch_size = Z
     head_dim_ckv = D_HEAD
     head_dim_kpe = ROPE_HEAD
     num_heads = H
     page_size = 1
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
-    q_indptr = torch.arange(0, batch_size + 1).to(0).int()
+    q_indptr = torch.arange(batch_size + 1).to(0).int()
     kv_indptr = infer_state.kv_starts
     kv_indices = torch.arange(Z * N_CTX).cuda().int()
+    for b, sl, start in zip(b_req_idx, b_seq_len, b_start_loc):
+        kv_indices[start : start + sl] = req_to_token_indexs[b][:sl]
     kv_lens = b_seq_len
     wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
         workspace_buffer,
@@ -261,15 +261,15 @@ def _fwd_kernel_calcu_index_and_block_seq(
         q_nope.dtype,
         kv.dtype,
     )
-    o2 = wrapper.run(q_nope, q_rope, kv_nope, kv_rope, return_lse=False)
-    fn3 = lambda: wrapper.run(q_nope, q_rope, kv_nope, kv_rope, return_lse=False)
+    fn3 = lambda: wrapper.run(q_nope, q_rope, kv_nope, kv_rope, out=o2, return_lse=False)

-    cos_sim1 = F.cosine_similarity(o, o1).mean()
-    cos_sim2 = F.cosine_similarity(o, o2).mean()
-    print(cos_sim1, cos_sim2)
     ms1 = triton.testing.do_bench_cudagraph(fn1)
     ms2 = triton.testing.do_bench_cudagraph(fn2)
     ms3 = triton.testing.do_bench_cudagraph(fn3)
     print(ms1)
     print(ms2)
     print(ms3)
+
+    cos_sim1 = F.cosine_similarity(o, o1).mean()
+    cos_sim2 = F.cosine_similarity(o, o2).mean()
+    print(cos_sim1, cos_sim2)
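Note (not part of the commit): the benchmark wraps each kernel in a zero-argument lambda that writes into a preallocated output (presumably why the flashinfer call now passes out=o2), because triton.testing.do_bench_cudagraph captures the callable into a CUDA graph and replays it. A minimal sketch of that pattern with a stand-in workload:

import torch
import triton

x = torch.randn(4096, 4096, device="cuda")
out = torch.empty_like(x)

# Zero-arg callable writing into a preallocated buffer, suitable for graph capture.
fn = lambda: torch.add(x, 1.0, out=out)
ms = triton.testing.do_bench_cudagraph(fn)
print(ms)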
lightllm/models/deepseek2/triton_kernel/repack_kv_index.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+import torch
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _fwd_kernel_repack_kv_index(
+    kv_index,
+    req_index,
+    out_kv_index,
+    seq_len,
+    start_loc,
+    kv_stride_h,
+    SEQ_BLOCK: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    start_seq_n = tl.program_id(1)
+
+    cur_batch_seq_len = tl.load(seq_len + cur_batch)
+    cur_batch_req_idx = tl.load(req_index + cur_batch)
+    cur_batch_start_loc = tl.load(start_loc + cur_batch)
+
+    offs_seq = start_seq_n * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
+    block_end_loc = tl.minimum((start_seq_n + 1) * SEQ_BLOCK, cur_batch_seq_len)
+    kv_index_data = tl.load(
+        kv_index + kv_stride_h * cur_batch_req_idx + offs_seq,
+        mask=offs_seq < block_end_loc,
+        other=0,
+    )
+    out_kv_index_ptr = out_kv_index + cur_batch_start_loc + offs_seq
+    tl.store(out_kv_index_ptr, kv_index_data, mask=offs_seq < block_end_loc)
+    return
+
+
+@torch.no_grad()
+def repack_kv_index(kv_index, req_index, seq_len, start_loc, max_seq_len, out_kv_index):
+    batch_size = req_index.shape[0]
+    BLOCK = 64
+    grid = (
+        batch_size,
+        triton.cdiv(max_seq_len, BLOCK),
+    )
+
+    _fwd_kernel_repack_kv_index[grid](
+        kv_index,
+        req_index,
+        out_kv_index,
+        seq_len,
+        start_loc,
+        kv_index.stride(0),
+        SEQ_BLOCK=BLOCK,
+        num_warps=8,
+        num_stages=1,
+    )
+    return
+
+
+def repack_kv_ref(req_to_token_indexs, b_req_idx, b_seq_len, b_start_loc, output):
+    for b, sl, start in zip(b_req_idx, b_seq_len, b_start_loc):
+        output[start : start + sl] = req_to_token_indexs[b][:sl]
+
+
+if __name__ == "__main__":
+    import torch.nn.functional as F
+
+    BATCH, MAX_SEQ_LEN = 10, 1024
+    rand_idx = torch.randperm(2 * MAX_SEQ_LEN * BATCH).cuda().int()
+    b_req_idx = torch.randperm(BATCH).cuda().int()
+    b_seq_len = torch.randint(1, MAX_SEQ_LEN, (BATCH,)).cuda().int()
+    req_to_token_indexs = torch.zeros((2 * BATCH, 2 * MAX_SEQ_LEN)).cuda().int()
+    b_start_loc = (
+        torch.cat([torch.zeros([1], device=b_seq_len.device, dtype=b_seq_len.dtype), b_seq_len[0:-1].cumsum(0)])
+        .cuda()
+        .int()
+    )
+
+    output = torch.zeros((b_seq_len.sum(),)).cuda().int()
+    ref = torch.zeros((b_seq_len.sum(),)).cuda().int()
+    for b, sl, start in zip(b_req_idx, b_seq_len, b_start_loc):
+        req_to_token_indexs[b][:sl] = rand_idx[start : start + sl]
+
+    fn1 = lambda: repack_kv_ref(req_to_token_indexs, b_req_idx, b_seq_len, b_start_loc, ref)
+    fn2 = lambda: repack_kv_index(req_to_token_indexs, b_req_idx, b_seq_len, b_start_loc, MAX_SEQ_LEN, output)
+    ms1 = triton.testing.do_bench(fn1)
+    ms2 = triton.testing.do_bench_cudagraph(fn2)
+    print(ms1, ms2)
+    assert torch.allclose(output.float(), ref.float())
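Note (not part of the commit): repack_kv_index gathers each request's cache-slot ids from its row of req_to_token_indexs into one contiguous output buffer, starting at that request's b_start_loc. A tiny CPU illustration with assumed values:

import torch

req_to_token_indexs = torch.tensor([[10, 11, 12, 0], [20, 21, 0, 0]], dtype=torch.int32)
b_req_idx = torch.tensor([1, 0], dtype=torch.int32)    # request 1 comes first in the batch
b_seq_len = torch.tensor([2, 3], dtype=torch.int32)
b_start_loc = torch.tensor([0, 2], dtype=torch.int32)

out = torch.zeros(int(b_seq_len.sum()), dtype=torch.int32)
for req, sl, start in zip(b_req_idx, b_seq_len, b_start_loc):
    out[start : start + sl] = req_to_token_indexs[req][:sl]
print(out)  # tensor([20, 21, 10, 11, 12], dtype=torch.int32)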
