
Commit 47bc8df

Add rotary_position_embedding_cpu kernel instead of native impl (#18)

* add rope
* remove B
* Fix issue
* update
* Add fused rope
* refactor
* add checks
* support non-contiguous
* update parallel

1 parent 731290e commit 47bc8df

File tree

6 files changed: +280 additions, -27 deletions

- python/sglang/srt/layers/rotary_embedding.py
- sgl-kernel/csrc/cpu/rope.cpp
- sgl-kernel/csrc/cpu/torch_extension_cpu.cpp
- sgl-kernel/python/sgl_kernel/cpu.py
- sgl-kernel/setup.py
- test/srt/test_rope.py

python/sglang/srt/layers/rotary_embedding.py

Lines changed: 35 additions & 27 deletions
@@ -15,6 +15,9 @@
 if _is_cuda_available:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace

+from sglang.srt.cpu_utils import cpu_has_amx_support
+if cpu_has_amx_support():
+    import sgl_kernel.cpu

 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
     x1 = x[..., : x.shape[-1] // 2]
@@ -719,37 +722,42 @@ def forward(
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
-        query_rot = query[..., : self.rotary_dim]
-        key_rot = key[..., : self.rotary_dim]
-        if self.rotary_dim < self.head_size:
-            query_pass = query[..., self.rotary_dim :]
-            key_pass = key[..., self.rotary_dim :]
+        positions = torch.add(positions, offsets) if offsets is not None else positions

-        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(positions.device)
-        cos_sin = self.cos_sin_cache[
-            torch.add(positions, offsets) if offsets is not None else positions
-        ]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-        if self.is_neox_style:
-            # NOTE(woosuk): Here we assume that the positions tensor has the
-            # shape [batch_size, seq_len].
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        # TODO: Add scenario of self.rotary_dim < self.head_size
+        if positions.device == torch.device("cpu") and cpu_has_amx_support():
+            return sgl_kernel.cpu.rotary_position_embedding(
+                positions, query, key, self.cos_sin_cache)
         else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(positions.device)
+            query_rot = query[..., : self.rotary_dim]
+            key_rot = key[..., : self.rotary_dim]
+            if self.rotary_dim < self.head_size:
+                query_pass = query[..., self.rotary_dim :]
+                key_pass = key[..., self.rotary_dim :]
+
+            cos_sin = self.cos_sin_cache[positions]
+            cos, sin = cos_sin.chunk(2, dim=-1)
+            if self.is_neox_style:
+                # NOTE(woosuk): Here we assume that the positions tensor has the
+                # shape [batch_size, seq_len].
+                cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+                sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+            else:
+                cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+                sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)

-        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
-        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
-        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
+            rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+            query_rot = query_rot * cos + rotate_fn(query_rot) * sin
+            key_rot = key_rot * cos + rotate_fn(key_rot) * sin

-        if self.rotary_dim < self.head_size:
-            query = torch.cat((query_rot, query_pass), dim=-1)
-            key = torch.cat((key_rot, key_pass), dim=-1)
-        else:
-            query = query_rot
-            key = key_rot
-        return query, key
+            if self.rotary_dim < self.head_size:
+                query = torch.cat((query_rot, query_pass), dim=-1)
+                key = torch.cat((key_rot, key_pass), dim=-1)
+            else:
+                query = query_rot
+                key = key_rot
+            return query, key


 class Llama3RotaryEmbedding(RotaryEmbedding):
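
For reference, the non-neox branch above applies the GPT-J-style rotation: each adjacent (even, odd) channel pair is rotated by the angle cached for its position. Below is a minimal self-contained sketch of that math, assuming 1-D positions and rotary_dim == head_size (the only case the fused CPU path handles, per the TODO in the diff); the helper name rope_gptj_ref is ours for illustration, not part of the commit.

import torch

def rope_gptj_ref(positions, q, k, cos_sin_cache):
    # Illustrative helper, not part of the commit.
    # positions: [S] int64; q: [S, Hq, D]; k: [S, Hk, D]
    # cos_sin_cache: [max_pos, D], cos in the first D/2 columns, sin in the rest
    cos, sin = cos_sin_cache[positions].chunk(2, dim=-1)  # each [S, D/2]
    cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)  # [S, 1, D], broadcasts over heads
    sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)

    def rotate(x):  # per pair: (x_even, x_odd) -> (-x_odd, x_even)
        x1, x2 = x[..., ::2], x[..., 1::2]
        return torch.stack((-x2, x1), dim=-1).flatten(-2)

    return q * cos + rotate(q) * sin, k * cos + rotate(k) * sin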

sgl-kernel/csrc/cpu/rope.cpp

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+#include "common.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t>
+void rope_kernel_impl(
+    scalar_t* __restrict__ q_pe_out,
+    scalar_t* __restrict__ k_pe_out,
+    int64_t* __restrict__ t_pos,
+    scalar_t* __restrict__ q_pe,
+    scalar_t* __restrict__ k_pe,
+    scalar_t* __restrict__ t_emb_pos,
+    int64_t seq_len,
+    int64_t num_head,
+    int64_t rotary_dim,
+    int64_t HR,
+    int64_t q_pe_stride_s,
+    int64_t out_stride_qs,
+    int64_t out_stride_ks,
+    int64_t HK,
+    int64_t k_pe_stride_s,
+    int64_t q_pe_stride_n,
+    int64_t out_stride_qn) {
+  int64_t COFF = HR / 2;
+  at::parallel_for(0, seq_len * num_head, GRAIN_SIZE / rotary_dim, [&](int64_t begin, int64_t end) {
+    int64_t seq{0}, head_id{0};
+    data_index_init(begin, seq, seq_len, head_id, num_head);
+    for (int64_t i = begin; i < end; ++i) {
+      int64_t in_offset_q = seq * q_pe_stride_s + head_id * q_pe_stride_n;
+      int64_t out_offset_q = seq * out_stride_qs + head_id * out_stride_qn;
+      int64_t out_offset_k = seq * out_stride_ks;
+      int64_t p = 0;
+      scalar_t* sin_start = nullptr;
+      scalar_t* cos_start = nullptr;
+      // step 0) get the rotary position embedding for the current position
+      p = t_pos[seq];
+      sin_start = t_emb_pos + p * HR + COFF;
+      cos_start = t_emb_pos + p * HR;
+      // step 1) apply_rotary_pos_emb for the rotary_dim elements in every
+      // head of query/key
+      for (int64_t h = 0; h < rotary_dim; h += 2) {
+        scalar_t cos = cos_start[h >> 1];
+        scalar_t sin = sin_start[h >> 1];
+        scalar_t in1 = q_pe[in_offset_q + h];
+        scalar_t in2 = q_pe[in_offset_q + h + 1];
+        scalar_t out1 = in1 * cos - in2 * sin;
+        scalar_t out2 = in2 * cos + in1 * sin;
+        q_pe_out[out_offset_q + h] = out1;
+        q_pe_out[out_offset_q + h + 1] = out2;
+      }
+      for (int64_t h = 0; h < HK; h += 2) {
+        scalar_t cos = cos_start[h >> 1];
+        scalar_t sin = sin_start[h >> 1];
+        int64_t k_pe_offset = seq * k_pe_stride_s;
+        scalar_t in1_k = k_pe[k_pe_offset + h];
+        scalar_t in2_k = k_pe[k_pe_offset + h + 1];
+        scalar_t out1_k = in1_k * cos - in2_k * sin;
+        scalar_t out2_k = in2_k * cos + in1_k * sin;
+        k_pe_out[out_offset_k + h] = out1_k;
+        k_pe_out[out_offset_k + h + 1] = out2_k;
+      }
+      // move to the next index
+      data_index_step(seq, seq_len, head_id, num_head);
+    }
+  });
+}
+} // namespace
+
+std::tuple<at::Tensor, at::Tensor>
+rotary_position_embedding_cpu(at::Tensor& t_pos, at::Tensor& q_pe, at::Tensor& k_pe, at::Tensor& t_emb_pos) {
+  RECORD_FUNCTION(
+      "sgl-kernel::rotary_position_embedding_cpu", std::vector<c10::IValue>({t_pos, q_pe, k_pe, t_emb_pos}));
+  CHECK_INPUT(t_pos);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(q_pe);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(k_pe);
+  CHECK_INPUT(t_emb_pos);
+  CHECK_DIM(1, t_pos);
+  CHECK_DIM(3, q_pe);
+  CHECK_DIM(3, k_pe);
+  CHECK_DIM(2, t_emb_pos);
+
+  int64_t seq_len = q_pe.size(0);
+  int64_t num_head = q_pe.size(1);
+  int64_t rotary_dim = q_pe.size(2);
+  int64_t HK = k_pe.size(2);
+  int64_t HR = t_emb_pos.size(1);
+  CHECK_EQ(HR, rotary_dim);
+  CHECK_EQ(k_pe.size(0), seq_len);
+  CHECK_EQ(k_pe.size(1), 1);
+  CHECK_EQ(t_pos.size(0), seq_len);
+  CHECK_EQ(HK, rotary_dim);
+
+  at::Tensor q_pe_out = at::empty_like(q_pe);
+  at::Tensor k_pe_out = at::empty_like(k_pe);
+  int64_t q_pe_stride_s = q_pe.stride(0);
+  int64_t q_pe_stride_n = q_pe.stride(1);
+  int64_t k_pe_stride_s = k_pe.stride(0);
+  int64_t out_stride_qs = q_pe_out.stride(0);
+  int64_t out_stride_qn = q_pe_out.stride(1);
+  int64_t out_stride_ks = k_pe_out.stride(0);
+
+  const auto input_dtype = q_pe.scalar_type();
+  TORCH_CHECK(t_pos.scalar_type() == at::kLong, "expect positions to be int64, got ", t_pos.scalar_type());
+  TORCH_CHECK(input_dtype == k_pe.scalar_type(), "q_pe and k_pe must have the same data type");
+  TORCH_CHECK(input_dtype == t_emb_pos.scalar_type(), "q_pe and t_emb_pos must have the same data type");
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input_dtype, "rotary_position_embedding_cpu", [&] {
+    rope_kernel_impl<scalar_t>(
+        q_pe_out.data_ptr<scalar_t>(),
+        k_pe_out.data_ptr<scalar_t>(),
+        t_pos.data_ptr<int64_t>(),
+        q_pe.data_ptr<scalar_t>(),
+        k_pe.data_ptr<scalar_t>(),
+        t_emb_pos.data_ptr<scalar_t>(),
+        seq_len,
+        num_head,
+        rotary_dim,
+        HR,
+        q_pe_stride_s,
+        out_stride_qs,
+        out_stride_ks,
+        HK,
+        k_pe_stride_s,
+        q_pe_stride_n,
+        out_stride_qn);
+  });
+  return std::make_tuple(q_pe_out, k_pe_out);
+}
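
The kernel flattens the (seq, head) loop into a single at::parallel_for range and re-derives the two indices with data_index_init/data_index_step, sizing the grain by rotary_dim. Each inner iteration applies a 2x2 rotation to an adjacent element pair, reading cos from the start of the cache row for position p and sin from an offset of HR/2 (COFF), so the cache layout is [cos | sin] per row. A hedged Python restatement of that inner loop, vectorized over pairs (the name rotate_pairs is ours, for exposition only):

import torch

def rotate_pairs(x, cos_half, sin_half):
    # Illustrative restatement of the C++ inner loop, not part of the commit.
    # x: [..., D] with interleaved pairs; cos_half, sin_half: [..., D // 2]
    x1, x2 = x[..., ::2], x[..., 1::2]              # in1 = x[h], in2 = x[h + 1]
    out = torch.empty_like(x)
    out[..., ::2] = x1 * cos_half - x2 * sin_half   # out1 = in1*cos - in2*sin
    out[..., 1::2] = x2 * cos_half + x1 * sin_half  # out2 = in2*cos + in1*sin
    return out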

sgl-kernel/csrc/cpu/torch_extension_cpu.cpp

Lines changed: 7 additions & 0 deletions
@@ -81,6 +81,10 @@ void initialize(int size, int rank);
 // shared mmeory all_reduce
 void shm_allreduce(at::Tensor& data, c10::intrusive_ptr<c10d::ProcessGroup> process_group, py::object op);

+// rope
+std::tuple<at::Tensor, at::Tensor> rotary_position_embedding_cpu(at::Tensor& t_pos, at::Tensor& q_pe,
+                                                                 at::Tensor& k_pe, at::Tensor& t_emb_pos);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   // activation
   m.def("silu_and_mul_cpu", &silu_and_mul_cpu, "SiLU and mul for CPU");
@@ -122,4 +126,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   // all reduce
   m.def("initialize", &initialize, "shared memory initialization for CPU");
   m.def("shm_allreduce", &shm_allreduce, "low latency all_reduce implementation for CPU");
+
+  // rope
+  m.def("rotary_position_embedding_cpu", &rotary_position_embedding_cpu, "rotary position embedding for CPU");
 }

sgl-kernel/python/sgl_kernel/cpu.py

Lines changed: 12 additions & 0 deletions
@@ -200,3 +200,15 @@ def int8_scaled_mm(

 def per_token_quant_int8(x):
     return sgl_kernel.common_ops.per_token_quant_int8_cpu(x)
+def rotary_position_embedding(
+    t_pos,
+    q_pe,
+    k_pe,
+    t_emb_pos,
+):
+    return sgl_kernel.common_ops.rotary_position_embedding_cpu(
+        t_pos,
+        q_pe,
+        k_pe,
+        t_emb_pos,
+    )
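
A minimal usage sketch of this wrapper, mirroring the shapes the unit test below exercises. It assumes sgl-kernel was built with the CPU extension (csrc/cpu/rope.cpp) on a supported machine; per the C++ checks, positions must be int64, k_pe must have a single head, and the last dimension of q_pe/k_pe must equal the cache row width:

import torch
import sgl_kernel.cpu

seq_len, num_head, rotary_dim, max_pos = 1024, 16, 64, 256
positions = torch.randint(0, max_pos, (seq_len,))                        # int64, shape [seq_len]
q_pe = torch.randn(seq_len, num_head, rotary_dim, dtype=torch.bfloat16)  # query rope channels
k_pe = torch.randn(seq_len, 1, rotary_dim, dtype=torch.bfloat16)         # k_pe.size(1) must be 1
cos_sin_cache = torch.randn(max_pos, rotary_dim, dtype=torch.bfloat16)   # [cos | sin] per row
q_out, k_out = sgl_kernel.cpu.rotary_position_embedding(
    positions, q_pe, k_pe, cos_sin_cache
)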

sgl-kernel/setup.py

Lines changed: 1 addition & 0 deletions
@@ -155,6 +155,7 @@ def copy_deepgemm_to_build_lib(self):
         "csrc/cpu/moe.cpp",
         "csrc/cpu/moe_int8.cpp",
         "csrc/cpu/norm.cpp",
+        "csrc/cpu/rope.cpp",
         "csrc/cpu/topk.cpp",
         "csrc/cpu/interface.cpp",
         "csrc/cpu/shm.cpp",

test/srt/test_rope.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+import unittest
+import expecttest
+
+import torch
+import sgl_kernel.cpu
+
+class TestROPE(expecttest.TestCase):
+    def test_deepseek_v2_rope(self):
+        def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
+            x1 = x[..., : x.shape[-1] // 2]
+            x2 = x[..., x.shape[-1] // 2 :]
+            return torch.cat((-x2, x1), dim=-1)
+
+        def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
+            x1 = x[..., ::2]
+            x2 = x[..., 1::2]
+            x = torch.stack((-x2, x1), dim=-1)
+            return x.flatten(-2)
+
+        def forward_ref(positions, query, key, cos_sin_cache, offsets=None):
+            self.rotary_dim = 64
+            self.head_size = 64
+            self.is_neox_style = False
+            query_rot = query[..., : self.rotary_dim]
+            key_rot = key[..., : self.rotary_dim]
+            if self.rotary_dim < self.head_size:
+                query_pass = query[..., self.rotary_dim :]
+                key_pass = key[..., self.rotary_dim :]
+
+            cos_sin = cos_sin_cache[
+                torch.add(positions, offsets) if offsets is not None else positions
+            ]
+            cos, sin = cos_sin.chunk(2, dim=-1)
+            if self.is_neox_style:
+                # NOTE(woosuk): Here we assume that the positions tensor has the
+                # shape [batch_size, seq_len].
+                cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+                sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+            else:
+                cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+                sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+
+            rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+            query_rot = query_rot * cos + rotate_fn(query_rot) * sin
+            key_rot = key_rot * cos + rotate_fn(key_rot) * sin
+
+            if self.rotary_dim < self.head_size:
+                query = torch.cat((query_rot, query_pass), dim=-1)
+                key = torch.cat((key_rot, key_pass), dim=-1)
+            else:
+                query = query_rot
+                key = key_rot
+            return query, key
+
+        num_head = 16
+        seq_len = 1024
+        q_head_dim = 192
+        qk_nope_head_dim = 128
+        qk_rope_head_dim = 64
+        max_pos = 256
+        k_dim = 576
+
+        # Create cos_sin_cache
+        freqs = torch.rand(max_pos, qk_rope_head_dim // 2)
+        cos = freqs.cos() * 0.7
+        sin = freqs.sin() * 0.7
+        cos_sin_cache = torch.cat((cos, sin), dim=-1).to(torch.bfloat16)
+        positions = torch.randint(0, max_pos, (seq_len,))
+
+        for dtype in [torch.bfloat16]:
+            enable_autocast = True
+
+            with torch.no_grad(), torch.cpu.amp.autocast(enabled=enable_autocast):
+                q = torch.randn(seq_len, num_head, q_head_dim, dtype=dtype)
+                q_clone = q.clone()
+                k = torch.randn(seq_len, 1, k_dim, dtype=dtype)
+                k_clone = k.clone()
+                _, q_pe = q.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
+                _, q_pe_clone = q_clone.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
+                k_pe = k[:, :, k_dim - qk_rope_head_dim :]
+                k_pe_clone = k_clone[:, :, k_dim - qk_rope_head_dim :]
+
+                # ref kernel
+                q_pe, k_pe = forward_ref(positions, q_pe, k_pe, cos_sin_cache)
+
+                # fused rope kernel
+                q_pe_clone, k_pe_clone = sgl_kernel.cpu.rotary_position_embedding(
+                    positions, q_pe_clone, k_pe_clone, cos_sin_cache
+                )
+
+                assert torch.allclose(q_pe, q_pe_clone)
+                assert torch.allclose(k_pe, k_pe_clone)
+
+
+if __name__ == "__main__":
+    unittest.main()
