
Commit c859bd6

mingfeima, chunyuan-w, yanbing-j, and blzheng authored and committed
Add optimized native kernels in sgl-kernel (sgl-project#5150)
Co-authored-by: Chunyuan WU <[email protected]>
Co-authored-by: YanbingJiang <[email protected]>
Co-authored-by: blzheng <[email protected]>
1 parent 2d60fa1 commit c859bd6

20 files changed: +7792 -0 lines changed

sgl-kernel/csrc/cpu/activation.cpp

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
#include "common.h"
#include "vec.h"

namespace {

template <typename scalar_t, typename func_t, typename vec_func_t>
void act_and_mul_kernel_impl(
    scalar_t* __restrict__ output,
    const scalar_t* __restrict__ input,
    int64_t num_tokens,
    int64_t dim,
    const func_t& f,
    const vec_func_t& vf) {
  using bVec = at::vec::Vectorized<scalar_t>;
  using fVec = at::vec::Vectorized<float>;

  constexpr int64_t kVecSize = bVec::size();
  at::parallel_for(0, num_tokens, 0, [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; ++i) {
      // local ptrs
      const scalar_t* __restrict__ input_ptr = input + i * 2 * dim;
      const scalar_t* __restrict__ input_other_ptr = input_ptr + dim;
      scalar_t* __restrict__ output_ptr = output + i * dim;

      int64_t d;
#pragma GCC unroll 4
      for (d = 0; d <= dim - kVecSize; d += kVecSize) {
        bVec x_bvec = bVec::loadu(input_ptr + d);
        fVec x_fvec0, x_fvec1;
        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);

        bVec y_bvec = bVec::loadu(input_other_ptr + d);
        fVec y_fvec0, y_fvec1;
        std::tie(y_fvec0, y_fvec1) = at::vec::convert_to_float(y_bvec);

        x_fvec0 = vf(x_fvec0);
        x_fvec1 = vf(x_fvec1);

        x_fvec0 = x_fvec0 * y_fvec0;
        x_fvec1 = x_fvec1 * y_fvec1;

        x_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
        x_bvec.store(output_ptr + d);
      }
#pragma GCC unroll 4
      for (; d < dim; ++d) {
        float x_val = static_cast<float>(input_ptr[d]);
        float y_val = static_cast<float>(input_other_ptr[d]);
        output_ptr[d] = f(x_val) * y_val;
      }
    }
  });
}

}  // anonymous namespace

// input  : {num_tokens, 2 * d}
// output : {num_tokens, d}
at::Tensor silu_and_mul_cpu(at::Tensor& input) {
  RECORD_FUNCTION("sgl-kernel::silu_and_mul_cpu", std::vector<c10::IValue>({input}));
  auto sizes = input.sizes().vec();
  int64_t last_dim = input.ndimension() - 1;
  int64_t d = sizes[last_dim] / 2;
  sizes[last_dim] = d;
  int64_t num_tokens = input.numel() / input.size(-1);
  at::Tensor out = at::empty(sizes, input.options());

  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "silu_and_mul", [&] {
    using Vec = at::vec::Vectorized<float>;
    act_and_mul_kernel_impl(
        out.data_ptr<scalar_t>(),
        input.data_ptr<scalar_t>(),
        num_tokens,
        d,
        [](float x) { return x / (1.f + std::exp(-x)); },
        [](Vec x) { return x / (Vec(1.f) + x.neg().exp()); });
  });
  return out;
}
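Note (not part of the commit): the semantics of the vectorized kernel above can be expressed as a plain scalar loop. The sketch below is a minimal reference only, assuming a contiguous row-major {num_tokens, 2 * d} float input as described in the shape comments; the real kernel vectorizes this loop and dispatches on bfloat16/float16.

#include <cmath>
#include <cstdint>
#include <vector>

// Scalar reference of silu_and_mul: out[i][j] = silu(in[i][j]) * in[i][d + j].
// Hypothetical helper for illustration only.
std::vector<float> silu_and_mul_ref(const std::vector<float>& input, int64_t num_tokens, int64_t d) {
  std::vector<float> out(num_tokens * d);
  for (int64_t i = 0; i < num_tokens; ++i) {
    const float* x = input.data() + i * 2 * d;  // gate half
    const float* y = x + d;                     // up half
    for (int64_t j = 0; j < d; ++j) {
      float silu = x[j] / (1.f + std::exp(-x[j]));
      out[i * d + j] = silu * y[j];
    }
  }
  return out;
}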

sgl-kernel/csrc/cpu/bmm.cpp

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
#include "common.h"
#include "gemm.h"
#include "vec.h"

namespace {

template <typename scalar_t>
void bmm_kernel_impl(
    scalar_t* __restrict__ out,
    const scalar_t* __restrict__ mat1,
    const scalar_t* __restrict__ mat2,
    int64_t B,
    int64_t M,
    int64_t N,
    int64_t K,
    int64_t mat1_strideB,
    int64_t mat1_strideM,
    int64_t out_strideB,
    int64_t out_strideM,
    float scale = 0.f) {
  constexpr int64_t BLOCK_M = block_size_m();
  constexpr int64_t BLOCK_N = block_size_n();
  const int64_t MB = div_up(M, BLOCK_M);
  const int64_t NB = div_up(N, BLOCK_N);

  // mat2 contiguous in [B, N, K]
  int64_t mat2_strideB = N * K;
  int64_t mat2_strideN = K;

  const bool use_brgemm = can_use_brgemm<scalar_t>(M);

  // parallel on [B, MB, NB]
  at::parallel_for(0, B * MB * NB, 0, [&](int64_t begin, int64_t end) {
    int64_t bs{0}, mb{0}, nb{0};
    data_index_init(begin, bs, B, mb, MB, nb, NB);

    // for brgemm, use float32 for accumulate
    alignas(64) float Ctmp[BLOCK_M * BLOCK_N];

    for (int i = begin; i < end; ++i) {
      UNUSED(i);
      int mb_start = mb * BLOCK_M;
      int mb_size = std::min(M - mb_start, BLOCK_M);
      int nb_start = nb * BLOCK_N;
      int nb_size = std::min(N - nb_start, BLOCK_N);

      tinygemm_kernel<scalar_t>(
          /* A   */ mat1 + bs * mat1_strideB + mb_start * mat1_strideM,
          /* B   */ mat2 + bs * mat2_strideB + nb_start * mat2_strideN /* nb * BLOCK_N * K */,
          /* C   */ out + bs * out_strideB + mb_start * out_strideM + nb_start,
          /* Ctmp*/ Ctmp,
          /* M   */ mb_size,
          /* N   */ nb_size,
          /* K   */ K,
          /* lda */ mat1_strideM,
          /* ldb */ nb_size,
          /* ldc */ out_strideM,
          /* brg */ use_brgemm);

      // move to the next index
      data_index_step(bs, B, mb, MB, nb, NB);
    }

    if (use_brgemm) {
      at::native::cpublas::brgemm_release();
    }
  });
}

}  // anonymous namespace

// mat1 : [B, M, K]
// mat2 : [B, N, K] or [B, OC, IC]
// out  : [B, M, N]
// scale: [] 0-dim tensor for per tensor quant
//
void bmm_cpu(at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni, std::optional<at::Tensor>& scale) {
  RECORD_FUNCTION("sgl-kernel::bmm_cpu", std::vector<c10::IValue>({out, mat1, mat2}));

  auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);

  // input and out could be non-contiguous
  // weight needs to be contiguous in [OC, IC] order
  CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1);
  CHECK_LAST_DIM_CONTIGUOUS_INPUT(out);
  CHECK_INPUT(mat2);
  CHECK_DIM(3, out);
  CHECK_DIM(3, mat1);
  CHECK_DIM(3, mat2);

  int64_t B = mat1.size(0);
  int64_t M = mat1.size(1);
  int64_t N = mat2.size(1);
  int64_t K = mat1.size(2);

  TORCH_CHECK(!scale.has_value(), "bmm: do not support fp8 weight for now.");
  TORCH_CHECK(N % 32 == 0, "tinygemm requires N to be 32x.");

  int64_t mat1_strideB = mat1.stride(0);
  int64_t mat1_strideM = mat1.stride(1);
  int64_t out_strideB = out.stride(0);
  int64_t out_strideM = out.stride(1);

  // check shapes
  TORCH_CHECK(mat2.size(0) == B && mat2.size(2) == K, "bmm: mat2 shape mismatch!");
  TORCH_CHECK(out.size(0) == B && out.size(1) == M, "bmm: out shape mismatch!");

  AT_DISPATCH_REDUCED_FLOATING_TYPES(mat1.scalar_type(), "bmm_kernel_impl", [&] {
    bmm_kernel_impl<scalar_t>(
        out.data_ptr<scalar_t>(),
        mat1.data_ptr<scalar_t>(),
        packed_w.data_ptr<scalar_t>(),
        B,
        M,
        N,
        K,
        mat1_strideB,
        mat1_strideM,
        out_strideB,
        out_strideM);
  });
}
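Note (not part of the commit): mat2 is laid out as [B, N, K], i.e. transposed per batch, so the blocked kernel computes out[b][m][n] = sum_k mat1[b][m][k] * mat2[b][n][k]. The sketch below is a minimal scalar reference under the assumption of contiguous float buffers; it ignores the M/N blocking, VNNI weight packing, and brgemm paths of the real kernel.

#include <cstdint>
#include <vector>

// Scalar reference of the batched GEMM above. Illustration only.
std::vector<float> bmm_ref(
    const std::vector<float>& mat1,  // [B, M, K], contiguous
    const std::vector<float>& mat2,  // [B, N, K], contiguous
    int64_t B, int64_t M, int64_t N, int64_t K) {
  std::vector<float> out(B * M * N, 0.f);
  for (int64_t b = 0; b < B; ++b)
    for (int64_t m = 0; m < M; ++m)
      for (int64_t n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int64_t k = 0; k < K; ++k)
          acc += mat1[(b * M + m) * K + k] * mat2[(b * N + n) * K + k];
        out[(b * M + m) * N + n] = acc;
      }
  return out;
}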

sgl-kernel/csrc/cpu/common.h

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
#pragma once

#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/record_function.h>

#if defined(_OPENMP)
#include <omp.h>
#endif

namespace {

// dispatch bool
#define AT_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \
  [&] {                                          \
    if (BOOL_V) {                                \
      constexpr bool BOOL_NAME = true;           \
      return __VA_ARGS__();                      \
    } else {                                     \
      constexpr bool BOOL_NAME = false;          \
      return __VA_ARGS__();                      \
    }                                            \
  }()

// dispatch: bfloat16, float16, int8_t
#define CPU_DISPATCH_PACKED_TYPES(TYPE, ...)                     \
  [&] {                                                          \
    switch (TYPE) {                                              \
      case at::ScalarType::BFloat16: {                           \
        using packed_t = at::BFloat16;                           \
        return __VA_ARGS__();                                    \
      }                                                          \
      case at::ScalarType::Half: {                               \
        using packed_t = at::Half;                               \
        return __VA_ARGS__();                                    \
      }                                                          \
      case at::ScalarType::Char: {                               \
        using packed_t = int8_t;                                 \
        return __VA_ARGS__();                                    \
      }                                                          \
      default:                                                   \
        TORCH_CHECK(false, "Unsupported floating data type.\n"); \
    }                                                            \
  }()

#define UNUSED(x) (void)(x)

#define CHECK_CPU(x) TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")

#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x " must be contiguous at last dimension")

#define CHECK_INPUT(x) \
  CHECK_CPU(x);        \
  CHECK_CONTIGUOUS(x)
#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
  CHECK_CPU(x);                            \
  CHECK_LAST_DIM_CONTIGUOUS(x)

#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")

#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)

// parallel routines
constexpr int GRAIN_SIZE = 1024;

template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) {
  return (x + y - 1) / y;
}

template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
  // onednn partition pattern
  T& n_my = n_end;
  if (nth <= 1 || n == 0) {
    n_start = 0;
    n_my = n;
  } else {
    T n1 = div_up(n, nth);
    T n2 = n1 - 1;
    T T1 = n - n2 * nth;
    n_my = ith < T1 ? n1 : n2;
    n_start = ith <= T1 ? ith * n1 : T1 * n1 + (ith - T1) * n2;
  }
  n_end += n_start;
#else
  // pytorch aten partition pattern
  T n_my = div_up(n, nth);
  n_start = ith * n_my;
  n_end = std::min(n_start + n_my, n);
#endif
}

template <typename func_t>
inline void parallel_for(int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel
  {
    int nth = omp_get_num_threads();
    int ith = omp_get_thread_num();
    int tbegin, tend;
    balance211(n, nth, ith, tbegin, tend);
    f(tbegin, tend);
  }
#else
  f(0, n);
#endif
}

// data indexing for dimension collapse
template <typename T>
inline T data_index_init(T offset) {
  return offset;
}

template <typename T, typename... Args>
inline T data_index_init(T offset, T& x, const T& X, Args&&... args) {
  offset = data_index_init(offset, std::forward<Args>(args)...);
  x = offset % X;
  return offset / X;
}

inline bool data_index_step() {
  return true;
}

template <typename T, typename... Args>
inline bool data_index_step(T& x, const T& X, Args&&... args) {
  if (data_index_step(std::forward<Args>(args)...)) {
    x = ((x + 1) == X) ? 0 : (x + 1);
    return x == 0;
  }
  return false;
}

// forced unroll for perf critical path

#if __has_attribute(always_inline)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

template <int n>
struct Unroll {
  template <typename Func, typename... Args>
  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
    Unroll<n - 1>{}(f, args...);
    f(std::integral_constant<int, n - 1>{}, args...);
  }
};

template <>
struct Unroll<1> {
  template <typename Func, typename... Args>
  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
    f(std::integral_constant<int, 0>{}, args...);
  }
};

}  // anonymous namespace
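Note (not part of the commit): data_index_init/data_index_step collapse a flat loop index into a multi-dimensional index with the last dimension varying fastest, which is how bmm_kernel_impl walks its [B, MB, NB] iteration space inside at::parallel_for. The standalone sketch below re-declares these two helpers exactly as defined above so it compiles on its own, and walks a small hypothetical 2x2x3 space.

#include <cstdint>
#include <cstdio>
#include <utility>

// Standalone copies of the indexing helpers from common.h, for illustration only.
template <typename T>
inline T data_index_init(T offset) { return offset; }

template <typename T, typename... Args>
inline T data_index_init(T offset, T& x, const T& X, Args&&... args) {
  offset = data_index_init(offset, std::forward<Args>(args)...);
  x = offset % X;
  return offset / X;
}

inline bool data_index_step() { return true; }

template <typename T, typename... Args>
inline bool data_index_step(T& x, const T& X, Args&&... args) {
  if (data_index_step(std::forward<Args>(args)...)) {
    x = ((x + 1) == X) ? 0 : (x + 1);
    return x == 0;  // carry to the next (outer) dimension
  }
  return false;
}

int main() {
  // Walk the flat range [0, B*MB*NB) as a 3-D index (bs, mb, nb);
  // nb varies fastest, matching the "parallel on [B, MB, NB]" comment in bmm.cpp.
  const int64_t B = 2, MB = 2, NB = 3;
  int64_t bs = 0, mb = 0, nb = 0;
  const int64_t begin = 0, end = B * MB * NB;
  data_index_init(begin, bs, B, mb, MB, nb, NB);
  for (int64_t i = begin; i < end; ++i) {
    std::printf("i=%lld -> (bs=%lld, mb=%lld, nb=%lld)\n",
                (long long)i, (long long)bs, (long long)mb, (long long)nb);
    data_index_step(bs, B, mb, MB, nb, NB);
  }
  return 0;
}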
