
Commit 37044aa

sryap authored and facebook-github-bot committed
Improve bounds_check_indices for VBE (pytorch#3386)
Summary: X-link: facebookresearch/FBGEMM#475

Instead of over-launching thread blocks, use `b_t_map` to launch only the necessary thread blocks, increasing occupancy for the VBE case. Note that `b_t_map` is required for the TBE lookup in the VBE case; it is generated during the TBE forward pass. In this diff, we call `generate_vbe_metadata` twice (once before the bounds check and once before the forward lookup). These two calls can be fused into one; we will clean this up in subsequent diffs.

Differential Revision: D65735342
1 parent 5fa2054 commit 37044aa
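
The decode that makes this work appears in embedding_bounds_check_v2.cu below: each 32-bit `b_t_map` entry carries the table index `t` in its high bits and the per-table sample index `b` in its low `info_B_num_bits` bits. As a rough illustration of that layout, here is a minimal host-side sketch; the decode mirrors `bounds_check_indices_kernel_v2`, while the encoder (`pack_b_t`) is hypothetical, since the real map is produced by `generate_vbe_metadata` in the TBE forward pass.

#include <cstdint>

// Hypothetical encoder: real b_t_map entries come from the TBE forward
// pass (generate_vbe_metadata); shown only to document the assumed layout.
inline uint32_t pack_b_t(uint32_t t, uint32_t b, int32_t info_B_num_bits) {
  return (t << info_B_num_bits) | b; // t in the high bits, b in the low bits
}

// Decode, mirroring bounds_check_indices_kernel_v2 below.
// info_B_mask is expected to equal (1u << info_B_num_bits) - 1.
inline void unpack_b_t(
    uint32_t info,
    int32_t info_B_num_bits,
    uint32_t info_B_mask,
    uint32_t* t,
    uint32_t* b) {
  *t = info >> info_B_num_bits; // table index
  *b = info & info_B_mask; // sample index within the table
}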

File tree

6 files changed: +83 −42 lines changed


fbgemm_gpu/codegen/utils/embedding_bounds_check_host.cpp

Lines changed: 16 additions & 4 deletions
@@ -28,7 +28,10 @@ void _bounds_check_indices_cuda_v1(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B);
+    const int64_t max_B,
+    const std::optional<Tensor>& b_t_map,
+    const int32_t info_B_num_bits,
+    const uint32_t info_B_mask);

 void _bounds_check_indices_cuda_v2(
     Tensor& rows_per_table,
@@ -38,7 +41,10 @@ void _bounds_check_indices_cuda_v2(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B);
+    const int64_t max_B,
+    const std::optional<Tensor>& b_t_map,
+    const int32_t info_B_num_bits,
+    const uint32_t info_B_mask);

 ///@ingroup embedding-cuda
 void bounds_check_indices_cuda(
@@ -49,7 +55,10 @@ void bounds_check_indices_cuda(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B) {
+    const int64_t max_B,
+    const std::optional<Tensor>& b_t_map,
+    const int64_t info_B_num_bits,
+    const int64_t info_B_mask) {
   const static bool use_v2 = fbgemm_gpu::config::is_feature_enabled(
       fbgemm_gpu::config::FeatureGateName::BOUNDS_CHECK_INDICES_V2);
   const auto bounds_check_indices_fn =
@@ -62,7 +71,10 @@ void bounds_check_indices_cuda(
       warning,
       weights,
       B_offsets,
-      max_B);
+      max_B,
+      b_t_map,
+      static_cast<int32_t>(info_B_num_bits),
+      static_cast<uint32_t>(info_B_mask));
 }
 // Deprecated for fb namespace! Please use fbgemm namespace instead!
 TORCH_LIBRARY_FRAGMENT(fb, m) {
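
A side note on types in the wrapper above: the operator schema only exposes 64-bit integers (`int`/`SymInt`), so `bounds_check_indices_cuda` accepts `int64_t` and narrows to the kernel-facing `int32_t`/`uint32_t` at the dispatch boundary. A self-contained sketch of that pattern, with hypothetical function names:

#include <cstdint>

// Kernel-facing side: 32-bit metadata, matching the CUDA entry points.
void kernel_facing(int32_t info_B_num_bits, uint32_t info_B_mask) {
  (void)info_B_num_bits; // placeholder body for the sketch
  (void)info_B_mask;
}

// Schema-facing side: receives int64_t from the op registry and narrows,
// assuming the values fit in 32 bits (the metadata pass guarantees this).
void schema_facing(int64_t info_B_num_bits, int64_t info_B_mask) {
  kernel_facing(
      static_cast<int32_t>(info_B_num_bits),
      static_cast<uint32_t>(info_B_mask));
}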

fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp

Lines changed: 30 additions & 3 deletions
@@ -48,7 +48,10 @@ void bounds_check_indices_cpu(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B) {
+    const int64_t max_B,
+    const std::optional<Tensor>& /*b_t_map*/,
+    const int64_t /*info_B_num_bits*/,
+    const int64_t /*info_B_mask*/) {
   if (offsets.scalar_type() != indices.scalar_type()) {
     offsets = offsets.toType(indices.scalar_type());
   }
@@ -190,7 +193,19 @@ TORCH_LIBRARY_FRAGMENT(fb, m) {
   // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd
   // or DCE'd, etc.
   m.def(
-      "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? B_offsets=None, SymInt max_B=-1) -> ()",
+      "bounds_check_indices("
+      "    Tensor rows_per_table, "
+      "    Tensor(a!) indices, "
+      "    Tensor(b!) offsets, "
+      "    int bounds_check_mode, "
+      "    Tensor(c!) warning, "
+      "    Tensor(d!)? weights=None, "
+      "    Tensor? B_offsets=None, "
+      "    SymInt max_B=-1, "
+      "    Tensor? b_t_map=None, "
+      "    int info_B_num_bits=-1, "
+      "    int info_B_mask=-1"
+      ") -> ()",
       {PT2_COMPLIANT_TAG});
   DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu);
 }
@@ -202,7 +217,19 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
       "fbgemm_gpu.sparse_ops",
       "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py");
   m.def(
-      "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? B_offsets=None, SymInt max_B=-1) -> ()",
+      "bounds_check_indices("
+      "    Tensor rows_per_table, "
+      "    Tensor(a!) indices, "
+      "    Tensor(b!) offsets, "
+      "    int bounds_check_mode, "
+      "    Tensor(c!) warning, "
+      "    Tensor(d!)? weights=None, "
+      "    Tensor? B_offsets=None, "
+      "    SymInt max_B=-1, "
+      "    Tensor? b_t_map=None, "
+      "    int info_B_num_bits=-1, "
+      "    int info_B_mask=-1"
+      ") -> ()",
       {PT2_COMPLIANT_TAG});
   DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu);
 }
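
Because the three new arguments are appended with defaults (`b_t_map=None`, `info_B_num_bits=-1`, `info_B_mask=-1`), call sites that stop at `max_B` keep resolving against the new schema. A small C++ sketch of the same backward-compatibility idea, with hypothetical names and an `std::optional<int>` standing in for `Tensor?`:

#include <cstdint>
#include <optional>

// Trailing parameters with defaults keep old call sites valid, mirroring
// how the updated bounds_check_indices schema stays backward compatible.
void bounds_check_like(
    int64_t max_B = -1,
    std::optional<int> b_t_map = std::nullopt, // stand-in for Tensor?
    int64_t info_B_num_bits = -1,
    int64_t info_B_mask = -1);

void caller() {
  bounds_check_like(128); // old-style call: new args take their defaults
  bounds_check_like(128, 7, 26, (1 << 26) - 1); // new-style VBE call
}

void bounds_check_like(int64_t, std::optional<int>, int64_t, int64_t) {}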

fbgemm_gpu/codegen/utils/embedding_bounds_check_v1.cu

Lines changed: 4 additions & 1 deletion
@@ -187,7 +187,10 @@ void _bounds_check_indices_cuda_v1(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B) {
+    const int64_t max_B,
+    const std::optional<Tensor>& /*b_t_map*/,
+    const int32_t /*info_b_num_bits*/,
+    const uint32_t /*info_B_mask*/) {
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
       rows_per_table, indices, offsets, warning, weights, B_offsets);
   TENSOR_NDIM_EQUALS(rows_per_table, 1);

fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu

Lines changed: 24 additions & 34 deletions
@@ -41,7 +41,9 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
     // dummy PackedTensorAccessor
     pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> warning,
     FixedDivisor fd,
-    const int32_t vbe_bound,
+    const int32_t* const b_t_map,
+    const int32_t info_B_num_bits,
+    const int32_t info_B_mask,
     TORCH_DSA_KERNEL_ARGS) {
   int32_t T = rows_per_table.size(0);
   int32_t total_B = offsets.size(0) - 1;
@@ -80,28 +82,17 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
     }
   }

-  for (int32_t b_t_init = blockIdx.x * blockDim.y + threadIdx.y;
-       b_t_init < (vbe ? vbe_bound : total_B);
-       b_t_init += blockDim.y * gridDim.x) {
+  for (int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; b_t < total_B;
+       b_t += blockDim.y * gridDim.x) {
+    // Compute b and t
     int32_t b;
     int32_t t;
-    int32_t b_t = b_t_init;
-
-    fd.DivMod(b_t, &t, &b);
-
     if (vbe) {
-      // Check if t is valid
-      if (t >= T) {
-        return;
-      }
-      const auto B_start = B_offsets[t];
-      B = B_offsets[t + 1] - B_start;
-      // Check if b is valid
-      if (b >= B) {
-        continue;
-      }
-      // Update b_t value
-      b_t = B_start + b;
+      const auto info = *reinterpret_cast<const uint32_t*>(&b_t_map[b_t]);
+      *reinterpret_cast<uint32_t*>(&t) = info >> info_B_num_bits;
+      *reinterpret_cast<uint32_t*>(&b) = info & info_B_mask;
+    } else {
+      fd.DivMod(b_t, &t, &b);
     }

     const auto num_rows = rows_per_table[t];
@@ -208,9 +199,12 @@ void _bounds_check_indices_cuda_v2(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B) {
+    const int64_t /*max_B*/,
+    const std::optional<Tensor>& b_t_map,
+    const int32_t info_B_num_bits,
+    const uint32_t info_B_mask) {
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
-      rows_per_table, indices, offsets, warning, weights, B_offsets);
+      rows_per_table, indices, offsets, warning, weights, B_offsets, b_t_map);
   TENSOR_NDIM_EQUALS(rows_per_table, 1);
   TENSOR_NDIM_EQUALS(indices, 1);
   TENSOR_NDIM_EQUALS(offsets, 1);
@@ -219,6 +213,8 @@ void _bounds_check_indices_cuda_v2(
   const auto vbe = B_offsets.has_value();
   if (vbe) {
     TENSOR_NDIM_EQUALS(B_offsets.value(), 1);
+    TORCH_CHECK(b_t_map.has_value());
+    TENSOR_NDIM_EQUALS(b_t_map.value(), 1);
   }

   CUDA_DEVICE_GUARD(rows_per_table);
@@ -236,9 +232,7 @@ void _bounds_check_indices_cuda_v2(
   }
   const int64_t num_indices = indices.size(0);

-  if (vbe) {
-    TORCH_CHECK(max_B >= 0);
-  } else {
+  if (!vbe) {
     TORCH_CHECK(
         offsets.size(0) == B * T + 1,
         "offsets size " + std::to_string(offsets.size(0)) +
@@ -253,11 +247,6 @@ void _bounds_check_indices_cuda_v2(
   }

   constexpr size_t kNumThreads = 1024;
-  const auto max_B_ = vbe ? max_B : B;
-
-  const int32_t vbe_bound = max_B_ * T;
-  TORCH_CHECK(
-      vbe_bound >= 0, "EmbeddingBoundsCheck: vbe_bound is out of bound");

 #define INVOKE_BOUNDS_CHECK_INDICES(MODE) \
   if (bounds_check_mode == MODE) { \
@@ -270,8 +259,7 @@ void _bounds_check_indices_cuda_v2(
             : bounds_check_indices_kernel_v2<index_t, false, MODE>); \
       TORCH_DSA_KERNEL_LAUNCH( \
           bounds_check_kernel, \
-          min(div_round_up( \
-              max_B_* T, kNumThreads / fbgemm_gpu::kWarpSize), \
+          min(div_round_up(total_B, kNumThreads / fbgemm_gpu::kWarpSize), \
              get_max_thread_blocks_()), \
          dim3( \
              fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), \
@@ -282,8 +270,10 @@ void _bounds_check_indices_cuda_v2(
          MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32), \
          vbe ? B_offsets.value().data_ptr<int32_t>() : nullptr, \
          MAKE_PTA_WITH_NAME(func_name, warning, int64_t, 1, 32), \
-          FixedDivisor(max_B_), \
-          vbe_bound); \
+          FixedDivisor(B), \
+          vbe ? b_t_map.value().data_ptr<int32_t>() : nullptr, \
+          info_B_num_bits, \
+          info_B_mask); \
      }); \
  }
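
The launch-site change above is the heart of the optimization: the grid is sized from `total_B` (the actual number of (b, t) pairs, `offsets.size(0) - 1`) rather than `max_B * T`, so no thread blocks are launched for padding when batch sizes vary across tables. A minimal sketch of the grid arithmetic, assuming `kWarpSize = 32` and using a stand-in constant for `get_max_thread_blocks_()`:

#include <algorithm>
#include <cstdint>

constexpr int32_t kWarpSize = 32; // assumption; wavefronts are 64 on AMD
constexpr int32_t kNumThreads = 1024;

inline int32_t div_round_up(int32_t a, int32_t b) {
  return (a + b - 1) / b;
}

// One warp handles one (b, t) pair per grid-stride iteration, so the grid
// needs at most ceil(total_B / warps_per_block) blocks, capped at a maximum.
inline int32_t num_blocks(int32_t total_B, int32_t max_blocks = 4096) {
  const int32_t warps_per_block = kNumThreads / kWarpSize; // 32 warps
  return std::min(div_round_up(total_B, warps_per_block), max_blocks);
}

// Example: total_B = 1000 gives 32 blocks here, whereas the old scheme
// launched div_round_up(max_B * T, 32) blocks even for mostly-empty rows.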

fbgemm_gpu/fbgemm_gpu/sparse_ops.py

Lines changed: 3 additions & 0 deletions
@@ -826,6 +826,9 @@ def bounds_check_indices_abstract(
     per_sample_weights: Optional[torch.Tensor] = None,
     B_offsets: Optional[torch.Tensor] = None,
     max_B: Optional[SymInt] = None,
+    b_t_map: Optional[torch.Tensor] = None,
+    info_B_num_bits: int = -1,
+    info_B_mask: int = -1,
 ) -> None:
     """
     This meta function is used to fake the bounds checking

fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp

Lines changed: 6 additions & 0 deletions
@@ -33,6 +33,11 @@ generate_vbe_metadata_meta(
   return {row_output_offsets, b_t_map};
 }

+std::tuple<int64_t, int64_t>
+get_infos_metadata_meta(Tensor /*unused*/, int64_t /*B*/, int64_t /*T*/) {
+  return {-1, -1};
+}
+
 } // namespace

 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
@@ -43,4 +48,5 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {

 TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
   m.impl("generate_vbe_metadata", &generate_vbe_metadata_meta);
+  m.impl("get_infos_metadata", &get_infos_metadata_meta);
 }
