@@ -23,31 +23,46 @@ __device__ void adjust_offset_kernel(
   *offset_acc_end = indices_end;
 }
 
-template <typename index_t>
+template <typename index_t, bool vle>
 __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
         rows_per_table,
     at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> indices,
     at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
+    const int32_t* const vle_metadata,
     const int64_t bounds_check_mode_,
     at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> warning,
     FixedDivisor fd) {
   int32_t T = rows_per_table.size(0);
-  int32_t B = (offsets.size(0) - 1) / T;
-
   int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
-  int32_t b; // = b_t % B;
-  int32_t t; // = b_t / B;
-  fd.DivMod(b_t, &t, &b);
-  if (t >= T) {
+  int32_t b;
+  int32_t t;
+  int32_t B = 0;
+  int32_t total_B = offsets.size(0) - 1;
+
+  if (b_t >= total_B) {
     return;
   }
-  auto bounds_check_mode = static_cast<BoundsCheckMode>(bounds_check_mode_);
 
-  auto num_rows = rows_per_table[t];
-  auto indices_start = offsets[t * B + b];
-  auto indices_end = offsets[t * B + b + 1];
-  index_t num_indices = indices.size(0);
+  if (vle) {
+    if (threadIdx.x == 0) {
+      // binary_search_range takes inclusive sumscan array
+      binary_search_range(&t, vle_metadata + 1, b_t, T);
+      b = b_t - vle_metadata[t];
+    }
+    t = shfl_sync(t, 0);
+    b = shfl_sync(b, 0);
+  } else {
+    B = total_B / T;
+    fd.DivMod(b_t, &t, &b);
+  }
+
+  const auto bounds_check_mode =
+      static_cast<BoundsCheckMode>(bounds_check_mode_);
+  const auto num_rows = rows_per_table[t];
+  auto indices_start = offsets[b_t];
+  auto indices_end = offsets[b_t + 1];
+  const index_t num_indices = indices.size(0);
 
   if (bounds_check_mode == BoundsCheckMode::FATAL) {
     CUDA_KERNEL_ASSERT(indices_start >= 0);
@@ -58,12 +73,13 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
         indices_end > num_indices) {
       if (gpuAtomicIncrement(&warning[0]) == 0) {
         printf(
-            "EmbeddingBoundsCheck: (at least one) Out of bounds access for "
-            "batch: %lld, table: %lld, indices_start: %lld, indices_end: %lld,"
+            "EmbeddingBoundsCheck (VLE %s): (at least one) Out of bounds access for "
+            "batch: %d, table: %d, indices_start: %lld, indices_end: %lld,"
             " num_indices: %lld. Setting indices_start and indices_end within "
             "the range.\n",
-            static_cast<int64_t>(b),
-            static_cast<int64_t>(t),
+            vle ? "true" : "false",
+            b,
+            t,
             static_cast<int64_t>(indices_start),
             static_cast<int64_t>(indices_end),
             static_cast<int64_t>(num_indices));
@@ -72,16 +88,16 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
           indices_start,
           indices_end,
           num_indices,
-          &offsets[t * B + b],
-          &offsets[t * B + b + 1]);
+          &offsets[b_t],
+          &offsets[b_t + 1]);
     }
   } else if (bounds_check_mode == BoundsCheckMode::IGNORE) {
     adjust_offset_kernel(
         indices_start,
         indices_end,
         num_indices,
-        &offsets[t * B + b],
-        &offsets[t * B + b + 1]);
+        &offsets[b_t],
+        &offsets[b_t + 1]);
   }
 
   const auto L = indices_end - indices_start;
@@ -100,9 +116,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
     if (idx < 0 || idx >= num_rows) {
       if (gpuAtomicIncrement(&warning[0]) == 0) {
         printf(
-            "EmbeddingBoundsCheck: (at least one) Out of bounds access for batch: %lld, table: %lld, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n",
-            static_cast<int64_t>(b),
-            static_cast<int64_t>(t),
+            "EmbeddingBoundsCheck (VLE %s): (at least one) Out of bounds access for batch: %d, table: %d, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n",
+            vle ? "true" : "false",
+            b,
+            t,
             static_cast<int64_t>(i),
             static_cast<int64_t>(idx),
             num_rows,
@@ -122,25 +139,27 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
   }
 
   if (bounds_check_mode == BoundsCheckMode::FATAL) {
-    CUDA_KERNEL_ASSERT(num_indices == offsets[B * T]);
+    CUDA_KERNEL_ASSERT(num_indices == offsets[total_B]);
   } else if (bounds_check_mode == BoundsCheckMode::WARNING) {
-    if (num_indices != offsets[B * T]) {
+    if (num_indices != offsets[total_B]) {
       if (gpuAtomicIncrement(&warning[0]) == 0) {
         printf(
-            "EmbeddingBoundsCheck: the last element in offsets is incorrect for "
-            "total batch size B: %lld, total table num T: %lld, "
+            "EmbeddingBoundsCheck (VLE %s): the last element in offsets is incorrect for "
+            "total batch size %s: %d, total table num T: %d, "
             "last element in offsets: %lld, indices size: %lld. "
             "Setting the last element in offsets to be indices size.\n",
-            static_cast<int64_t>(B),
-            static_cast<int64_t>(T),
-            static_cast<int64_t>(offsets[B * T]),
+            vle ? "true" : "false",
+            vle ? "total_B" : "B",
+            vle ? total_B : B,
+            T,
+            static_cast<int64_t>(offsets[total_B]),
             static_cast<int64_t>(num_indices));
       }
-      offsets[B * T] = num_indices;
+      offsets[total_B] = num_indices;
     }
   } else if (bounds_check_mode == BoundsCheckMode::IGNORE) {
-    if (num_indices != offsets[B * T]) {
-      offsets[B * T] = num_indices;
+    if (num_indices != offsets[total_B]) {
+      offsets[total_B] = num_indices;
     }
   }
 }
@@ -151,19 +170,22 @@ void bounds_check_indices_cuda(
     Tensor& offsets,
     int64_t bounds_check_mode_,
     Tensor& warning,
-    c10::optional<Tensor> weights) {
+    const c10::optional<Tensor>& weights,
+    const c10::optional<Tensor>& vle_metadata) {
   TENSOR_ON_CUDA_GPU(rows_per_table);
   TENSOR_ON_CUDA_GPU(indices);
   TENSOR_ON_CUDA_GPU(offsets);
   TENSOR_ON_CUDA_GPU(warning);
   TENSOR_EMPTY_OR_ON_CUDA_GPU(weights);
+  TENSOR_EMPTY_OR_ON_CUDA_GPU(vle_metadata);
 
   at::cuda::OptionalCUDAGuard device_guard;
   device_guard.set_index(rows_per_table.get_device());
 
   const int32_t T = rows_per_table.size(0);
-  const int32_t B = (offsets.size(0) - 1) / T;
-  if (B == 0 || T == 0) {
+  const int32_t total_B = offsets.size(0) - 1;
+  const int32_t B = (total_B) / T;
+  if (total_B == 0 || T == 0) {
     return;
   }
   const auto bounds_check_mode =
@@ -173,11 +195,13 @@ void bounds_check_indices_cuda(
   }
   const int64_t num_indices = indices.size(0);
 
-  TORCH_CHECK(
-      offsets.size(0) == B * T + 1,
-      "offsets size " + std::to_string(offsets.size(0)) +
-          " is not equal to B (" + std::to_string(B) + ") * T (" +
-          std::to_string(T) + ") + 1");
+  if (!vle_metadata.has_value()) {
+    TORCH_CHECK(
+        offsets.size(0) == B * T + 1,
+        "offsets size " + std::to_string(offsets.size(0)) +
+            " is not equal to B (" + std::to_string(B) + ") * T (" +
+            std::to_string(T) + ") + 1");
+  }
   if (weights.has_value()) {
     TORCH_CHECK(
         weights.value().size(0) == num_indices,
@@ -187,19 +211,30 @@ void bounds_check_indices_cuda(
 
   constexpr size_t kNumThreads = 256;
 
+#define INVOKE_BOUNDS_CHECK_INDICES_KERNEL(VAR_BATCH_SIZE, VAR_B_METADATA)  \
+  bounds_check_indices_kernel<index_t, VAR_BATCH_SIZE>                      \
+      <<<div_round_up(total_B, kNumThreads / fbgemm_gpu::kWarpSize),        \
+         dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize),  \
+         0,                                                                 \
+         at::cuda::getCurrentCUDAStream()>>>(                               \
+          rows_per_table                                                    \
+              .packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),      \
+          indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),   \
+          offsets.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),   \
+          VAR_B_METADATA,                                                   \
+          bounds_check_mode_,                                               \
+          warning.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),   \
+          FixedDivisor(B));                                                 \
+  C10_CUDA_KERNEL_LAUNCH_CHECK()
+
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] {
-    bounds_check_indices_kernel<index_t>
-        <<<div_round_up(B * T, kNumThreads / fbgemm_gpu::kWarpSize),
-           dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize),
-           0,
-           at::cuda::getCurrentCUDAStream()>>>(
-            rows_per_table
-                .packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
-            indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-            offsets.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-            bounds_check_mode_,
-            warning.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
-            FixedDivisor(B));
+    if (vle_metadata.has_value()) {
+      INVOKE_BOUNDS_CHECK_INDICES_KERNEL(
+          true, vle_metadata.value().data_ptr<int32_t>());
+    } else {
+      INVOKE_BOUNDS_CHECK_INDICES_KERNEL(false, nullptr);
+    }
   });
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+#undef INVOKE_BOUNDS_CHECK_INDICES_KERNEL
 }
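
To make the new indexing concrete, below is a small host-side C++ sketch (not part of the diff) of how the VLE path recovers the table index `t` and within-table sample index `b` from the flattened index `b_t`. It assumes `vle_metadata` is a leading zero followed by the inclusive prefix sum of per-table batch sizes, which is what `binary_search_range(&t, vle_metadata + 1, b_t, T)` and `b = b_t - vle_metadata[t]` imply; `std::upper_bound` stands in for the device-side `binary_search_range`, and the helper name `map_b_t` plus the example batch sizes are made up for illustration.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the device-side lookup: find the table whose slice of the flattened
// batch contains b_t, then recover the within-table index b. std::upper_bound
// is an assumption about binary_search_range's tie-breaking behaviour.
static void map_b_t(
    const std::vector<int32_t>& vle_metadata,
    int32_t b_t,
    int32_t* t,
    int32_t* b) {
  // vle_metadata.begin() + 1 points at the inclusive sumscan of length T,
  // matching the `vle_metadata + 1` the kernel passes to binary_search_range.
  const auto it =
      std::upper_bound(vle_metadata.begin() + 1, vle_metadata.end(), b_t);
  *t = static_cast<int32_t>(it - (vle_metadata.begin() + 1));
  *b = b_t - vle_metadata[*t];  // same as `b = b_t - vle_metadata[t]` above
}

int main() {
  // Hypothetical per-table batch sizes for T = 3 tables.
  const std::vector<int32_t> batch_sizes = {2, 4, 3};

  // vle_metadata = [0, 2, 6, 9]: a leading zero followed by the inclusive
  // prefix sum of batch_sizes; total_B then matches offsets.size(0) - 1.
  std::vector<int32_t> vle_metadata = {0};
  for (const int32_t bs : batch_sizes) {
    vle_metadata.push_back(vle_metadata.back() + bs);
  }
  const int32_t total_B = vle_metadata.back();

  for (int32_t b_t = 0; b_t < total_B; ++b_t) {
    int32_t t = 0;
    int32_t b = 0;
    map_b_t(vle_metadata, b_t, &t, &b);
    assert(b >= 0 && b < batch_sizes[t]);
    std::printf("b_t = %d -> table %d, sample %d\n", b_t, t, b);
  }
  return 0;
}
```

This also illustrates why the kernel can index `offsets[b_t]` directly in both paths: the offsets tensor is laid out per (table, sample) pair in the same flattened order, so only the recovery of `(t, b)` differs between the fixed-batch and VLE cases.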