@@ -23,31 +23,52 @@ __device__ void adjust_offset_kernel(
   *offset_acc_end = indices_end;
 }
 
-template <typename index_t>
+template <typename index_t, bool vbe>
 __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
         rows_per_table,
     at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> indices,
     at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
+    const int32_t* const vbe_metadata,
     const int64_t bounds_check_mode_,
     at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> warning,
     FixedDivisor fd) {
   int32_t T = rows_per_table.size(0);
-  int32_t B = (offsets.size(0) - 1) / T;
-
   int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
-  int32_t b; // = b_t % B;
-  int32_t t; // = b_t / B;
-  fd.DivMod(b_t, &t, &b);
-  if (t >= T) {
+  int32_t b;
+  int32_t t;
+  int32_t B = 0;
+  int32_t total_B = offsets.size(0) - 1;
+
+  if (!vbe && b_t >= total_B) {
     return;
   }
-  auto bounds_check_mode = static_cast<BoundsCheckMode>(bounds_check_mode_);
 
-  auto num_rows = rows_per_table[t];
-  auto indices_start = offsets[t * B + b];
-  auto indices_end = offsets[t * B + b + 1];
-  index_t num_indices = indices.size(0);
+  fd.DivMod(b_t, &t, &b);
+
+  if (vbe) {
+    // Check if t is valid
+    if (t >= T) {
+      return;
+    }
+    const auto B_start = vbe_metadata[t];
+    B = vbe_metadata[t + 1] - B_start;
+    // Check if b is valid
+    if (b >= B) {
+      return;
+    }
+    // Update b_t value
+    b_t = B_start + b;
+  } else {
+    B = total_B / T;
+  }
+
+  const auto bounds_check_mode =
+      static_cast<BoundsCheckMode>(bounds_check_mode_);
+  const auto num_rows = rows_per_table[t];
+  auto indices_start = offsets[b_t];
+  auto indices_end = offsets[b_t + 1];
+  const index_t num_indices = indices.size(0);
 
   if (bounds_check_mode == BoundsCheckMode::FATAL) {
     CUDA_KERNEL_ASSERT(indices_start >= 0);
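The VBE branch above assumes vbe_metadata holds the exclusive prefix sum of per-table batch sizes (length T + 1), so vbe_metadata[t + 1] - vbe_metadata[t] is table t's batch size and B_start + b is the row into the flattened offsets tensor. A minimal host-side sketch of that index math, illustrative only and not part of the patch (the batch sizes are made up):

// Sketch: how the kernel's VBE (t, b) -> b_t remapping behaves, assuming
// vbe_metadata is the exclusive prefix sum of per-table batch sizes,
// e.g. batch sizes {2, 5, 3} -> {0, 2, 7, 10}.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int32_t> vbe_metadata = {0, 2, 7, 10}; // T = 3 tables
  const int32_t T = 3;
  const int32_t max_B = 5; // largest per-table batch size
  // Mirror the kernel: walk the padded (t, b) grid and recover b_t.
  for (int32_t t = 0; t < T; ++t) {
    const int32_t B_start = vbe_metadata[t];
    const int32_t B = vbe_metadata[t + 1] - B_start;
    for (int32_t b = 0; b < max_B; ++b) {
      if (b >= B) {
        continue; // the kernel returns early for padded slots
      }
      const int32_t b_t = B_start + b; // row into the flattened offsets tensor
      std::printf("t=%d b=%d -> b_t=%d\n", t, b, b_t);
    }
  }
  return 0;
}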
@@ -58,12 +79,13 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
         indices_end > num_indices) {
       if (gpuAtomicIncrement(&warning[0]) == 0) {
         printf(
-            "EmbeddingBoundsCheck: (at least one) Out of bounds access for "
-            "batch: %lld, table: %lld, indices_start: %lld, indices_end: %lld,"
+            "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for "
+            "batch: %d, table: %d, indices_start: %lld, indices_end: %lld,"
             " num_indices: %lld. Setting indices_start and indices_end within "
             "the range.\n",
-            static_cast<int64_t>(b),
-            static_cast<int64_t>(t),
+            vbe ? "true" : "false",
+            b,
+            t,
             static_cast<int64_t>(indices_start),
             static_cast<int64_t>(indices_end),
             static_cast<int64_t>(num_indices));
@@ -72,16 +94,16 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
           indices_start,
           indices_end,
           num_indices,
-          &offsets[t * B + b],
-          &offsets[t * B + b + 1]);
+          &offsets[b_t],
+          &offsets[b_t + 1]);
     }
   } else if (bounds_check_mode == BoundsCheckMode::IGNORE) {
     adjust_offset_kernel(
         indices_start,
         indices_end,
         num_indices,
-        &offsets[t * B + b],
-        &offsets[t * B + b + 1]);
+        &offsets[b_t],
+        &offsets[b_t + 1]);
   }
 
   const auto L = indices_end - indices_start;
@@ -100,9 +122,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
     if (idx < 0 || idx >= num_rows) {
       if (gpuAtomicIncrement(&warning[0]) == 0) {
         printf(
-            "EmbeddingBoundsCheck: (at least one) Out of bounds access for batch: %lld, table: %lld, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n",
-            static_cast<int64_t>(b),
-            static_cast<int64_t>(t),
+            "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for batch: %d, table: %d, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n",
+            vbe ? "true" : "false",
+            b,
+            t,
             static_cast<int64_t>(i),
             static_cast<int64_t>(idx),
             num_rows,
@@ -122,25 +145,27 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(
   }
 
   if (bounds_check_mode == BoundsCheckMode::FATAL) {
-    CUDA_KERNEL_ASSERT(num_indices == offsets[B * T]);
+    CUDA_KERNEL_ASSERT(num_indices == offsets[total_B]);
   } else if (bounds_check_mode == BoundsCheckMode::WARNING) {
-    if (num_indices != offsets[B * T]) {
+    if (num_indices != offsets[total_B]) {
       if (gpuAtomicIncrement(&warning[0]) == 0) {
         printf(
-            "EmbeddingBoundsCheck: the last element in offsets is incorrect for "
-            "total batch size B: %lld, total table num T: %lld, "
+            "EmbeddingBoundsCheck (VBE %s): the last element in offsets is incorrect for "
+            "total batch size %s: %d, total table num T: %d, "
             "last element in offsets: %lld, indices size: %lld. "
             "Setting the last element in offsets to be indices size.\n",
-            static_cast<int64_t>(B),
-            static_cast<int64_t>(T),
-            static_cast<int64_t>(offsets[B * T]),
+            vbe ? "true" : "false",
+            vbe ? "total_B" : "B",
+            vbe ? total_B : B,
+            T,
+            static_cast<int64_t>(offsets[total_B]),
             static_cast<int64_t>(num_indices));
       }
-      offsets[B * T] = num_indices;
+      offsets[total_B] = num_indices;
     }
   } else if (bounds_check_mode == BoundsCheckMode::IGNORE) {
-    if (num_indices != offsets[B * T]) {
-      offsets[B * T] = num_indices;
+    if (num_indices != offsets[total_B]) {
+      offsets[total_B] = num_indices;
     }
   }
 }
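The tail of the kernel enforces that the last element of offsets equals indices.size(0); with VBE, total_B is the sum of per-table batch sizes rather than B * T. A small host-side sketch of that invariant, illustrative only and with made-up sizes:

// Sketch: the offsets invariant the kernel checks (and repairs in
// WARNING/IGNORE modes). Hypothetical VBE case: T = 2 tables with
// batch sizes {2, 3}, so offsets has total_B + 1 = 6 entries.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> offsets = {0, 3, 5, 9, 12, 14};
  const int64_t num_indices = 14; // indices.size(0)
  const int64_t total_B = static_cast<int64_t>(offsets.size()) - 1;
  assert(offsets[total_B] == num_indices); // otherwise the kernel warns and overwrites it
  return 0;
}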
@@ -151,19 +176,23 @@ void bounds_check_indices_cuda(
     Tensor& offsets,
     int64_t bounds_check_mode_,
     Tensor& warning,
-    c10::optional<Tensor> weights) {
+    const c10::optional<Tensor>& weights,
+    const c10::optional<Tensor>& vbe_metadata,
+    const int64_t max_B) {
   TENSOR_ON_CUDA_GPU(rows_per_table);
   TENSOR_ON_CUDA_GPU(indices);
   TENSOR_ON_CUDA_GPU(offsets);
   TENSOR_ON_CUDA_GPU(warning);
   TENSOR_EMPTY_OR_ON_CUDA_GPU(weights);
+  TENSOR_EMPTY_OR_ON_CUDA_GPU(vbe_metadata);
 
   at::cuda::OptionalCUDAGuard device_guard;
   device_guard.set_index(rows_per_table.get_device());
 
   const int32_t T = rows_per_table.size(0);
-  const int32_t B = (offsets.size(0) - 1) / T;
-  if (B == 0 || T == 0) {
+  const int32_t total_B = offsets.size(0) - 1;
+  const int32_t B = total_B / T;
+  if (total_B == 0 || T == 0) {
     return;
   }
   const auto bounds_check_mode =
@@ -172,12 +201,17 @@ void bounds_check_indices_cuda(
     warning.zero_();
   }
   const int64_t num_indices = indices.size(0);
+  const auto vbe = vbe_metadata.has_value();
 
-  TORCH_CHECK(
-      offsets.size(0) == B * T + 1,
-      "offsets size " + std::to_string(offsets.size(0)) +
-          " is not equal to B (" + std::to_string(B) + ") * T (" +
-          std::to_string(T) + ") + 1");
+  if (vbe) {
+    TORCH_CHECK(max_B >= 0);
+  } else {
+    TORCH_CHECK(
+        offsets.size(0) == B * T + 1,
+        "offsets size " + std::to_string(offsets.size(0)) +
+            " is not equal to B (" + std::to_string(B) + ") * T (" +
+            std::to_string(T) + ") + 1");
+  }
   if (weights.has_value()) {
     TORCH_CHECK(
         weights.value().size(0) == num_indices,
@@ -186,20 +220,24 @@ void bounds_check_indices_cuda(
   }
 
   constexpr size_t kNumThreads = 256;
+  const auto max_B_ = vbe ? max_B : B;
 
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] {
-    bounds_check_indices_kernel<index_t>
-        <<<div_round_up(B * T, kNumThreads / fbgemm_gpu::kWarpSize),
-           dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize),
-           0,
-           at::cuda::getCurrentCUDAStream()>>>(
-            rows_per_table
-                .packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
-            indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-            offsets.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-            bounds_check_mode_,
-            warning.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
-            FixedDivisor(B));
+    const auto bounds_check_kernel =
+        (vbe ? bounds_check_indices_kernel<index_t, true>
+             : bounds_check_indices_kernel<index_t, false>);
+    bounds_check_kernel<<<
+        div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize),
+        dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize),
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(
+        rows_per_table.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
+        indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+        offsets.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+        vbe ? vbe_metadata.value().data_ptr<int32_t>() : nullptr,
+        bounds_check_mode_,
+        warning.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
+        FixedDivisor(max_B_));
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
   });
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
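The launcher keeps the original geometry of one warp per (b, t) pair: blockDim = (kWarpSize, kNumThreads / kWarpSize) and the grid covers max_B_ * T pairs, which over-provisions threads under VBE and relies on the kernel's early returns for padded slots. A host-side sketch of that arithmetic, illustrative only (kWarpSize = 32 is an assumption for the example):

// Sketch: launch-shape math used above, assuming kWarpSize = 32 and
// kNumThreads = 256, so each block covers blockDim.y = 8 (b, t) pairs.
#include <cstdint>
#include <cstdio>

constexpr int32_t kWarpSize = 32;
constexpr int32_t kNumThreads = 256;

static int32_t div_round_up(int32_t a, int32_t b) {
  return (a + b - 1) / b;
}

int main() {
  const int32_t T = 10;      // number of tables
  const int32_t max_B_ = 37; // max batch size under VBE, otherwise B
  const int32_t pairs_per_block = kNumThreads / kWarpSize; // blockDim.y = 8
  const int32_t grid = div_round_up(max_B_ * T, pairs_per_block);
  // Each warp computes b_t = blockIdx.x * blockDim.y + threadIdx.y, and
  // FixedDivisor(max_B_) splits it as t = b_t / max_B_, b = b_t % max_B_.
  std::printf("grid=%d blocks, blockDim=(%d, %d)\n", grid, kWarpSize, pairs_per_block);
  return 0;
}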