Support variable bucket size for block_bucketize_sparse_features (pytorch#2107)

tissue3 · facebook-github-bot · commit fb684f62db7c · 2023-11-06T19:37:34.000-08:00
Summary:

This diff adds support for variable bucket sizes in block_bucketize_sparse_features for RW sharding.
E.g. given bucket_sizes_pos (the block_bucketize_pos argument) as [[0,5,15], [0,10,13]]:
For feature 0, indices in [0,5) will be assigned to bucket 0, indices in [5,15) will be assigned to bucket 1.
For feature 1, indices in [0,10) will be assigned to bucket 0, indices in [10,13) will be assigned to bucket 1.
The new index will be original index - bucket_sizes_pos[feature][new_bucket_id],
i.e. for feature = 0, index = 12, it will be assigned to bucket 1 and the new index is 12 - bucket_sizes_pos[0][1] = 12 - 5 = 7.

Differential Revision: D50868649
diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h
@@ -149,7 +149,8 @@ block_bucketize_sparse_features_cuda(
     const int64_t my_size,
     const c10::optional<at::Tensor>& weights,
     const c10::optional<at::Tensor>& batch_size_per_feature,
-    const int64_t max_batch_size);
+    const int64_t max_batch_size,
+    const c10::optional<std::vector<at::Tensor>>& block_bucketize_pos);
 
 std::tuple<
     at::Tensor,
@@ -168,7 +169,8 @@ block_bucketize_sparse_features_cpu(
     const int64_t my_size,
     const c10::optional<at::Tensor>& weights,
     const c10::optional<at::Tensor>& batch_size_per_feature,
-    const int64_t max_batch_size);
+    const int64_t max_batch_size,
+    const c10::optional<std::vector<at::Tensor>>& block_bucketize_pos);
 
 std::tuple<
     at::Tensor,
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu
@@ -147,7 +147,10 @@ block_bucketize_sparse_features_cuda(
     const int64_t my_size,
     const c10::optional<Tensor>& weights,
     const c10::optional<Tensor>& batch_size_per_feature,
-    const int64_t max_B) {
+    const int64_t max_B,
+    const c10::optional<std::vector<
+        at::Tensor>>& /*block_bucketize_pos*/ // Only used in CPU variant
+) {
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices);
 
   at::cuda::OptionalCUDAGuard device_guard;
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -264,6 +264,21 @@ void _permute_2D_lengths_cpu_kernel(
     input_offsets[i + 1] = lengths[i] + input_offsets[i];
   }
 }
+// Index of the last element of ascending arr[0, size) that is <= target, or -1
+// if none (upper_bound - 1). mid is 64-bit: `int mid` truncated past INT_MAX.
+template <typename index_t>
+int64_t
+_find_lower_bound(const index_t* arr, const int64_t size, index_t target) {
+  int64_t l = 0;
+  int64_t h = size - 1;
+  while (l <= h) {
+    const int64_t mid = l + (h - l) / 2;
+    const bool above = arr[mid] > target; // answer lies strictly left of mid
+    h = above ? mid - 1 : h;
+    l = above ? l : mid + 1;
+  }
+  return l - 1;
+}
 
 template <
     bool sequence,
@@ -283,7 +298,8 @@ void _block_bucketize_sparse_features_cpu(
     c10::optional<Tensor> new_weights,
     c10::optional<Tensor> new_pos,
     const c10::optional<Tensor>& unbucketize_permute,
-    const c10::optional<Tensor>& batch_size_per_feature) {
+    const c10::optional<Tensor>& batch_size_per_feature,
+    const c10::optional<std::vector<at::Tensor>>& block_bucketize_pos) {
   // allocate tensors and buffers
   const auto lengths_size = lengths.numel();
   const auto new_lengths_size = lengths_size * my_size;
@@ -304,7 +320,7 @@ void _block_bucketize_sparse_features_cpu(
   const index_t* const block_sizes_data = block_sizes.data_ptr<index_t>();
   offset_t* batch_sizes_data = nullptr;
   const auto variable_batch_size = batch_size_per_feature.has_value();
-
+  const auto variable_bucket_sizes = block_bucketize_pos.has_value();
   using uindex_t = std::make_unsigned_t<index_t>;
   using uoffset_t = std::make_unsigned_t<offset_t>;
 
@@ -330,6 +346,12 @@ void _block_bucketize_sparse_features_cpu(
   for (const auto t : c10::irange(T)) {
     const auto blk_size = block_sizes_data[t];
     const auto cur_batch_size = variable_batch_size ? batch_sizes_data[t] : B;
+    const index_t* bucketize_offset = nullptr;
+    int64_t bucket_size = 0;
+    if (variable_bucket_sizes) {
+      bucketize_offset = block_bucketize_pos.value()[t].data_ptr<index_t>();
+      bucket_size = block_bucketize_pos.value()[t].numel();
+    }
     for (const auto b : c10::irange(cur_batch_size)) {
       const auto b_t = (variable_batch_size ? cur_offset : t * B) + b;
       const offset_t rowstart = offsets_data[b_t];
@@ -342,10 +364,17 @@ void _block_bucketize_sparse_features_cpu(
         // range of blk_size, we expect the later embedding module to take care
         // of hashing indices calculation.
         uindex_t idx = static_cast<uindex_t>(indices_data[i]);
-        uindex_t p = idx < static_cast<uindex_t>(blk_size * my_size)
-            ? idx / blk_size
-            : idx % my_size;
-        new_lengths_data[p * lengths_size + b_t]++;
+        if (variable_bucket_sizes) {
+          auto lb = _find_lower_bound<index_t>(
+              bucketize_offset, bucket_size, indices_data[i]);
+          uindex_t p = lb < my_size ? lb : idx % my_size;
+          new_lengths_data[p * lengths_size + b_t]++;
+        } else {
+          uindex_t p = idx < static_cast<uindex_t>(blk_size * my_size)
+              ? idx / blk_size
+              : idx % my_size;
+          new_lengths_data[p * lengths_size + b_t]++;
+        }
       }
     }
     cur_offset += cur_batch_size;
@@ -358,6 +387,12 @@ void _block_bucketize_sparse_features_cpu(
   for (const auto t : c10::irange(T)) {
     const auto blk_size = block_sizes_data[t];
     const auto cur_batch_size = variable_batch_size ? batch_sizes_data[t] : B;
+    const index_t* bucketize_offset = nullptr;
+    int64_t bucket_size = 0;
+    if (variable_bucket_sizes) {
+      bucketize_offset = block_bucketize_pos.value()[t].data_ptr<index_t>();
+      bucket_size = block_bucketize_pos.value()[t].numel();
+    }
     for (const auto b : c10::irange(cur_batch_size)) {
       const auto b_t = (variable_batch_size ? cur_offset : t * B) + b;
       const offset_t rowstart = offsets_data[b_t];
@@ -370,12 +405,19 @@ void _block_bucketize_sparse_features_cpu(
         // range of blk_size, we expect the later embedding module to take care
         // of hashing indices calculation.
         const uindex_t idx = static_cast<uindex_t>(indices_data[i]);
-        const uindex_t p = idx < static_cast<uindex_t>(blk_size * my_size)
-            ? idx / blk_size
-            : idx % my_size;
-        const uindex_t new_idx = idx < static_cast<uindex_t>(blk_size * my_size)
-            ? idx % blk_size
-            : idx / my_size;
+        uindex_t p, new_idx;
+        if (variable_bucket_sizes) {
+          auto lb = _find_lower_bound<index_t>(
+              bucketize_offset, bucket_size, indices_data[i]);
+          p = lb < my_size ? lb : idx % my_size;
+          new_idx = lb < my_size ? idx - bucketize_offset[lb] : idx / my_size;
+        } else {
+          p = idx < static_cast<uindex_t>(blk_size * my_size) ? idx / blk_size
+                                                              : idx % my_size;
+          new_idx = idx < static_cast<uindex_t>(blk_size * my_size)
+              ? idx % blk_size
+              : idx / my_size;
+        }
         const uoffset_t pos = new_offsets_data[p * lengths_size + b_t];
         new_indices_data[pos] = new_idx;
         if (sequence) {
@@ -910,8 +952,8 @@ block_bucketize_sparse_features_cpu(
     const int64_t my_size,
     const c10::optional<Tensor>& weights,
     const c10::optional<Tensor>& batch_size_per_feature,
-    const int64_t /* max_batch_size */ // Only used in GPU variant
-) {
+    const int64_t /* max_batch_size */, // Only used in GPU variant
+    const c10::optional<std::vector<at::Tensor>>& block_bucketize_pos) {
   const auto lengths_size = lengths.numel();
   const auto new_lengths_size = lengths_size * my_size;
   auto new_lengths = at::zeros({new_lengths_size}, lengths.options());
@@ -958,7 +1000,8 @@ block_bucketize_sparse_features_cpu(
                             new_weights,
                             new_pos,
                             unbucketize_permute,
-                            batch_size_per_feature);
+                            batch_size_per_feature,
+                            block_bucketize_pos);
                       });
                 });
           });
@@ -993,7 +1036,8 @@ block_bucketize_sparse_features_cpu(
                             new_weights,
                             new_pos,
                             unbucketize_permute,
-                            batch_size_per_feature);
+                            batch_size_per_feature,
+                            block_bucketize_pos);
                       });
                 });
           });
@@ -1026,7 +1070,8 @@ block_bucketize_sparse_features_cpu(
                       new_weights,
                       new_pos,
                       unbucketize_permute,
-                      batch_size_per_feature);
+                      batch_size_per_feature,
+                      block_bucketize_pos);
                 });
           });
     } else {
@@ -1054,7 +1099,8 @@ block_bucketize_sparse_features_cpu(
                       new_weights,
                       new_pos,
                       unbucketize_permute,
-                      batch_size_per_feature);
+                      batch_size_per_feature,
+                      block_bucketize_pos);
                 });
           });
     }
@@ -2696,7 +2742,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "expand_into_jagged_permute(Tensor permute, Tensor input_offset, Tensor output_offset, SymInt output_size) -> Tensor");
   m.def(
-      "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)");
+      "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)");
   m.def(
       "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, SymInt my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)");
   m.def("asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor");
diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json
@@ -33,6 +33,10 @@
         "comment": "",
         "status": "xfail"
       },
+      "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_with_block_bucketize_pos": {
+        "comment": "",
+        "status": "xfail"
+      },
       "SparseOpsTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_with_variable_batch_sizes": {
         "comment": "",
         "status": "xfail"
@@ -57,6 +61,10 @@
         "comment": "",
         "status": "xfail"
       },
+      "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features_with_block_bucketize_pos": {
+        "comment": "",
+        "status": "xfail"
+      },
       "SparseOpsTest.test_faketensor__test_block_bucketize_sparse_features_with_variable_batch_sizes": {
         "comment": "",
         "status": "xfail"
diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py
@@ -869,6 +869,14 @@ def test_block_bucketize_sparse_features_with_variable_batch_sizes(
         bucketize_pos: bool,
         sequence: bool,
     ) -> None:
+        """
+        Test variable bucket size for block_bucketize_sparse_features for RW sharding.
+        E.g. Given bucket_sizes_pos as [[0,5,15], [0,10,13]]
+        For feature 0, indices in [0,5) will be assigned to bucket 0, indices in [5,15) will be assigned to bucket 1.
+        For feature 1, indices in [0,10) will be assigned to bucket 0, indices in [10,13) will be assigned to bucket 1.
+        The new index will be original index - bucket_sizes_pos[feature][new_bucket_id]
+        i.e. for feature = 0, index = 12, it will be assigned to bucket 1 and the new index is 12 - 5 = 7.
+        """
         lengths = torch.tensor([2, 1, 1, 2, 0, 2], dtype=index_type)
         indices = torch.tensor(
             [1, 8, 5, 6, 7, 8, 8, 4],
@@ -942,6 +950,90 @@ def test_block_bucketize_sparse_features_with_variable_batch_sizes(
                 new_indices_gpu.cpu(), new_indices_ref, rtol=0, atol=0
             )
 
+    @given(
+        index_type=st.sampled_from([torch.int, torch.long]),
+        has_weight=st.booleans(),
+        bucketize_pos=st.booleans(),
+        sequence=st.booleans(),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=16, deadline=None)
+    def test_block_bucketize_sparse_features_with_block_bucketize_pos(
+        self,
+        index_type: Optional[torch.dtype],
+        has_weight: bool,
+        bucketize_pos: bool,
+        sequence: bool,
+    ) -> None:
+        """
+        Check per-feature variable bucket boundaries (block_bucketize_pos) using
+        CPU tensors. Feature t uses block_bucketize_pos[t] = [s_0, s_1, s_2]: an
+        index in [s_k, s_{k+1}) goes to bucket k and is rewritten to idx - s_k,
+        e.g. feature 0 with boundaries [0, 2, 8] sends index 7 to bucket 1 as
+        7 - 2 = 5.
+
+        new_lengths_ref is bucket-major (my_size blocks of lengths.numel()
+        entries); new_indices_ref / new_weights_ref list bucket 0 first.
+        """
+        lengths = torch.tensor([2, 1, 1, 2, 0, 2], dtype=index_type)
+        indices = torch.tensor(
+            [1, 7, 2, 6, 7, 8, 8, 4],
+            dtype=index_type,
+        )
+        batch_sizes = torch.tensor([3, 1, 2], dtype=index_type)
+        weights = (
+            torch.tensor(
+                [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+                dtype=torch.float,
+            )
+            if has_weight
+            else None
+        )
+
+        block_sizes = torch.tensor([5, 10, 8], dtype=index_type)
+        my_size = 2
+        max_B = batch_sizes.max().item()  # unused
+
+        block_bucketize_pos = [
+            torch.tensor([0, 2, 8], dtype=index_type),
+            torch.tensor([0, 5, 10], dtype=index_type),
+            torch.tensor([0, 7, 12], dtype=index_type),
+        ]
+
+        new_lengths_ref = torch.tensor(
+            [1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 0, 1],
+            dtype=index_type,
+        )
+        new_indices_ref = torch.tensor(
+            [1, 4, 5, 0, 4, 2, 3, 1],
+            dtype=index_type,
+        )
+        new_weights_ref = torch.tensor(
+            [1.0, 8.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
+            dtype=torch.float,
+        )
+        (
+            new_lengths_cpu,
+            new_indices_cpu,
+            new_weights_cpu,
+            new_pos_cpu,
+            unbucketize_permute,
+        ) = torch.ops.fbgemm.block_bucketize_sparse_features(
+            lengths,
+            indices,
+            bucketize_pos,
+            sequence,
+            block_sizes,
+            my_size,
+            weights,
+            batch_sizes,
+            max_B,
+            block_bucketize_pos,
+        )
+        torch.testing.assert_close(new_lengths_cpu, new_lengths_ref, rtol=0, atol=0)
+        torch.testing.assert_close(new_indices_cpu, new_indices_ref, rtol=0, atol=0)
+        if has_weight:
+            torch.testing.assert_close(new_weights_cpu, new_weights_ref)
+
     @given(
         index_type=st.sampled_from([torch.int, torch.long]),
         has_weight=st.booleans(),