
Commit 80990a6

lequytra authored and facebook-github-bot committed
Add BF16 support for reorder_batched_ad_indices (pytorch#2116)
Summary: Pull Request resolved: pytorch#2116

We use `reorder_batched_ad_indices` to [rebatch id_score_list weights](https://www.internalfb.com/code/fbsource/[e3bbe1eaf65e]/fbcode/caffe2/caffe2/fb/predictor/rebatch/GPURebatchUtils.cpp?lines=305), which are quantized to BFloat16. However, BFloat16 is not currently supported in `reorder_batched_ad_indices` (see error trace: P868895010). This diff adds support for the BFloat16 dtype.

Reviewed By: YazhiGao

Differential Revision: D50817983

fbshipit-source-id: 4949acac8d1524dc10c7931e28bdfcabd2e94477
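For intuition, here is a minimal pure-Python sketch of the kind of regrouping this operator performs. It assumes a simplified dense layout in which indices for B batches, T tables, and A ads are concatenated batch-major and the rebatch regroups them table-major; the real operator works on ragged tensors via offset arrays, and the function name here is hypothetical.

```python
def reorder_batched(values, B, T, A):
    # Input layout: [B][T][A] (batch-major); output layout: [T][B][A]
    # (table-major), i.e. all batches' entries for a table are contiguous.
    out = []
    for t in range(T):
        for b in range(B):
            start = (b * T + t) * A
            out.extend(values[start:start + A])
    return out
```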
1 parent 174d473 · commit 80990a6

File tree

2 files changed: +3 −2 lines changed

fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu

Lines changed: 2 additions & 1 deletion

```diff
@@ -243,7 +243,8 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
   const dim3 threads(32, 32);
   const dim3 blocks((B * T + 32 - 1) / 32);

-  AT_DISPATCH_ALL_TYPES(
+  AT_DISPATCH_ALL_TYPES_AND(
+      at::ScalarType::BFloat16,
       cat_ad_indices.scalar_type(),
       "reorder_batched_ad_indices_gpu_kernel_1",
       [&] {
```
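The fix swaps `AT_DISPATCH_ALL_TYPES` for `AT_DISPATCH_ALL_TYPES_AND`, whose extra first argument appends one more dtype to the dispatched set. A rough pure-Python analog of that dispatch pattern (the names and dtype-string tags here are illustrative, not ATen's):

```python
def make_kernel(name):
    # Stand-in for a type-specialized kernel instantiation.
    def kernel(x):
        return (name, x)
    return kernel

# "ALL_TYPES": the baseline set of dispatchable dtypes.
ALL_TYPES = {d: make_kernel(d) for d in ("int32", "int64", "float32", "float64")}

# "ALL_TYPES_AND(BFloat16, ...)": the same set plus one extra dtype.
ALL_TYPES_AND_BF16 = {**ALL_TYPES, "bfloat16": make_kernel("bfloat16")}

def dispatch(dtype, kernels, *args):
    # A runtime dtype tag selects the specialized kernel, or errors out,
    # mirroring the unsupported-dtype failure described in the summary.
    try:
        kernel = kernels[dtype]
    except KeyError:
        raise TypeError(f"unsupported dtype: {dtype}")
    return kernel(*args)
```

With the baseline set, dispatching a `"bfloat16"` input raises `TypeError`; with the extended set it runs the bf16-specialized kernel.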

fbgemm_gpu/test/sparse_ops_test.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1114,7 +1114,7 @@ def test_reorder_batched_ad_lengths_cpu(
     T=st.integers(min_value=1, max_value=20),
     L=st.integers(min_value=2, max_value=20),
     A=st.integers(min_value=1, max_value=20),
-    Dtype=st.sampled_from([torch.int32, torch.float, torch.int64]),
+    Dtype=st.sampled_from([torch.int32, torch.float, torch.int64, torch.bfloat16]),
     Itype=st.sampled_from([torch.int32, torch.int64]),
     broadcast_indices=st.booleans(),
 )
```
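The new `torch.bfloat16` case matters because bfloat16 is effectively a float32 with the low 16 mantissa bits dropped. A stdlib-only sketch of that truncation (real quantization typically rounds to nearest rather than truncating; this helper is illustrative and not part of the test suite):

```python
import struct

def to_bfloat16(x: float) -> float:
    # Reinterpret the float32 bits, keep sign + 8-bit exponent + top 7
    # mantissa bits, and zero the low 16 bits: bfloat16 by truncation.
    bits = struct.unpack(">I", struct.pack(">f", x))[0]
    return struct.unpack(">f", struct.pack(">I", bits & 0xFFFF0000))[0]
```

For example, `to_bfloat16(3.141592653589793)` yields `3.140625`: float32 pi is `0x40490FDB`, and truncating the low 16 bits leaves `0x40490000`.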
