Return if no data to allreduce (pytorch#3586)

xw285cornell · facebook-github-bot · commit 6a684cbdd174 · 2025-01-22T10:01:37.000-08:00
Summary: Pull Request resolved: pytorch#3586 X-link: facebookresearch/FBGEMM#669 When the input tensor is empty, return immediately. Otherwise num_thread would be 0 and the CUDA kernel launch would fail. Reviewed By: feikou, jianyuh Differential Revision: D68318641 fbshipit-source-id: ff1c0c401fc4884cef9ee71fdcccaa6d68e1bf80
diff --git a/fbgemm_gpu/experimental/gen_ai/src/comm/car.cu b/fbgemm_gpu/experimental/gen_ai/src/comm/car.cu
@@ -480,6 +480,10 @@ void one_shot_car_allreduce(
   TORCH_CHECK(y.numel() % 8 == 0);
   TORCH_CHECK(y.numel() < kMaxCAR);
   const auto N = y.numel();
+  if (N == 0) {
+    // no data to allreduce, return
+    return;
+  }
   if (z) {
     TORCH_CHECK(z->numel() == y.numel());
   }
diff --git a/fbgemm_gpu/experimental/gen_ai/test/comm/multi_gpu_car_test.py b/fbgemm_gpu/experimental/gen_ai/test/comm/multi_gpu_car_test.py
@@ -246,7 +246,7 @@ def _run_oneshot_car_stress_inner(path: str) -> None:
     torch.distributed.barrier()
 
     ITER = 1000
-    for idx, N in enumerate(np.logspace(4, 24, num=20, base=2).tolist()):
+    for idx, N in enumerate([0] + np.logspace(4, 24, num=20, base=2).tolist()):
         N = int(N)
 
         def round_up(a: int, b: int) -> int: