
Commit dcb347f

jasonjk-park authored and facebook-github-bot committed
Handle 0 inputs for gmm (pytorch#3901)
Summary:
Pull Request resolved: pytorch#3901
X-link: facebookresearch/FBGEMM#992

Add support for 0-sized inputs to the Triton grouped GEMM (gmm) kernel, and add a unit test covering the zero-size case.

Reviewed By: levendlee

Differential Revision: D72134331

fbshipit-source-id: 6b212993e0e270caefed27de892033ab75a57027
1 parent cfd715b commit dcb347f
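At its core, the change is a two-line early return in the kernel wrapper. A minimal, self-contained sketch of that pattern follows (my illustration, not the FBGEMM API: matmul_with_zero_guard and its plain-matmul body are stand-ins for the Triton grouped GEMM), mirroring the two lines added to _grouped_gemm in the diff below:

import torch

def matmul_with_zero_guard(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Allocate the output up front, then return it immediately when either
    # output dimension is zero, so no kernel ever runs on empty work.
    M, K = x.shape
    N = w.shape[0]
    assert K == w.shape[1]
    y = torch.empty((M, N), device=x.device, dtype=torch.bfloat16)
    if M == 0 or N == 0:
        return y  # empty (M, N) result; same shape/dtype contract as usual
    # The real code dispatches to the Triton grouped-GEMM kernel here;
    # a plain matmul stands in for it in this sketch.
    y.copy_(x.to(torch.float32) @ w.to(torch.float32).t())
    return y

x = torch.empty((0, 64))    # M == 0 input
w = torch.randn((128, 64))  # (N, K) weight, matching the kernel's asserts
print(matmul_with_zero_guard(x, w).shape)  # torch.Size([0, 128])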

File tree: 2 files changed (+8, -2 lines)


fbgemm_gpu/experimental/gemm/test/grouped_gemm_test.py

Lines changed: 6 additions & 2 deletions
@@ -45,6 +45,8 @@ def _test_grouped_gemm_fp8_rowwise(
             torch.randint(
                 low=0, high=M, size=[G - 1], device=device, dtype=torch.int32
             )
+            if M > 0
+            else torch.zeros([G - 1], device=device, dtype=torch.int32)
         )
         m_ends = m_ends.tolist()
         m_starts = [0] + m_ends
@@ -85,7 +87,7 @@ def _test_grouped_gemm_fp8_rowwise(
         torch.testing.assert_close(result, expected_result, atol=2e-2, rtol=1.6e-2)
 
     for G in (1, 4, 16):
-        for M in (64, 512):
+        for M in (0, 64, 512):
             for fast_accu in (True, False):
                 for ws in (True, False):
                     logging.info(
@@ -111,6 +113,8 @@ def _test_grouped_gemm_bf16(
             torch.randint(
                 low=0, high=M, size=[G - 1], device=device, dtype=torch.int32
             )
+            if M > 0
+            else torch.zeros([G - 1], device=device, dtype=torch.int32)
         )
         m_ends = m_ends.tolist()
         m_starts = [0] + m_ends
@@ -138,7 +142,7 @@ def _test_grouped_gemm_bf16(
         torch.testing.assert_close(result, expected_result, atol=1e-5, rtol=1.6e-2)
 
     for G in (1, 4, 16):
-        for M in (64, 512):
+        for M in (0, 64, 512):
             for ws in (True, False):
                 logging.info(f"Testing BF16 GMM with G={G}, M={M}")
                 _test_grouped_gemm_bf16(
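The conditional added to both test helpers is needed because torch.randint requires high > low, so high=M raises for M == 0; the tests fall back to all-zero group boundaries in that case. A standalone sketch of the edge case (assuming the surrounding test derives group starts and ends exactly as the context lines show):

import torch

G, M, device = 4, 0, "cpu"  # M == 0 is the new case the tests cover

m_ends, _ = torch.sort(
    torch.randint(low=0, high=M, size=[G - 1], device=device, dtype=torch.int32)
    if M > 0  # randint would raise "from >= to" when M == 0
    else torch.zeros([G - 1], device=device, dtype=torch.int32)
)
m_ends = m_ends.tolist()
m_starts = [0] + m_ends
print(m_starts, m_ends + [M])  # [0, 0, 0, 0] [0, 0, 0, 0]: every group is empty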

fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py

Lines changed: 2 additions & 0 deletions
@@ -780,6 +780,8 @@ def _grouped_gemm(
     assert K == w.shape[1]
 
     y = torch.empty((M, N), device=x.device, dtype=torch.bfloat16)
+    if M == 0 or N == 0:
+        return y
 
     NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
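Note the ordering of the guard: y is allocated before the early return, so callers of _grouped_gemm receive a correctly shaped (M, N) bfloat16 tensor even for zero-sized inputs, while the NUM_SMS query and the kernel launch that follow it are skipped entirely when there is no work to do.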
