Commit a6f9891

levendlee authored and liligwu committed
Makes use_fast_accum configurable. (pytorch#3829)
Summary:
Pull Request resolved: pytorch#3829
X-link: https://github.com/facebookresearch/FBGEMM/pull/913

[Public to OSS] Thanks htyu for pointing out the issue. Looking forward to warp specialization support on Nvidia!

- Exposes fast accumulation as a configurable option.
- Does not enable it by default, so there is no change in default behavior.
- No additional tuning with respect to `use_fast_accum=True`. With the HIP backend, the semantics of `c += tl.dot(a, b)` and `c = tl.dot(a, b, c)` appear to be the same.

Reviewed By: htyu

Differential Revision: D71290596

fbshipit-source-id: 8e2a20899f301f861d8d72f6290e573e23288e63
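
For readers browsing this commit, a minimal call-site sketch of the updated Python API follows. The import path mirrors the file location in this diff; the tensor layouts, the int32 dtype of m_sizes, and the (G * N, K) weight shape are assumptions for illustration, not taken from this change:

import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import grouped_gemm

G, M, N, K = 4, 512, 256, 256
device = torch.device("cuda")

# Assumed layout: x stacks the rows of all G groups (M rows total, K columns),
# w stacks the G weight matrices along dim 0, and m_sizes holds per-group row
# counts that sum to M.
x = torch.randn(M, K, dtype=torch.bfloat16, device=device)
w = torch.randn(G * N, K, dtype=torch.bfloat16, device=device)
m_sizes = torch.full((G,), M // G, dtype=torch.int32, device=device)

y_default = grouped_gemm(x, w, m_sizes)                    # default path, unchanged
y_fast = grouped_gemm(x, w, m_sizes, use_fast_accum=True)  # opt in to fast accumulation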
1 parent 12df261 commit a6f9891

2 files changed (+29, −7 lines)

fbgemm_gpu/experimental/gemm/test/grouped_gemm_test.py

Lines changed: 9 additions & 2 deletions
@@ -35,6 +35,7 @@ def test_grouped_gemm_fp8_rowwise(self) -> None:
         def _test_grouped_gemm_fp8_rowwise(
             shape: Tuple[int, int, int, int],
             device: torch.device,
+            fast_accu: bool,
         ) -> None:
             G, M, N, K = shape
             a = torch.randn(M, K, dtype=torch.bfloat16, device=device)
@@ -60,6 +61,7 @@ def _test_grouped_gemm_fp8_rowwise(
                 m_sizes,
                 a_scale,
                 b_scale,
+                use_fast_accum=fast_accu,
             )
             self.assertTrue(result.shape == (M, N))
 
@@ -82,8 +84,13 @@ def _test_grouped_gemm_fp8_rowwise(
 
         for G in (1, 4, 16):
             for M in (64, 512):
-                logging.info(f"Testing FP8 GMM with G={G}, M={M}")
-                _test_grouped_gemm_fp8_rowwise((G, M, 256, 256), torch.device("cuda"))
+                for fast_accu in (True, False):
+                    logging.info(
+                        f"Testing FP8 GMM with G={G}, M={M}, FastAccu={fast_accu}"
+                    )
+                    _test_grouped_gemm_fp8_rowwise(
+                        (G, M, 256, 256), torch.device("cuda"), fast_accu=fast_accu
+                    )
 
     def test_grouped_gemm_bf16(self) -> None:
         def _test_grouped_gemm_bf16(

fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py

Lines changed: 20 additions & 5 deletions
@@ -142,6 +142,7 @@ def _kernel_grouped_gemm(
     NUM_SMS: tl.constexpr,
     USE_TMA_LOAD: tl.constexpr,
     USE_TMA_STORE: tl.constexpr,
+    USE_FAST_ACCUM: tl.constexpr,
     # tile sizes
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -208,7 +209,10 @@ def _kernel_grouped_gemm(
                     [BLOCK_SIZE_N, BLOCK_SIZE_K],
                     dtype,
                 )
-                accumulator += tl.dot(a, b.T)
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)
             else:
                 offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
                 offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
@@ -283,6 +287,7 @@ def _kernel_grouped_gemm_fp8_rowwise(
     NUM_SMS: tl.constexpr,
     USE_TMA_LOAD: tl.constexpr,
     USE_TMA_STORE: tl.constexpr,
+    USE_FAST_ACCUM: tl.constexpr,
     # tile sizes
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -349,7 +354,10 @@ def _kernel_grouped_gemm_fp8_rowwise(
                     [BLOCK_SIZE_N, BLOCK_SIZE_K],
                     dtype,
                 )
-                accumulator += tl.dot(a, b.T)
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)
             else:
                 offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
                 offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
@@ -410,6 +418,7 @@ def _grouped_gemm(
     m_sizes: torch.Tensor,
     x_scale: Optional[torch.Tensor] = None,
     w_scale: Optional[torch.Tensor] = None,
+    use_fast_accum: bool = False,
 ) -> torch.Tensor:
     if not utils.HAS_TMA_DESC:
         raise NotImplementedError("Grouped GEMM without TMA is not supported yet")
@@ -493,6 +502,7 @@ def grid(META):
             NUM_SMS,
             USE_TMA_LOAD,
             USE_TMA_STORE,
+            use_fast_accum,
         )
     else:
         assert x_scale is None
@@ -510,15 +520,19 @@ def grid(META):
             NUM_SMS,
             USE_TMA_LOAD,
             USE_TMA_STORE,
+            use_fast_accum,
         )
 
     return y
 
 
 def grouped_gemm(
-    x: torch.Tensor, w: torch.Tensor, m_sizes: torch.Tensor
+    x: torch.Tensor,
+    w: torch.Tensor,
+    m_sizes: torch.Tensor,
+    use_fast_accum: bool = False,
 ) -> torch.Tensor:
-    return _grouped_gemm(x, w, m_sizes)
+    return _grouped_gemm(x, w, m_sizes, use_fast_accum=use_fast_accum)
 
 
 def grouped_gemm_fp8_rowwise(
@@ -527,5 +541,6 @@ def grouped_gemm_fp8_rowwise(
     m_sizes: torch.Tensor,
     x_scale: torch.Tensor,
     w_scale: torch.Tensor,
+    use_fast_accum: bool = False,
 ) -> torch.Tensor:
-    return _grouped_gemm(x, w, m_sizes, x_scale, w_scale)
+    return _grouped_gemm(x, w, m_sizes, x_scale, w_scale, use_fast_accum=use_fast_accum)
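
The kernel-side change above comes down to choosing between the two accumulation forms mentioned in the summary. As a rough illustration, here is a standalone toy Triton kernel (not the FBGEMM kernel; the single-tile layout and pointer arithmetic are assumptions) showing how a constexpr flag selects between passing the running accumulator into tl.dot and adding the dot result afterwards:

import triton
import triton.language as tl


@triton.jit
def _toy_matmul_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    K,
    N,
    USE_FAST_ACCUM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    # Toy kernel: computes one BLOCK_M x BLOCK_N tile of A @ B, only to show
    # how the constexpr flag picks the accumulation form.
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        offs_k = k + tl.arange(0, BLOCK_K)
        a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
        b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
        if USE_FAST_ACCUM:
            # Fused form: the running accumulator is handed to tl.dot.
            acc = tl.dot(a, b, acc)
        else:
            # Separate form: tl.dot produces a fresh fp32 tile that is then added.
            acc += tl.dot(a, b)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], acc)

On Nvidia, the fused form can let the backend keep accumulation inside the MMA pipeline, which for FP8 inputs may trade some accumulation precision for speed; on the HIP backend, as the summary notes, the two forms appear to behave the same.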
