@@ -36,6 +36,7 @@ def _test_grouped_gemm_fp8_rowwise(
             shape: Tuple[int, int, int, int],
             device: torch.device,
             fast_accu: bool,
+            use_warp_specialization: bool,
         ) -> None:
             G, M, N, K = shape
             a = torch.randn(M, K, dtype=torch.bfloat16, device=device)
@@ -62,6 +63,7 @@ def _test_grouped_gemm_fp8_rowwise(
                 a_scale,
                 b_scale,
                 use_fast_accum=fast_accu,
+                _use_warp_specialization=use_warp_specialization,
             )
             self.assertTrue(result.shape == (M, N))
 
@@ -85,17 +87,22 @@ def _test_grouped_gemm_fp8_rowwise(
         for G in (1, 4, 16):
             for M in (64, 512):
                 for fast_accu in (True, False):
-                    logging.info(
-                        f"Testing FP8 GMM with G={G}, M={M}, FastAccu={fast_accu}"
-                    )
-                    _test_grouped_gemm_fp8_rowwise(
-                        (G, M, 256, 256), torch.device("cuda"), fast_accu=fast_accu
-                    )
+                    for ws in (True, False):
+                        logging.info(
+                            f"Testing FP8 GMM with G={G}, M={M}, FastAccu={fast_accu}"
+                        )
+                        _test_grouped_gemm_fp8_rowwise(
+                            (G, M, 256, 256),
+                            torch.device("cuda"),
+                            fast_accu=fast_accu,
+                            use_warp_specialization=ws,
+                        )
 
     def test_grouped_gemm_bf16(self) -> None:
         def _test_grouped_gemm_bf16(
             shape: Tuple[int, int, int, int],
             device: torch.device,
+            use_warp_specialization: bool,
         ) -> None:
             G, M, N, K = shape
             a = torch.randn(M, K, dtype=torch.bfloat16, device=device)
@@ -116,6 +123,7 @@ def _test_grouped_gemm_bf16(
                 a,
                 b,
                 m_sizes,
+                _use_warp_specialization=use_warp_specialization,
             )
             self.assertTrue(result.shape == (M, N))
 
@@ -131,5 +139,10 @@ def _test_grouped_gemm_bf16(
 
         for G in (1, 4, 16):
             for M in (64, 512):
-                logging.info(f"Testing BF16 GMM with G={G}, M={M}")
-                _test_grouped_gemm_bf16((G, M, 256, 256), torch.device("cuda"))
+                for ws in (True, False):
+                    logging.info(f"Testing BF16 GMM with G={G}, M={M}")
+                    _test_grouped_gemm_bf16(
+                        (G, M, 256, 256),
+                        torch.device("cuda"),
+                        use_warp_specialization=ws,
+                    )