
Commit 851815d

jwfromm authored and facebook-github-bot committed
Enable groupwise scales for F8I4 Grouped Gemm (pytorch#3884)
Summary:

X-link: facebookresearch/FBGEMM#975

Pull Request resolved: pytorch#3884

Due to cutlass support limitations, we previously required that F8I4 grouped gemm use rowwise scales for its weights. This left a lot of accuracy on the table compared to groupwise scales (which we use for the standard f8i4 gemm). This diff adds support for groupwise scaling and lifts the restriction from our implementation.

Reviewed By: jiawenliu64

Differential Revision: D71905839

fbshipit-source-id: f49d54aad558730992c79fe1c714ba3a66fa523e
1 parent 6a6db7c commit 851815d
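
For context on the tradeoff the summary describes, here is a minimal sketch of rowwise versus groupwise weight scales. The sizes and the max-abs scaling rule are illustrative assumptions, not the FBGEMM quantization recipe:

import torch

# Illustrative sizes; a real F8I4 weight would be quantized to int4.
N, K, group_size = 16, 256, 64
w = torch.randn(N, K)

# Rowwise scaling: one scale per output row, i.e. group_size == K.
row_scale = w.abs().amax(dim=1) / 7.0  # [N]; 7 is the max magnitude of int4

# Groupwise scaling: one scale per group_size slice along K, giving
# K // group_size scale groups per row and finer-grained accuracy.
num_scale_groups = K // group_size
group_scale = w.view(N, num_scale_groups, group_size).abs().amax(dim=2) / 7.0  # [N, num_scale_groups]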

File tree

2 files changed: +1 -8 lines changed


fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 1 addition & 4 deletions
@@ -1402,10 +1402,7 @@ def preprocess(self, x, w):
         m_sizes = torch.tensor(m_values).to(dtype=torch.int32, device=x[0].device)
         # Quantize weights.
         # TODO Only rowwise scaling is currently supported. This needs to be fixed.
-        K = x[0].shape[-1]
-        wq, row_scale, group_scale = zip(
-            *[quantize_int4_preshuffle(i, group_size=K) for i in w]
-        )
+        wq, row_scale, group_scale = zip(*[quantize_int4_preshuffle(i) for i in w])
         # Group weights as single tensor.
         wq = torch.stack(wq, dim=0).contiguous()
         row_scale = torch.stack(row_scale, dim=0).contiguous()
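
The changed line uses the zip(*...) transpose idiom: quantizing each expert's weight yields one (wq, row_scale, group_scale) tuple, and zip regroups them into one sequence per kind before stacking. A minimal sketch; fake_quantize is a hypothetical stand-in whose return shape merely mirrors how quantize_int4_preshuffle is used here:

import torch

def fake_quantize(w):
    # Hypothetical stand-in for quantize_int4_preshuffle: returns one
    # (wq, row_scale, group_scale) tuple per weight tensor.
    wq = w.to(torch.int8)                            # placeholder "quantized" weight
    row_scale = w.abs().amax(dim=1)                  # [N]
    group_scale = w.abs().amax(dim=1, keepdim=True)  # placeholder [N, 1]
    return wq, row_scale, group_scale

weights = [torch.randn(8, 32) for _ in range(4)]  # G = 4 experts

# Transpose the list of per-expert tuples into one tuple per kind.
wq, row_scale, group_scale = zip(*[fake_quantize(w) for w in weights])

# Stack each kind into a single [G, ...] tensor, as the benchmark code does.
wq = torch.stack(wq, dim=0).contiguous()                    # [G, N, K]
row_scale = torch.stack(row_scale, dim=0).contiguous()      # [G, N]
group_scale = torch.stack(group_scale, dim=0).contiguous()  # [G, N, 1]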

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8i4bf16_shuffled_grouped.cu

Lines changed: 0 additions & 4 deletions
@@ -145,10 +145,6 @@ void _f8i4bf16_shuffled_grouped(
   // Group scales should have shape [G, num_scale_groups, 8, N]
   int num_scale_groups = w_scale_group.size(1);
   int group_size = K / num_scale_groups;
-  TORCH_CHECK(
-      num_scale_groups == 1,
-      "Mixed dtype grouped gemm only supports rowwise scaling currently (group_size=K).");
-
   // Define cutlass types.
   using ProblemShape = cutlass::gemm::GroupProblemShape<
       cute::Shape<int, int, int>>; // <M,N,K> per group.
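
With the TORCH_CHECK gone, the only invariant left is the divisibility the surrounding lines compute: the kernel derives group_size from the scale tensor's second dimension. A sketch of that shape bookkeeping, with illustrative sizes and dtype:

import torch

G, N, K = 4, 512, 4096   # illustrative problem sizes
group_size = 128         # any divisor of K is now accepted, not only K itself
num_scale_groups = K // group_size

# Per the kernel comment, group scales have shape [G, num_scale_groups, 8, N];
# the dtype here is an assumption for illustration.
w_scale_group = torch.ones(G, num_scale_groups, 8, N)

# Mirror the kernel's derivation: group_size = K / w_scale_group.size(1).
assert K % w_scale_group.size(1) == 0
assert K // w_scale_group.size(1) == group_size

# Rowwise scaling is the special case num_scale_groups == 1 (group_size == K),
# which the removed check used to require.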
