
Commit aec40d1

jwfromm authored and facebook-github-bot committed
FBGEMM Add Columnwise Weight Scaling to F8I4 GEMM (pytorch#3766)
Summary:
X-link: facebookresearch/FBGEMM#847

One of the interesting new changes in the preshuffled F8I4 kernel is that group scales are downcast to FP8. This risks dynamic range issues that can impact accuracy. We mitigate that risk by adding FP32 columnwise scaling to the output. Fortunately, we can do this using EVT, so the performance impact is negligible.

Reviewed By: jiawenliu64

Differential Revision: D70587477
1 parent b44b473 commit aec40d1

File tree

3 files changed: +77 -39 lines changed
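For context, here is a minimal sketch of the two-level weight scaling the summary describes: a per-row FP32 scale is factored out first (and later applied to the GEMM output columnwise via EVT), so the per-group scales that get stored in FP8 stay within its dynamic range. The helper name, group size of 128, and use of 448 as the E4M3 maximum are illustrative assumptions, not the FBGEMM implementation; it assumes PyTorch >= 2.1 for torch.float8_e4m3fn.

import torch

# Toy sketch only (assumed names and shapes; not the FBGEMM implementation).
def toy_two_level_quantize(w: torch.Tensor, group_size: int = 128):
    # Step 1: per-row FP32 scale, analogous to quantize_fp8_row. Keeping this
    # scale in FP32 preserves dynamic range; the kernel applies it to the
    # output in the epilogue.
    w_scale = w.abs().amax(dim=1, keepdim=True).float().clamp_min(1e-12) / 448.0
    w_fp8 = (w.float() / w_scale).to(torch.float8_e4m3fn).float()

    # Step 2: int4 group quantization of the already-normalized weights.
    # The group scales are now narrow enough to store safely in FP8.
    groups = w_fp8.reshape(w.shape[0], -1, group_size)
    w_scale_group = groups.abs().amax(dim=-1, keepdim=True).clamp_min(1e-12) / 7.0
    wq = (groups / w_scale_group).round().clamp(-8, 7).to(torch.int8)

    return (
        wq.reshape(w.shape),                    # int4 values stored in int8
        w_scale.squeeze(1),                     # FP32, one per weight row
        w_scale_group.to(torch.float8_e4m3fn),  # FP8, one per group
    )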

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 14 additions & 8 deletions
@@ -1297,27 +1297,33 @@ def _int4_row_quantize(
         out = out.to(dtype=torch.int8).reshape(x.shape)
 
         # Scales should be in [num_groups, N] layout.
-        scales = scales.view(x.shape[0], -1).t().contiguous()
+        scales = scales.view(x.shape[0], -1).t().contiguous().to(torch.float8_e4m3fn)
 
         return out, scales
 
     def quantize(self, x, w):
         # Quantize both input tensors.
         xq, x_scale = quantize_fp8_row(x)
-        wq, w_scale = self._int4_row_quantize(w)
+        # Weight quantization happens in two steps. First we quantize to fp8
+        # then to int4.
+        wq, w_scale = quantize_fp8_row(w)
+        # Now quantize to int4 with group scaling.
+        wq, w_scale_group = self._int4_row_quantize(wq)
         # Pack int4 values together.
         wq = self._pack_int4(wq)
         # Shuffle weights and scales for faster compute.
-        wq, w_scale = torch.ops.fbgemm.preshuffle_i4(wq, w_scale)
-        return xq, wq, x_scale, w_scale
+        wq, w_scale_group = torch.ops.fbgemm.preshuffle_i4(wq, w_scale_group)
+        return xq, wq, x_scale, w_scale, w_scale_group
 
-    def compute(self, xq, wq, x_scale, w_scale):
-        out = torch.ops.fbgemm.f8i4bf16_shuffled(xq, wq, x_scale, w_scale)
+    def compute(self, xq, wq, x_scale, w_scale, w_scale_group):
+        out = torch.ops.fbgemm.f8i4bf16_shuffled(
+            xq, wq, x_scale, w_scale, w_scale_group
+        )
         return out
 
     def quantize_and_compute(self, x, w):
-        xq, wq, x_scale, w_scale = self.quantize(x, w)
-        return self.compute(xq, wq, x_scale, w_scale)
+        xq, wq, x_scale, w_scale, w_scale_group = self.quantize(x, w)
+        return self.compute(xq, wq, x_scale, w_scale, w_scale_group)
 
     @property
     def name(self) -> str:
fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8i4bf16_shuffled.cu

Lines changed: 60 additions & 29 deletions
@@ -33,16 +33,17 @@ at::Tensor _f8i4bf16_shuffled(
     at::Tensor XQ,
     at::Tensor WQ,
     at::Tensor x_scale,
-    at::Tensor w_scale) {
+    at::Tensor w_scale,
+    at::Tensor w_scale_group) {
   // Get shape information from input tensors.
   int M = XQ.size(0);
   int K = XQ.size(1);
   int N = WQ.size(0);
-  // Make sure w_scale is in proper format.
+  // Make sure w_scale_group is in proper format.
   TORCH_CHECK(
-      w_scale.size(1) == 8,
-      "Weights and scales must be prepacked with preshuffle_i4.");
-  int num_groups = w_scale.size(0);
+      w_scale_group.size(1) == 8,
+      "Weights and group scales must be prepacked with preshuffle_i4.");
+  int num_groups = w_scale_group.size(0);
   int group_size = K / num_groups;
   // Allocate output.
   at::Tensor Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));
@@ -108,7 +109,15 @@ at::Tensor _f8i4bf16_shuffled(
   using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
 
   // Define EVT for rowwise scaling.
-  using XScale = cutlass::epilogue::fusion::Sm90RowBroadcast<
+  // Implement rowwise scaling epilogue.
+  using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<
+      0,
+      TileShape,
+      ElementAccumulator,
+      ElementAccumulator,
+      cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>>;
+
+  using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<
       0,
       TileShape,
       ElementAccumulator,
@@ -119,12 +128,21 @@ at::Tensor _f8i4bf16_shuffled(
 
   using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
       cutlass::multiplies,
-      ElementC, // First stage output type.
+      ElementAccumulator, // First stage output type.
       ElementAccumulator, // First stage input types.
       cutlass::FloatRoundStyle::round_to_nearest>;
 
+  using EVTCompute0 =
+      cutlass::epilogue::fusion::Sm90EVT<Compute0, WScale, Accum>;
+
+  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies,
+      ElementC,
+      ElementAccumulator, // Second stage input types.
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
   using EpilogueEVT =
-      cutlass::epilogue::fusion::Sm90EVT<Compute0, XScale, Accum>;
+      cutlass::epilogue::fusion::Sm90EVT<Compute1, XScale, EVTCompute0>;
 
   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<
@@ -192,7 +210,8 @@ at::Tensor _f8i4bf16_shuffled(
       layout_B_reordered,
       reinterpret_cast<ElementA*>(XQ.data_ptr()),
       stride_A,
-      reinterpret_cast<cutlass::Array<ElementScale, 8>*>(w_scale.data_ptr()),
+      reinterpret_cast<cutlass::Array<ElementScale, 8>*>(
+          w_scale_group.data_ptr()),
       stride_S,
       group_size},
      {{},
@@ -202,8 +221,14 @@ at::Tensor _f8i4bf16_shuffled(
       stride_C}};
 
   arguments.epilogue.thread = {
-      {reinterpret_cast<ElementAccumulator*>(x_scale.data_ptr())}, // x_scale
-      {}, // Accumulator
+      {reinterpret_cast<ElementAccumulator*>(w_scale.data_ptr())}, // w_scale
+      // compute_0
+      {
+          {reinterpret_cast<ElementAccumulator*>(
+              x_scale.data_ptr())}, // x_scale
+          {}, // Accumulator
+          {} // Multiplies
+      },
       {}, // Multiplies
   };
 
@@ -212,10 +237,11 @@ at::Tensor _f8i4bf16_shuffled(
 
   // Using the arguments, query for extra workspace required for matrix
   // multiplication computation
-  size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
+  int workspace_size = GemmShuffled::get_workspace_size(arguments);
 
   // Allocate workspace memory
-  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+  at::Tensor workspace =
+      at::empty(workspace_size, XQ.options().dtype(at::kByte));
 
   // Check the problem size is supported or not
   cutlass::Status status = gemm.can_implement(arguments);
@@ -224,7 +250,7 @@ at::Tensor _f8i4bf16_shuffled(
   }
 
   // Initialize CUTLASS kernel with arguments and workspace pointer
-  status = gemm.initialize(arguments, workspace.get());
+  status = gemm.initialize(arguments, workspace.data_ptr());
   if (status != cutlass::Status::kSuccess) {
     throw std::runtime_error("cutlass cannot initialize");
   }
@@ -245,54 +271,58 @@ at::Tensor f8i4bf16_shuffled(
     at::Tensor XQ,
     at::Tensor WQ,
     at::Tensor x_scale,
-    at::Tensor w_scale) {
+    at::Tensor w_scale,
+    at::Tensor w_scale_group) {
   int M = XQ.size(0);
   int K = XQ.size(1);
   int N = WQ.size(0);
   // Use shape heuristics to dispatch to optimized kernel configuration.
   if (M <= 16) {
-    return _f8i4bf16_shuffled<64, 16, 2, 1, 1, false>(XQ, WQ, x_scale, w_scale);
+    return _f8i4bf16_shuffled<64, 16, 2, 1, 1, false>(
+        XQ, WQ, x_scale, w_scale, w_scale_group);
   } else if (M <= 32) {
-    return _f8i4bf16_shuffled<64, 32, 2, 1, 1, false>(XQ, WQ, x_scale, w_scale);
+    return _f8i4bf16_shuffled<64, 32, 2, 1, 1, false>(
+        XQ, WQ, x_scale, w_scale, w_scale_group);
   } else if (M <= 64) {
-    return _f8i4bf16_shuffled<64, 64, 2, 1, 1, false>(XQ, WQ, x_scale, w_scale);
+    return _f8i4bf16_shuffled<64, 64, 2, 1, 1, false>(
+        XQ, WQ, x_scale, w_scale, w_scale_group);
   } else if (M <= 128) {
     return _f8i4bf16_shuffled<64, 128, 2, 1, 1, false>(
-        XQ, WQ, x_scale, w_scale);
+        XQ, WQ, x_scale, w_scale, w_scale_group);
   } else if (M <= 256) {
     if (N <= 4096) {
       return _f8i4bf16_shuffled<64, 128, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
     } else {
       return _f8i4bf16_shuffled<64, 256, 1, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
     }
   } else if (M <= 512) {
     if (N <= 4096) {
       return _f8i4bf16_shuffled<64, 256, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
     } else {
       return _f8i4bf16_shuffled<128, 256, 2, 1, 1, true>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
     }
   } else if (M <= 1024) {
     if (N <= 1024) {
       return _f8i4bf16_shuffled<64, 128, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
    } else if (N <= 2048) {
      return _f8i4bf16_shuffled<64, 256, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
    } else {
      return _f8i4bf16_shuffled<128, 256, 2, 1, 1, true>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
    }
  } else {
    if (N <= 1024) {
      return _f8i4bf16_shuffled<64, 256, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
    } else {
      return _f8i4bf16_shuffled<128, 256, 2, 1, 1, true>(
-          XQ, WQ, x_scale, w_scale);
+          XQ, WQ, x_scale, w_scale, w_scale_group);
    }
  }
}
@@ -303,7 +333,8 @@ at::Tensor f8i4bf16_shuffled(
     at::Tensor XQ,
     at::Tensor WQ,
     at::Tensor x_scale,
-    at::Tensor w_scale) {
+    at::Tensor w_scale,
+    at::Tensor w_scale_group) {
   throw std::runtime_error(
       "CUDA version is older than 12.0"); // requires CUDA>=12
 }
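Reading the EVT hunks together: the accumulator (which already has the FP8 group scales folded in by the mainloop) is multiplied first by the per-output-column weight scale (Sm90RowBroadcast) and then by the per-output-row activation scale (Sm90ColBroadcast), all in FP32, before the final cast to BF16. In index notation (notation mine, not from the source):

Y_{m,n} = \mathrm{bf16}\!\left(\text{x\_scale}_m \cdot \left(\text{w\_scale}_n \cdot \text{acc}_{m,n}\right)\right)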

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp

Lines changed: 3 additions & 2 deletions
@@ -136,7 +136,8 @@ at::Tensor f8i4bf16_shuffled(
     at::Tensor XQ,
     at::Tensor WQ,
     at::Tensor x_scale,
-    at::Tensor w_scale);
+    at::Tensor w_scale,
+    at::Tensor w_scale_group);
 std::tuple<at::Tensor, at::Tensor> preshuffle_i4(
     at::Tensor WQ,
     at::Tensor w_scale);
@@ -198,7 +199,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "f8i4bf16_rowwise(Tensor XQ, Tensor WQ, Tensor x_scale, Tensor w_scale, Tensor w_zp) -> Tensor");
   m.def(
-      "f8i4bf16_shuffled(Tensor XQ, Tensor WQ, Tensor x_scale, Tensor w_scale) -> Tensor");
+      "f8i4bf16_shuffled(Tensor XQ, Tensor WQ, Tensor x_scale, Tensor w_scale, Tensor w_scale_group) -> Tensor");
   m.def("preshuffle_i4(Tensor WQ, Tensor w_scale) -> (Tensor, Tensor)");
   m.def("bf16_fast_gemv(Tensor X, Tensor W) -> Tensor");
   m.def("bf16fp8bf16_fast_gemv(Tensor X, Tensor W, Tensor w_scale) -> Tensor");
