
Commit d9e4e3f

jwfromm authored and facebook-github-bot committed
Provide helper functions for int4 quantization (pytorch#3775)
Summary:
X-link: facebookresearch/FBGEMM#855

This diff introduces a set of quantization helper functions to fbgemm_gpu/experimental/gen_ai to make it easier to apply the new Int4 packing and preshuffling to weights.

Reviewed By: summerdengfb

Differential Revision: D70643388
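For orientation, a minimal usage sketch assembled from the call patterns in the diffs below; the tensor shapes, the bf16 inputs, and a working CUDA build of fbgemm_gpu are assumptions, not part of the change itself:

import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row
from fbgemm_gpu.experimental.gen_ai.quantize import quantize_int4_preshuffle

# Assumed shapes: activations [M, K] and weights [N, K], bf16 on GPU.
x = torch.randn(128, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.randn(2048, 4096, device="cuda", dtype=torch.bfloat16)

# One-time weight preparation: FP8 row quantization, INT4 group quantization,
# int4 packing, and preshuffling, all wrapped by the new helper.
wq, row_scale, group_scale = quantize_int4_preshuffle(w)

# Per-call activation quantization followed by the mixed-dtype GEMM.
xq, x_scale = quantize_fp8_row(x)
y = torch.ops.fbgemm.f8i4bf16_shuffled(xq, wq, x_scale, row_scale, group_scale)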
1 parent aec40d1 commit d9e4e3f

File tree

5 files changed: +223 −70 lines changed


fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 36 additions & 41 deletions
@@ -24,6 +24,7 @@
 from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
     grouped_gemm_fp8_rowwise,
 )
+from fbgemm_gpu.experimental.gen_ai.quantize import quantize_int4_preshuffle
 from tinygemm.utils import group_quantize_tensor

 if torch.cuda.is_available() and torch.version.cuda:
@@ -1277,58 +1278,52 @@ def cuda(self) -> bool:


 @register_quantize_op
-class F8I4ShuffledGemm(F8I4RowwiseGemm):
-    def _int4_row_quantize(
-        self,
-        x: torch.Tensor,
-        group_size: int = 128,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        n_bit = 4  # Number of target bits.
-        to_quant = x.reshape(-1, group_size).to(torch.float)
-
-        max_val = torch.abs(to_quant).amax(dim=1, keepdim=True)
-        max_int = 2 ** (n_bit - 1)
-        min_int = -(2 ** (n_bit - 1))
-        scales = max_val.clamp(min=1e-6) / max_int
-
-        out = to_quant.div(scales).round().clamp_(min_int, max_int - 1)
-
-        # Cast to int8 and restore shape.
-        out = out.to(dtype=torch.int8).reshape(x.shape)
-
-        # Scales should be in [num_groups, N] layout.
-        scales = scales.view(x.shape[0], -1).t().contiguous().to(torch.float8_e4m3fn)
-
-        return out, scales
+class F8I4ShuffledGemm(QuantizeOpBase):
+    def preprocess(self, x, w):
+        # Prequantize and pack weights.
+        wq, row_scale, group_scale = quantize_int4_preshuffle(w)
+        return x, wq, row_scale, group_scale

-    def quantize(self, x, w):
+    def quantize(self, x, wq, row_scale, group_scale):
         # Quantize both input tensors.
         xq, x_scale = quantize_fp8_row(x)
-        # Weight quantization happens in two steps. First we quantize to fp8
-        # then to int4.
-        wq, w_scale = quantize_fp8_row(w)
-        # Now quantize to int4 with group scaling.
-        wq, w_scale_group = self._int4_row_quantize(wq)
-        # Pack int4 values together.
-        wq = self._pack_int4(wq)
-        # Shuffle weights and scales for faster compute.
-        wq, w_scale_group = torch.ops.fbgemm.preshuffle_i4(wq, w_scale_group)
-        return xq, wq, x_scale, w_scale, w_scale_group
+        return xq, wq, x_scale, row_scale, group_scale

-    def compute(self, xq, wq, x_scale, w_scale, w_scale_group):
-        out = torch.ops.fbgemm.f8i4bf16_shuffled(
-            xq, wq, x_scale, w_scale, w_scale_group
+    def compute(self, xq, wq, x_scale, row_scale, group_scale):
+        # Handle batched cases by looping over each batch.
+        if xq.dim() == 3:
+            B, M, _ = xq.shape
+            _, N, _ = wq.shape
+            y = torch.empty((B, M, N), device=xq.device, dtype=torch.bfloat16)
+            for i in range(B):
+                y[i] = torch.ops.fbgemm.f8i4bf16_shuffled(
+                    xq[i], wq[i], x_scale[i], row_scale[i], group_scale[i]
+                )
+            return y
+        # Otherwise run gemm normally.
+        return torch.ops.fbgemm.f8i4bf16_shuffled(
+            xq, wq, x_scale, row_scale, group_scale
         )
-        return out

-    def quantize_and_compute(self, x, w):
-        xq, wq, x_scale, w_scale, w_scale_group = self.quantize(x, w)
-        return self.compute(xq, wq, x_scale, w_scale, w_scale_group)
+    def quantize_and_compute(self, x, wq, row_scale, group_scale):
+        xq, wq, x_scale, row_scale, group_scale = self.quantize(
+            x, wq, row_scale, group_scale
+        )
+        return self.compute(xq, wq, x_scale, row_scale, group_scale)

     @property
     def name(self) -> str:
         return "cutlass_f8i4_preshuffle"

+    @property
+    def hip(self) -> bool:
+        # Not yet supported on AMD.
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+

 @register_quantize_op
 class BF16I4RowwiseGemm(F8I4RowwiseGemm):
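A rough sketch of how the reworked benchmark op might be driven end to end; the direct no-argument construction and the import path for F8I4ShuffledGemm are assumptions, since the real harness goes through the quantize-op registry:

import torch

# Assumed import path (the class lives in the file shown above) and assumed
# no-argument construction; the benchmark harness normally drives registered
# ops itself.
from fbgemm_gpu.experimental.gen_ai.bench.quantize_ops import F8I4ShuffledGemm

x = torch.randn(128, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.randn(2048, 4096, device="cuda", dtype=torch.bfloat16)

op = F8I4ShuffledGemm()
# preprocess() now does all weight-side work once via quantize_int4_preshuffle.
x, wq, row_scale, group_scale = op.preprocess(x, w)
# quantize() only has to handle the activation on each iteration.
xq, wq, x_scale, row_scale, group_scale = op.quantize(x, wq, row_scale, group_scale)
# compute() loops over the batch dimension when xq is 3D, else runs one GEMM.
y = op.compute(xq, wq, x_scale, row_scale, group_scale)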
fbgemm_gpu/experimental/gen_ai/quantize.py (new file)

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+# Helper functions for using FBGEMM quantized operators.
+
+from typing import Tuple
+
+import torch
+
+from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row
+
+
+def pack_int4(x: torch.Tensor) -> torch.Tensor:
+    # Given int8 x, pack adjacent int4 values into a single int8.
+    low_x = x[:, ::2]
+    high_x = x[:, 1::2]
+
+    # High bits need to left shift, this also masks off extra bits.
+    high_x = torch.bitwise_left_shift(high_x, 4)
+    # Low bits need to have sign bits removed.
+    low_x = torch.bitwise_and(low_x, 0xF)
+
+    # Recombine into a single value with bitwise or.
+    return torch.bitwise_or(low_x, high_x).contiguous()
+
+
+def int4_row_quantize(
+    x: torch.Tensor,
+    group_size: int = 128,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Helper function to quantize a tensor to int4 with groupwise scales.
+
+    Args:
+        x (Tensor): [N, K] Higher precision weight tensor to quantize.
+        group_size (int): Number of elements to calculate group scale for.
+    Returns:
+        wq (Tensor): [N, K] Quantized int4 values stored unpacked in int8 elements.
+        group_scale (Tensor): [K / group_size, N] FP32 Scale per group.
+    """
+    n_bit = 4  # Number of target bits.
+    to_quant = x.reshape(-1, group_size).to(torch.float)
+
+    max_val = torch.abs(to_quant).amax(dim=1, keepdim=True)
+    max_int = 2 ** (n_bit - 1)
+    min_int = -(2 ** (n_bit - 1))
+    scales = max_val.clamp(min=1e-6) / max_int
+
+    out = to_quant.div(scales).round().clamp_(min_int, max_int - 1)
+
+    # Cast to int8 and restore shape.
+    out = out.to(dtype=torch.int8).reshape(x.shape)
+
+    # Scales should be in [num_groups, N] layout.
+    scales = scales.view(x.shape[0], -1).t().contiguous()
+
+    return out, scales
+
+
+def quantize_int4_preshuffle(
+    w: torch.Tensor, group_size: int = 128
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Quantizes an input weight tensor to int4 using preshuffling and scale packing.
+    This function is intended to be used with fbgemm's mixed dtype kernels and is expected
+    to be applied to weights ahead of time. As such, it is not perfectly optimized.
+
+    Args:
+        w (Tensor): [N, K] Higher precision weight tensor to quantize. May optionally have a batch dimension.
+        group_size (int): Number of elements to calculate group scale for, must be at least 128.
+    Returns:
+        wq (Tensor): [N, K // 2] Quantized int4 weight tensor packed into int8 elements.
+        row_scale (Tensor): [N] FP32 Scale per row of the weight tensor.
+        group_scale (Tensor): [K / group_size, 8, N] FP8 Scale per group of the weight tensor.
+    """
+
+    def _quantize(w: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # Start by lowering weights to FP8 and producing row scales.
+        wq, row_scale = quantize_fp8_row(w)
+
+        # Now reduce to INT4.
+        wq, group_scale = int4_row_quantize(wq, group_size)
+        # Reduce group scale to FP8.
+        group_scale = group_scale.to(torch.float8_e4m3fn)
+
+        # Take quantized weights and pack them efficiently.
+        wq = pack_int4(wq)
+
+        # Finally pack weights and scales into efficient preshuffled format.
+        wq, group_scale = torch.ops.fbgemm.preshuffle_i4(wq, group_scale)
+
+        return wq, row_scale, group_scale
+
+    if w.ndim >= 3:
+        orig_shape = w.shape
+        # Flatten to 3 dimensions then iterate over batches.
+        w = w.view(-1, *w.shape[1:])
+        w.unbind(dim=0)
+        wq = []
+        row_scale = []
+        group_scale = []
+        for batch in w:
+            wq_, row_scale_, group_scale_ = _quantize(batch)
+            wq.append(wq_)
+            row_scale.append(row_scale_)
+            group_scale.append(group_scale_)
+        wq = torch.stack(wq).view(*orig_shape[:-2], *wq[0].shape)
+        row_scale = torch.stack(row_scale).view(*orig_shape[:-2], *row_scale[0].shape)
+        group_scale = torch.stack(group_scale).view(
+            *orig_shape[:-2], *group_scale[0].shape
+        )
+    else:
+        wq, row_scale, group_scale = _quantize(w)
+    return wq, row_scale, group_scale
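A small shape check for the two lower-level helpers, as a sketch; the sizes are arbitrary, and the import path assumes the new module is importable as fbgemm_gpu.experimental.gen_ai.quantize, matching the import added in quantize_ops.py above:

import torch

# Assumed import path for the new module.
from fbgemm_gpu.experimental.gen_ai.quantize import int4_row_quantize, pack_int4

N, K, group_size = 16, 256, 128
w = torch.randn(N, K, dtype=torch.bfloat16)

# Groupwise symmetric quantization: values land in [-8, 7] and every group of
# 128 elements along K shares one scale.
wq_int8, group_scale = int4_row_quantize(w, group_size)
assert wq_int8.shape == (N, K) and wq_int8.dtype == torch.int8
assert group_scale.shape == (K // group_size, N)

# Packing halves the last dimension: element 2*i fills the low nibble and
# element 2*i + 1 the high nibble of a single int8.
wq_packed = pack_int4(wq_int8)
assert wq_packed.shape == (N, K // 2)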

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8i4bf16_rowwise.cu

Lines changed: 4 additions & 0 deletions
@@ -47,6 +47,10 @@ at::Tensor f8i4bf16_rowwise_impl(

   int group_size = K / num_groups;

+  // Return immediately if input is empty.
+  if (M == 0 || N == 0 || K == 0) {
+    return at::zeros({M, N}, XQ.options().dtype(at::kBFloat16));
+  }
   auto Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));

   using ElementInputA = INPUT_DTYPE;

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8i4bf16_shuffled.cu

Lines changed: 51 additions & 28 deletions
@@ -22,8 +22,6 @@
 #include "cutlass/util/mixed_dtype_utils.hpp"
 #include "cutlass/util/packed_stride.hpp"

-#include "cutlass_extensions/include/kernel_mode.h"
-
 namespace fbgemm_gpu {

 #if CUDART_VERSION >= 12000
@@ -34,19 +32,14 @@ at::Tensor _f8i4bf16_shuffled(
     at::Tensor WQ,
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor w_scale_group) {
+    at::Tensor w_scale_group,
+    at::Tensor Y) {
   // Get shape information from input tensors.
-  int M = XQ.size(0);
-  int K = XQ.size(1);
-  int N = WQ.size(0);
-  // Make sure w_scale_group is in proper format.
-  TORCH_CHECK(
-      w_scale_group.size(1) == 8,
-      "Weights and group scales must be prepacked with preshuffle_i4.");
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   int num_groups = w_scale_group.size(0);
   int group_size = K / num_groups;
-  // Allocate output.
-  at::Tensor Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));

   // Define input types.
   using MmaType = cutlass::float_e4m3_t;
@@ -273,56 +266,86 @@ at::Tensor f8i4bf16_shuffled(
     at::Tensor x_scale,
     at::Tensor w_scale,
     at::Tensor w_scale_group) {
-  int M = XQ.size(0);
-  int K = XQ.size(1);
-  int N = WQ.size(0);
+  int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
+  int K = XQ.size(-1);
+  int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
+  // Check input types and shapes.
+  TORCH_CHECK(
+      XQ.is_cuda() && XQ.is_contiguous() && XQ.dtype() == at::kFloat8_e4m3fn,
+      "XQ must be FP8 and contiguous on GPU.");
+  TORCH_CHECK(
+      WQ.size(-1) == K / 2 && WQ.is_cuda() && WQ.is_contiguous() &&
+          WQ.dtype() == at::kChar,
+      "WQ should be int8 (which represent two int4 values), have shape [..., N, K/2], "
+      "and be contiguous on GPU.");
+  TORCH_CHECK(
+      x_scale.numel() == M && x_scale.dtype() == at::kFloat &&
+          x_scale.is_cuda(),
+      "x_scale must be fp32 and have M total elements.");
+  TORCH_CHECK(
+      w_scale.numel() == N && w_scale.dtype() == at::kFloat &&
+          w_scale.is_cuda(),
+      "Weight row scale should have N elements and be on GPU.");
+  // Make sure w_scale_group is in proper format.
+  TORCH_CHECK(
+      w_scale_group.dtype() == at::kFloat8_e4m3fn && w_scale_group.dim() == 3 &&
+          w_scale_group.size(1) == 8 && w_scale_group.size(2) == N,
+      "Weights and group scales must be prepacked with preshuffle_i4. "
+      "Group scales are expected to be FP8 and have shape [num_groups, 8, N].");
+
+  // Allocate output or return an empty tensor if input is empty.
+  if (M == 0 || N == 0 || K == 0) {
+    return at::zeros({M, N}, XQ.options().dtype(at::kBFloat16));
+  }
+  at::Tensor Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));
+
   // Use shape heuristics to dispatch to optimized kernel configuration.
   if (M <= 16) {
     return _f8i4bf16_shuffled<64, 16, 2, 1, 1, false>(
-        XQ, WQ, x_scale, w_scale, w_scale_group);
+        XQ, WQ, x_scale, w_scale, w_scale_group, Y);
   } else if (M <= 32) {
     return _f8i4bf16_shuffled<64, 32, 2, 1, 1, false>(
-        XQ, WQ, x_scale, w_scale, w_scale_group);
+        XQ, WQ, x_scale, w_scale, w_scale_group, Y);
   } else if (M <= 64) {
     return _f8i4bf16_shuffled<64, 64, 2, 1, 1, false>(
-        XQ, WQ, x_scale, w_scale, w_scale_group);
+        XQ, WQ, x_scale, w_scale, w_scale_group, Y);
   } else if (M <= 128) {
     return _f8i4bf16_shuffled<64, 128, 2, 1, 1, false>(
-        XQ, WQ, x_scale, w_scale, w_scale_group);
+        XQ, WQ, x_scale, w_scale, w_scale_group, Y);
   } else if (M <= 256) {
     if (N <= 4096) {
       return _f8i4bf16_shuffled<64, 128, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     } else {
       return _f8i4bf16_shuffled<64, 256, 1, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     }
   } else if (M <= 512) {
    if (N <= 4096) {
       return _f8i4bf16_shuffled<64, 256, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     } else {
       return _f8i4bf16_shuffled<128, 256, 2, 1, 1, true>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     }
   } else if (M <= 1024) {
     if (N <= 1024) {
       return _f8i4bf16_shuffled<64, 128, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     } else if (N <= 2048) {
       return _f8i4bf16_shuffled<64, 256, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     } else {
       return _f8i4bf16_shuffled<128, 256, 2, 1, 1, true>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     }
   } else {
     if (N <= 1024) {
       return _f8i4bf16_shuffled<64, 256, 2, 1, 1, false>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     } else {
       return _f8i4bf16_shuffled<128, 256, 2, 1, 1, true>(
-          XQ, WQ, x_scale, w_scale, w_scale_group);
+          XQ, WQ, x_scale, w_scale, w_scale_group, Y);
     }
   }
 }
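The new TORCH_CHECKs pin down the operator's call contract. As an illustration only, here is a Python-side mirror of those checks; the function name check_f8i4bf16_shuffled_args is mine, not part of the change:

import math

import torch


def check_f8i4bf16_shuffled_args(xq, wq, x_scale, w_scale, w_scale_group):
    # Python-side mirror of the checks added above; the authoritative versions
    # are the TORCH_CHECKs in the CUDA host code.
    m = math.prod(xq.shape[:-1])  # size_to_dim_(XQ.dim() - 1, XQ.sizes())
    k = xq.shape[-1]
    n = math.prod(wq.shape[:-1])
    assert xq.is_cuda and xq.is_contiguous() and xq.dtype == torch.float8_e4m3fn
    assert wq.is_cuda and wq.is_contiguous() and wq.dtype == torch.int8
    assert wq.shape[-1] == k // 2  # two int4 values packed per int8 element
    assert x_scale.is_cuda and x_scale.dtype == torch.float32 and x_scale.numel() == m
    assert w_scale.is_cuda and w_scale.dtype == torch.float32 and w_scale.numel() == n
    assert w_scale_group.dtype == torch.float8_e4m3fn and w_scale_group.dim() == 3
    assert w_scale_group.shape[1] == 8 and w_scale_group.shape[2] == n
    # With M, N, or K equal to zero the op now returns a zero-filled
    # bf16 [M, N] tensor instead of launching the kernel.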
