Commit b5655fb

choutim authored and facebook-github-bot committed

FP8 Rowwise Dequant Kernel (pytorch#962)

Summary:
X-link: pytorch#3873
Pull Request resolved: facebookresearch/FBGEMM#962

Add Triton rowwise dequant kernel.

Reviewed By: jiawenliu64

Differential Revision: D71655200

fbshipit-source-id: 10b329b3da5fa3398145ec78ab98bdbd1402eafd
1 parent 9583884 commit b5655fb
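
For orientation, the rowwise dequantization added by this commit is a per-row rescale along the last axis: each row of the FP8 tensor is multiplied by its single dequantization scale to recover a BF16 approximation of the original values. A minimal eager-mode sketch of that math follows; the reference helper name is hypothetical, not part of this change, and it assumes x_scale holds one scale per row (as the kernel's indexing implies).

import torch

def dequantize_fp8_row_ref(xq: torch.Tensor, x_scale: torch.Tensor) -> torch.Tensor:
    # One scale per row (all leading dims flattened); broadcast it along the last axis.
    scale = x_scale.reshape(*xq.shape[:-1], 1)
    return (xq.to(torch.float32) * scale).to(torch.bfloat16)

The Triton kernel in this diff computes the same product, but tiles the work so that each program handles BLOCK_M rows and walks the K dimension in BLOCK_K-sized chunks.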

File tree

2 files changed (+135 lines, −0 lines)


fbgemm_gpu/experimental/gemm/test/fp8_gemm_test.py

Lines changed: 37 additions & 0 deletions
@@ -11,10 +11,12 @@
 from typing import Optional, Tuple
 
 import torch
+import triton
 
 if torch.cuda.is_available():
     from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
         dequantize_fp8_block,
+        dequantize_fp8_row,
         matmul_fp8_block,
         matmul_fp8_row,
         quantize_fp8_block,
@@ -136,6 +138,41 @@ def _test_quantize_fp8_row(
             use_jagged=True,
         )
 
+    def test_dequantize_fp8_row(self) -> None:
+        def _test_dequantize_fp8_row(
+            shape: Tuple[int, ...],
+        ) -> None:
+            a = torch.randn(shape, dtype=torch.bfloat16, device="cuda")
+            a_fp8, a_scale = quantize_fp8_row(
+                a,
+                use_triton=True,
+            )
+
+            # Undo scaling.
+            a_bf16 = dequantize_fp8_row(a_fp8, a_scale)
+
+            ms = triton.testing.do_bench(
+                lambda: dequantize_fp8_row(a_fp8, a_scale),
+            )
+            print(f"Shape: {a.shape} MS: {ms}")
+            torch.testing.assert_close(a_bf16, a, atol=2e-1, rtol=1e-1)
+            self.assertTrue(
+                torch.allclose(
+                    a,
+                    a_bf16,
+                    atol=2e-1,
+                    rtol=1e-1,
+                )
+            )
+
+        for n_col in [1, 100, 1000]:
+            _test_dequantize_fp8_row((2, n_col))
+        # Test with batched input.
+        _test_dequantize_fp8_row((4, 2, 3))
+        shapes = [(4, 2, 3), (6, 4, 2, 3), (2, 3), (20, 30)]
+        for shape in shapes:
+            _test_dequantize_fp8_row(shape)
+
     def test_scale_fp8_row(self) -> None:
         def _test_scale_fp8_row(
             shape: Tuple[int, int],
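
The loose tolerances (atol=2e-1, rtol=1e-1) are expected: the quantize/dequantize round trip passes through an 8-bit floating-point representation, so the recovered BF16 tensor only approximates the input. The triton.testing.do_bench call prints a rough per-shape latency for reference; nothing is asserted on it.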

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py

Lines changed: 98 additions & 0 deletions
@@ -3099,6 +3099,104 @@ def _kernel_matmul_fp8_row_non_persistent(
     tl.atomic_add(C, acc, mask=mask)
 
 
+@triton.autotune(
+    configs=[Config({"BLOCK_M": 16, "BLOCK_K": 512, "NUM_STAGES": 2})],
+    key=["M", "K"],
+)
+@triton.jit
+def _kernel_dequantize_fp8_row(
+    xq_ptr,
+    x_scale_ptr,
+    x_dequant_ptr,
+    M,
+    K,
+    stride_xm,
+    stride_xk,
+    stride_xdqm,
+    stride_xdqk,
+    BLOCK_M: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    NUM_STAGES: tl.constexpr,
+    USE_INT64: tl.constexpr,
+):
+    """
+    Kernel to dequantize FP8 tensor to BF16 tensor.
+    Args:
+        xq_ptr (tl.constexpr): Pointer to FP8 tensor.
+        x_scale_ptr (tl.constexpr): Pointer to FP8 scale tensor.
+        x_dequant_ptr (tl.constexpr): Pointer to BF16 tensor.
+        M (tl.constexpr): M dimension of input tensor.
+        K (tl.constexpr): K dimension of input tensor (along which scales are applied).
+        BLOCK_K (tl.constexpr): Block size for the K dimension.
+    """
+    pid = tl.program_id(axis=0)
+    if USE_INT64:
+        pid = pid.to(tl.int64)
+    offs_m = pid * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    scales = tl.load(x_scale_ptr + offs_m)
+
+    for _k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
+        mask = (offs_m[:, None] < M) & (offs_k[None, :] < K)
+        xq = tl.load(
+            xq_ptr + offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk,
+            mask=mask,
+        )
+        x_dq = xq * scales[:, None]
+        tl.store(
+            x_dequant_ptr
+            + offs_m[:, None] * stride_xdqm
+            + offs_k[None, :] * stride_xdqk,
+            x_dq,
+            mask=mask,
+        )
+        offs_k += BLOCK_K
+
+
+def dequantize_fp8_row(
+    xq: torch.Tensor,
+    x_scale: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Rowwise dequantize FP8 tensor to BF16 tensor along the last axis.
+
+    Args:
+        xq (torch.Tensor): FP8 tensor to be dequantized.
+        x_scale (torch.Tensor): FP8 scale tensor.
+
+    Returns:
+        torch.Tensor: Dequantized BF16 tensor.
+    """
+
+    assert (
+        xq.is_contiguous() and x_scale.is_contiguous()
+    ), "Input tensors must be contiguous"
+    x_dequant = torch.empty_like(xq, dtype=torch.bfloat16)
+
+    # Reshape to 2-d array keeping last dim only.
+    K = xq.shape[-1]
+    xq = xq.reshape(-1, K)
+    M = xq.shape[0]
+    use_int64 = xq.numel() > 2**31
+
+    def grid(meta):
+        return (triton.cdiv(M, meta["BLOCK_M"]),)
+
+    _kernel_dequantize_fp8_row[grid](
+        xq,
+        x_scale,
+        x_dequant,
+        M,
+        K,
+        xq.stride(0),
+        xq.stride(1),
+        xq.stride(0),  # Use squashed stride.
+        xq.stride(1),
+        USE_INT64=use_int64,
+    )
+    return x_dequant
+
+
 @triton.jit
 def _kernel_dequantize_fp8_block(
     xq_ptr,
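
Putting the two pieces together, a round trip with the new API looks roughly like the test above. This is a rough usage sketch, assuming a CUDA device and an FBGEMM build that exposes these experimental helpers:

import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
    dequantize_fp8_row,
    quantize_fp8_row,
)

x = torch.randn(4, 2, 3, dtype=torch.bfloat16, device="cuda")
xq, x_scale = quantize_fp8_row(x, use_triton=True)  # FP8 values plus one scale per row
x_dq = dequantize_fp8_row(xq, x_scale)              # BF16 output with the same shape as x
torch.testing.assert_close(x_dq, x, atol=2e-1, rtol=1e-1)

Note that dequantize_fp8_row asserts contiguous inputs and internally flattens all leading dimensions into M, launching one Triton program per BLOCK_M rows.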
