
Commit ee4c88b

jwfromm authored and facebook-github-bot committed

Enable rowwise scaling for DeepGemm (pytorch#3874)
Summary:
X-link: facebookresearch/FBGEMM#964
Pull Request resolved: pytorch#3874

This diff adds [ngimel's support for DeepGemm rowwise scaling](https://github.com/ngimel/DeepGEMM/tree/rowwise) to our fbcode copy. It also includes a few DeepGemm updates that allow operation on M < 128, which is important for any real use case. Rowwise scaling improves performance considerably, especially in memory-bound cases. Notably, this makes DeepGemm the premier solution for slow accumulation, as it now outperforms cublas + unfused rowwise scaling overall.

{F1976375307}

Reviewed By: jianyuh

Differential Revision: D71748927

fbshipit-source-id: 87e287a2cec284bd8fd7c5e80603065a0d662f53
1 parent a5f8150 · commit ee4c88b
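For orientation, the data flow behind the new op is: quantize activations and weights row-wise to FP8 (one scale per row), realign the activation scales into DeepGemm's column-major TMA-aligned layout, then run the FP8 GEMM with a bf16 output. Below is a minimal standalone sketch of that flow. The import paths are assumptions (the fbcode copy vendors DeepGemm, so the deep_gemm module name may differ), and the rowwise_scaling keyword on get_col_major_tma_aligned_tensor exists only in the rowwise branch this diff pulls in.

import torch
# Assumed import paths; adjust to wherever these helpers live in your tree.
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row
from deep_gemm import gemm_fp8_fp8_bf16_nt, get_col_major_tma_aligned_tensor

M, N, K = 256, 8192, 4096
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)  # activations
w = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)  # weights, N x K

# Row-wise FP8 quantization: one scale per row of x and one per row of w.
xq, x_scale = quantize_fp8_row(x)
wq, w_scale = quantize_fp8_row(w)

# DeepGemm wants activation scales in a column-major, TMA-aligned layout;
# rowwise_scaling=True is specific to the rowwise branch pulled in here.
x_scale = get_col_major_tma_aligned_tensor(x_scale, rowwise_scaling=True)

# NT layout: out = x @ w.T, written out in bf16.
out = torch.empty(M, N, device="cuda", dtype=torch.bfloat16)
gemm_fp8_fp8_bf16_nt((xq, x_scale), (wq, w_scale), out)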

File tree

1 file changed: +42 −0

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 42 additions & 0 deletions
@@ -875,6 +875,48 @@ def cuda(self) -> bool:
 
 
 
+@register_quantize_op
+class DeepGemmRowwise(QuantizeOpBase):
+    """
+    FP8 matmul with rowwise scaling implemented with DeepGemm.
+    """
+
+    def preprocess(self, x, w):
+        # Quantize weights.
+        wq, w_scale = quantize_fp8_row(w)
+        # Allocate output.
+        out = torch.empty(
+            x.shape[0], wq.shape[0], device=x.device, dtype=torch.bfloat16
+        )
+        # Return processed tensors.
+        return x, wq, w_scale, out
+
+    def quantize(self, x, wq, w_scale, out):
+        xq, x_scale = quantize_fp8_row(x)
+        # Pretranspose scales to deepgemm format.
+        x_scale = get_col_major_tma_aligned_tensor(x_scale, rowwise_scaling=True)
+        return xq, wq, x_scale, w_scale, out
+
+    def compute(self, xq, wq, x_scale, w_scale, out):
+        gemm_fp8_fp8_bf16_nt((xq, x_scale), (wq, w_scale), out)
+        return out
+
+    def quantize_and_compute(self, x, wq, w_scale, out):
+        xq, wq, x_scale, w_scale, out = self.quantize(x, wq, w_scale, out)
+        return self.compute(xq, wq, x_scale, w_scale, out)
+
+    @property
+    def name(self) -> str:
+        return "deepgemm_rowwise"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
 class FP8StackedGroupedGemm(QuantizeOpBase):
     """
     FP8 grouped matmul with rowwise scaling and stacked inputs.
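To exercise the new op outside the benchmark harness, a hypothetical driver might chain the three methods the same way the harness does. This is a sketch, not a verified script: it assumes DeepGemmRowwise can be constructed with no arguments (the diff does not show the base-class constructor) and an SM90 GPU, since DeepGemm targets Hopper.

import torch
from fbgemm_gpu.experimental.gen_ai.bench.quantize_ops import DeepGemmRowwise

op = DeepGemmRowwise()  # assumed zero-arg construction
x = torch.randn(256, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.randn(8192, 4096, device="cuda", dtype=torch.bfloat16)

# preprocess quantizes weights once; quantize handles activations per call.
x, wq, w_scale, out = op.preprocess(x, w)
xq, wq, x_scale, w_scale, out = op.quantize(x, wq, w_scale, out)
y = op.compute(xq, wq, x_scale, w_scale, out)  # bf16 result, shape (256, 8192)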
