@@ -1016,6 +1016,90 @@ def fused_nbit_rowwise_quantized_sb_half_to_float_or_half(
    )


+def fused_8_bit_rowwise_quantized_to_float_or_half(
+    input_t: Tensor,
+    output_dtype: int = 0,
+    scale_bias_last: bool = True,
+    quant_padding_float_type: bool = True,
+) -> Tensor:
+    torch._check(
+        output_dtype
+        in [
+            SparseType.FP32.as_int(),
+            SparseType.FP16.as_int(),
+            SparseType.BF16.as_int(),
+        ]
+    )
+    torch._check(quant_padding_float_type or not scale_bias_last)
+    torch._check(input_t.dim() >= 2)
+    last_dim = input_t.dim() - 1
+    output_shape = list(input_t.shape)
+    ncols = input_t.size(last_dim)
+    quant_padding_size = 4 if quant_padding_float_type else 2
+    ncols_aligned = (
+        (ncols + quant_padding_size - 1) // quant_padding_size * quant_padding_size
+    )
+    output_columns = ncols_aligned - 2 * quant_padding_size
+    output_shape[last_dim] = output_columns
+    if output_dtype == SparseType.FP32.as_int():
+        return torch.empty(output_shape, dtype=torch.float32, device=input_t.device)
+    elif output_dtype == SparseType.FP16.as_int():
+        return torch.empty(output_shape, dtype=torch.float16, device=input_t.device)
+    else:  # output_dtype is SparseType.BF16
+        return torch.empty(output_shape, dtype=torch.bfloat16, device=input_t.device)
+
+
+def float_or_half_to_fused_8_bit_rowwise(
+    input_t: Tensor,
+) -> Tensor:
+    torch._check(input_t.dim() >= 2)
+    last_dim = input_t.dim() - 1
+    output_shape = list(input_t.shape)
+    ncols = input_t.size(last_dim)
+    ncols_aligned = (ncols + 4 - 1) // 4 * 4
+    output_columns = ncols_aligned + 2 * 4
+    output_shape[last_dim] = output_columns
+    return torch.empty(output_shape, dtype=torch.uint8, device=input_t.device)
+
+
+def fused_8_bit_rowwise_quantized_to_float(
+    input_t: Tensor,
+    scale_bias_last: bool = True,
+    quant_padding_float_type: bool = True,
+) -> Tensor:
+    torch._check(quant_padding_float_type or not scale_bias_last)
+    torch._check(input_t.dim() >= 2)
+    last_dim = input_t.dim() - 1
+    output_shape = list(input_t.shape)
+    ncols = input_t.size(last_dim)
+    quant_padding_size = 4 if quant_padding_float_type else 2
+    ncols_aligned = (
+        (ncols + quant_padding_size - 1) // quant_padding_size * quant_padding_size
+    )
+    output_columns = ncols_aligned - 2 * quant_padding_size
+    output_shape[last_dim] = output_columns
+    return torch.empty(output_shape, dtype=torch.float32, device=input_t.device)
+
+
+def fused_8_bit_rowwise_quantized_to_half(
+    input_t: Tensor,
+    scale_bias_last: bool = True,
+    quant_padding_float_type: bool = True,
+) -> Tensor:
+    torch._check(quant_padding_float_type or not scale_bias_last)
+    torch._check(input_t.dim() >= 2)
+    last_dim = input_t.dim() - 1
+    output_shape = list(input_t.shape)
+    ncols = input_t.size(last_dim)
+    quant_padding_size = 4 if quant_padding_float_type else 2
+    ncols_aligned = (
+        (ncols + quant_padding_size - 1) // quant_padding_size * quant_padding_size
+    )
+    output_columns = ncols_aligned - 2 * quant_padding_size
+    output_shape[last_dim] = output_columns
+    return torch.empty(output_shape, dtype=torch.float16, device=input_t.device)
+
+
def _setup() -> None:
    # pyre-ignore[16]
    _setup.done = getattr(_setup, "done", False)
@@ -1165,7 +1249,30 @@ def impl_autograd(op_name, fn, setup_context: Optional[Callable] = None) -> None
        "fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf",
        fused_nbit_rowwise_quantized_sb_half_to_float_or_half,
    )
-
+    impl_abstract(
+        "fbgemm::Fused8BitRowwiseQuantizedToFloatOrHalf",
+        fused_8_bit_rowwise_quantized_to_float_or_half,
+    )
+    impl_abstract(
+        "fbgemm::FloatToFused8BitRowwiseQuantized",
+        float_or_half_to_fused_8_bit_rowwise,
+    )
+    impl_abstract(
+        "fbgemm::FloatOrHalfToFused8BitRowwiseQuantized",
+        float_or_half_to_fused_8_bit_rowwise,
+    )
+    impl_abstract(
+        "fbgemm::HalfToFused8BitRowwiseQuantized",
+        float_or_half_to_fused_8_bit_rowwise,
+    )
+    impl_abstract(
+        "fbgemm::Fused8BitRowwiseQuantizedToFloat",
+        fused_8_bit_rowwise_quantized_to_float,
+    )
+    impl_abstract(
+        "fbgemm::Fused8BitRowwiseQuantizedToHalf",
+        fused_8_bit_rowwise_quantized_to_half,
+    )
    _setup.done = True

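For reference, the shape arithmetic these meta functions encode: each quantized row stores a uint8 payload padded to a multiple of `quant_padding_size` bytes, followed by a scale and a bias of `quant_padding_size` bytes each (the `torch._check(quant_padding_float_type or not scale_bias_last)` guard says that fp16-width scale/bias is only supported when scale and bias come first). A minimal standalone sketch of that arithmetic, with hypothetical helper names that are not part of fbgemm:

```python
def quantized_width(ncols: int) -> int:
    # Mirrors float_or_half_to_fused_8_bit_rowwise: pad the payload to a
    # multiple of 4 bytes, then append a 4-byte fp32 scale and bias.
    ncols_aligned = (ncols + 4 - 1) // 4 * 4
    return ncols_aligned + 2 * 4


def dequantized_width(ncols: int, quant_padding_float_type: bool = True) -> int:
    # Mirrors the *_to_float/half meta functions: strip the scale/bias bytes.
    pad = 4 if quant_padding_float_type else 2
    ncols_aligned = (ncols + pad - 1) // pad * pad
    return ncols_aligned - 2 * pad


# A 7-wide fp32 row quantizes to 16 uint8 columns (7 -> 8 after alignment,
# plus 8 bytes of scale/bias); dequantizing recovers the aligned width 8,
# not the original 7.
assert quantized_width(7) == 16
assert dequantized_width(quantized_width(7)) == 8
```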
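The registrations in the second hunk are what make these shape functions take effect: `impl_abstract` tells PyTorch how to compute output metadata for the fbgemm ops without running their kernels, so fake-tensor tracing (and hence `torch.compile`) can propagate shapes through them. Note that the Float, Half, and FloatOrHalf quantize ops all share `float_or_half_to_fused_8_bit_rowwise`, since they produce the same uint8 output layout. A hedged usage sketch, assuming an environment where `fbgemm_gpu` is installed so the ops and these registrations are loaded:

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Assumption: importing fbgemm_gpu loads the op definitions and triggers
# the abstract-impl registrations above.
import fbgemm_gpu  # noqa: F401

with FakeTensorMode():
    x = torch.empty(4, 7, dtype=torch.float32)
    # No real kernel runs here; the registered shape function computes the
    # output metadata: 4 rows x 16 uint8 columns for a 7-wide fp32 input.
    q = torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized(x)
    print(q.shape, q.dtype)  # torch.Size([4, 16]) torch.uint8
```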