q10
diff --git a/‎.github/scripts/fbgemm_gpu_test.bash
Lines changed: 14 additions & 7 deletions b/‎.github/scripts/fbgemm_gpu_test.bash
Lines changed: 14 additions & 7 deletions
diff --git a/‎fbgemm_gpu/test/failures_dict.json
Lines changed: 0 additions & 31 deletions b/‎fbgemm_gpu/test/failures_dict.json
Lines changed: 0 additions & 31 deletions
diff --git a/‎fbgemm_gpu/test/failures_dict_fast.json
Lines changed: 0 additions & 78 deletions b/‎fbgemm_gpu/test/failures_dict_fast.json
Lines changed: 0 additions & 78 deletions
diff --git a/‎fbgemm_gpu/test/quantize/bfloat16_test.py
Lines changed: 156 additions & 0 deletions b/‎fbgemm_gpu/test/quantize/bfloat16_test.py
Lines changed: 156 additions & 0 deletions
diff --git a/‎fbgemm_gpu/test/quantize/failures_dict.json
Lines changed: 37 additions & 0 deletions b/‎fbgemm_gpu/test/quantize/failures_dict.json
Lines changed: 37 additions & 0 deletions
@@ -35,9 +35,13 @@ run_python_test () {
   if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
     echo ""
+    echo ""
+    echo ""
   else
     echo "[TEST] Python test suite FAILED: ${python_test_file}"
     echo ""
+    echo ""
+    echo ""
     return 1
   fi
 }
@@ -80,20 +84,20 @@ run_fbgemm_gpu_tests () {
 
   # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
   local files_to_skip=(
-    test_utils.py
-    split_table_batched_embeddings_test.py
-    ssd_split_table_batched_embeddings_test.py
+    ./test_utils.py
+    ./split_table_batched_embeddings_test.py
+    ./ssd_split_table_batched_embeddings_test.py
   )
 
   if [ "$fbgemm_variant" == "cpu" ]; then
     # These are tests that are currently broken in FBGEMM_GPU-CPU
     local ignored_tests=(
-      uvm_test.py
+      ./uvm_test.py
     )
   elif [ "$fbgemm_variant" == "rocm" ]; then
     # https://github.com/pytorch/FBGEMM/issues/1559
     local ignored_tests=(
-      batched_unary_embeddings_test.py
+      ./batched_unary_embeddings_test.py
     )
   else
     local ignored_tests=()
@@ -108,11 +112,14 @@ run_fbgemm_gpu_tests () {
   (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
 
   echo "[TEST] Enumerating test files ..."
-  print_exec ls -lth ./*.py
+  # shellcheck disable=SC2155
+  local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
+  for f in $all_test_files; do echo "$f"; done
+  echo ""
 
   # NOTE: Tests running on single CPU core with a less powerful testing GPU in
   # GHA can take up to 5 hours.
-  for test_file in *.py; do
+  for test_file in $all_test_files; do
     if echo "${files_to_skip[@]}" | grep "${test_file}"; then
       echo "[TEST] Skipping test file known to be broken: ${test_file}"
     elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
 
@@ -2,37 +2,6 @@
   "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit",
   "_version": 1,
   "data": {
-    "fbgemm::FP8RowwiseQuantizedToFloat": {},
-    "fbgemm::FloatToFP8RowwiseQuantized": {
-      "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_fp8_rowwise": {
-        "comment": "",
-        "status": "xsuccess"
-      },
-      "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_fp8_rowwise": {
-        "comment": "",
-        "status": "xsuccess"
-      }
-    },
-    "fbgemm::FloatToPaddedFP8RowwiseQuantized": {
-      "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::PaddedFP8RowwiseQuantizedToFloat": {
-      "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
     "fbgemm::asynchronous_complete_cumsum": {},
     "fbgemm::asynchronous_exclusive_cumsum": {},
     "fbgemm::asynchronous_inclusive_cumsum": {},
 
@@ -2,84 +2,6 @@
   "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit",
   "_version": 1,
   "data": {
-    "fbgemm::FloatToFused8BitRowwiseQuantized": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_cpu_int8": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_no_cache_int8": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_uvm_cache_int8": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::FloatToHFP8Quantized": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache_fp8_2048": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::Fused8BitRowwiseQuantizedToFloat": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_cpu_int8": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_no_cache_int8": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_gpu_uvm_cache_int8": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
-    "fbgemm::HFP8QuantizedToFloat": {
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": {
-        "comment": "",
-        "status": "xfail"
-      },
-      "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache_fp8_2048": {
-        "comment": "",
-        "status": "xfail"
-      }
-    },
     "fbgemm::asynchronous_complete_cumsum": {},
     "fbgemm::bounds_check_indices": {},
     "fbgemm::dense_embedding_codegen_lookup_function": {
 
@@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from ctypes import c_float, c_int32, cast, POINTER, pointer
+from typing import Tuple
+
+import hypothesis.strategies as st
+import numpy as np
+import torch
+from hypothesis import given, HealthCheck, settings
+
+
+# Try importing the `open_source` marker from fbgemm_gpu. If that import fails
+# for any reason, fall back to loading the operator libraries explicitly by
+# target name.
+try:
+    # pyre-ignore[21]
+    from fbgemm_gpu import open_source  # noqa: F401
+
+except Exception:
+    # Choose the HIP or the default sparse_ops library depending on whether
+    # torch reports a HIP version.
+    if torch.version.hip:
+        torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_hip")
+    else:
+        torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+
+    # The CPU operator library is loaded on this fallback path in either case.
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class SparseNNOperatorsGPUTest(unittest.TestCase):
+    """Round-trip check for the FBGEMM bfloat16 quantize/dequantize ops."""
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    #  `hypothesis.strategies.sampled_from(["BF16"])` to decorator factory
+    #  `hypothesis.given`.
+    @given(
+        precision=st.just("BF16"),
+        batch_size=st.integers(min_value=1, max_value=256),
+        k=st.integers(min_value=2, max_value=2),
+        n=st.integers(min_value=2, max_value=2),
+    )
+    def test_dense_mlp_quantize_ops(
+        self, precision: str, batch_size: int, k: int, n: int
+    ) -> None:
+        """
+        Quantize a random (n, k) float32 tensor to bfloat16 and back, then
+        check that the result matches the input within rtol = atol = 1e-2.
+
+        `batch_size` is drawn by hypothesis but is not used in the body.
+        """
+        # Only the "BF16" precision has a code path here.
+        if precision != "BF16":
+            return
+
+        original = torch.rand((n, k), dtype=torch.float32)
+        round_tripped = torch.ops.fbgemm.Bfloat16QuantizedToFloat(
+            torch.ops.fbgemm.FloatToBfloat16Quantized(original)
+        )
+        torch.testing.assert_close(round_tripped, original, rtol=1e-2, atol=1e-2)
+
+
+def bfloat_quantize(x_float: float) -> np.uint16:
+    """
+    Convert a float to its bfloat16 bit pattern.
+
+    The value is narrowed to a C float and reinterpreted as its 32-bit
+    pattern. Adding ``1 << 15`` before dropping the low 16 bits rounds to the
+    nearest representable bfloat16 value; the upper 16 bits are returned.
+
+    Args:
+        x_float: value to convert.
+
+    Returns:
+        The 16-bit bfloat16 pattern as a ``np.uint16``.
+    """
+    bits = cast(pointer(c_float(x_float)), POINTER(c_int32)).contents.value
+    bits += 1 << 15
+    # c_int32 yields a negative Python int when the sign bit is set, and the
+    # arithmetic right shift keeps it negative. Mask to 16 bits so np.uint16
+    # always receives an in-range value; without the mask, negative inputs
+    # raise OverflowError on NumPy >= 2.
+    bits = (bits >> 16) & 0xFFFF
+    return np.uint16(bits)
+
+
+def bfloat_dequantize(x_bfloat: np.uint16) -> float:
+    """
+    Expand a bfloat16 bit pattern to a Python float.
+
+    The 16-bit value is widened to int32 and shifted into the upper half of
+    the word (the lower half is zero); that 32-bit pattern is then
+    reinterpreted as a C float.
+
+    Args:
+        x_bfloat: bfloat16 bit pattern.
+
+    Returns:
+        The float whose 32-bit pattern has ``x_bfloat`` as its upper 16 bits.
+    """
+    widened = np.int32(x_bfloat) << 16
+    int_cell = c_int32(widened)
+    float_view = cast(pointer(int_cell), POINTER(c_float))
+    return float_view.contents.value
+
+
+class TestBfloat16QuantizationConversion(unittest.TestCase):
+    """
+    Checks the FBGEMM bfloat16 quantize/dequantize operators against the
+    Python reference helpers `bfloat_quantize` / `bfloat_dequantize`, and,
+    when CUDA is available, checks the GPU results as well.
+    """
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    #  `hypothesis.strategies.integers($parameter$min_value = 0, $parameter$max_value =
+    #  100)` to decorator factory `hypothesis.given`.
+    @given(
+        nrows=st.integers(min_value=0, max_value=100),
+        ncols=st.integers(min_value=0, max_value=100),
+    )
+    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    def test_quantize_op(self, nrows: int, ncols: int) -> None:
+        """
+        Quantize a random nrows x ncols float tensor and compare the output,
+        viewed as uint16, with `bfloat_quantize` applied elementwise.
+
+        For an empty shape only the emptiness of the output is checked. When
+        CUDA is available the same comparison is repeated on the GPU result.
+        """
+        input_data = torch.rand(nrows, ncols).float()
+        quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
+        if nrows == 0 or ncols == 0:
+            assert quantized_data.numel() == 0
+            return
+        # Elementwise Python reference for the expected bit patterns.
+        f = np.vectorize(lambda x: bfloat_quantize(x))
+        reference = f(input_data.numpy())
+        quantized_data_uint16 = quantized_data.numpy()
+        # Reinterpret the op's output in place as uint16 so it can be compared
+        # with the reference (presumably the op returns a 16-bit integer
+        # tensor -- TODO confirm).
+        quantized_data_uint16.dtype = np.uint16
+        np.testing.assert_array_almost_equal(quantized_data_uint16, reference)
+
+        if torch.cuda.is_available():
+            input_data_gpu = input_data.cuda()
+            quantized_data_gpu = torch.ops.fbgemm.FloatToBfloat16Quantized(
+                input_data_gpu
+            )
+            quantized_data_numpy = quantized_data_gpu.cpu().numpy()
+            # Same in-place uint16 reinterpretation as for the CPU result.
+            quantized_data_numpy.dtype = np.uint16
+            np.testing.assert_allclose(quantized_data_numpy, reference)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    #  `hypothesis.strategies.integers($parameter$min_value = 0, $parameter$max_value =
+    #  100)` to decorator factory `hypothesis.given`.
+    @given(
+        nrows=st.integers(min_value=0, max_value=100),
+        ncols=st.integers(min_value=0, max_value=100),
+    )
+    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    def test_quantize_and_dequantize_op(self, nrows: int, ncols: int) -> None:
+        """
+        Round-trip a random nrows x ncols float tensor through the quantize
+        and dequantize ops and compare with the Python reference round trip.
+
+        For an empty shape only the emptiness of the output is checked. When
+        CUDA is available the GPU round trip is compared with the same
+        reference.
+        """
+        input_data = torch.rand(nrows, ncols).float()
+        quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
+        dequantized_data = torch.ops.fbgemm.Bfloat16QuantizedToFloat(quantized_data)
+        if nrows == 0 or ncols == 0:
+            assert dequantized_data.numel() == 0
+            return
+        # Reference: quantize then dequantize every element in Python.
+        f = np.vectorize(lambda x: bfloat_quantize(x))
+        ref_bfloat16 = f(input_data.numpy())
+        f = np.vectorize(lambda x: bfloat_dequantize(x))
+        ref_fp32 = torch.from_numpy(f(ref_bfloat16)).float()
+        torch.testing.assert_close(dequantized_data, ref_fp32)
+
+        if torch.cuda.is_available():
+            input_data_gpu = input_data.cuda()
+            quantized_data_gpu = torch.ops.fbgemm.FloatToBfloat16Quantized(
+                input_data_gpu
+            )
+            dequantized_data_gpu = torch.ops.fbgemm.Bfloat16QuantizedToFloat(
+                quantized_data_gpu
+            )
+            # Compare the GPU round-trip result against the Python reference.
+            torch.testing.assert_close(dequantized_data_gpu.cpu(), ref_fp32)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    #  `hypothesis.strategies.sampled_from([(65540, 256), (256, 65540)])` to decorator
+    #  factory `hypothesis.given`.
+    @given(
+        ncols_nrows=st.sampled_from([(65540, 256), (256, 65540)]),
+    )
+    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    def test_quantize_and_dequantize_op_cuda_large_nrows_bf16(
+        self, ncols_nrows: Tuple[int, int]
+    ) -> None:
+        """
+        Run the quantize/dequantize round trip on CPU and on GPU for tensors
+        with one dimension of 65540 and the other of 256, and check that the
+        two results agree.
+
+        The sampled tuple is ordered (ncols, nrows); the tensor is created as
+        (nrows, ncols).
+        """
+        ncols, nrows = ncols_nrows
+        input_data = torch.rand(nrows, ncols).float()
+        quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
+        dequantized_data = torch.ops.fbgemm.Bfloat16QuantizedToFloat(quantized_data)
+
+        if torch.cuda.is_available():
+            input_data_gpu = input_data.cuda()
+            quantized_data_gpu = torch.ops.fbgemm.FloatToBfloat16Quantized(
+                input_data_gpu
+            )
+            dequantized_data_gpu = torch.ops.fbgemm.Bfloat16QuantizedToFloat(
+                quantized_data_gpu
+            )
+            # Compare the GPU round-trip result against the CPU round-trip result.
+            torch.testing.assert_close(dequantized_data_gpu.cpu(), dequantized_data)
+
+
+# Run the test cases with unittest's runner when this file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
@@ -0,0 +1,37 @@
+{
+  "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit",
+  "_version": 1,
+  "data": {
+    "fbgemm::FP8RowwiseQuantizedToFloat": {},
+    "fbgemm::FloatToFP8RowwiseQuantized": {
+      "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_fp8_rowwise": {
+        "comment": "",
+        "status": "xsuccess"
+      },
+      "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_fp8_rowwise": {
+        "comment": "",
+        "status": "xsuccess"
+      }
+    },
+    "fbgemm::FloatToPaddedFP8RowwiseQuantized": {
+      "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
+        "comment": "",
+        "status": "xfail"
+      },
+      "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
+        "comment": "",
+        "status": "xfail"
+      }
+    },
+    "fbgemm::PaddedFP8RowwiseQuantizedToFloat": {
+      "TestFP8RowwiseQuantizationConversion.test_aot_dispatch_dynamic__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
+        "comment": "",
+        "status": "xfail"
+      },
+      "TestFP8RowwiseQuantizationConversion.test_faketensor__test_quantize_and_dequantize_op_padded_fp8_rowwise": {
+        "comment": "",
+        "status": "xfail"
+      }
+    }
+  }
+}