
Commit 8de5079

jwfromm authored and facebook-github-bot committed
Optimize MX4 padding to minimize need for tuning (pytorch#3040)
Summary:

X-link: facebookresearch/FBGEMM#137
Pull Request resolved: pytorch#3040

D61447274 introduced a very cool way of doing 2D indexing over input tensors during MX4 quantization; however, it is fairly reliant on tuning configurations to get good performance. It turns out the MX4 use case has highly dynamic shapes, so we spend a huge amount of time tuning for those shapes.

After deep meditation I realized there's a much simpler indexing scheme we can use, similar to the 1D accesses we used previously but with shifts added for padding. With this approach we should get the best of both worlds: support for padding rows not divisible by group size, and minimal tuning while maintaining good performance.

After further experimentation, we can actually remove tuning entirely and just use a reasonably large `GROUP_LOAD`. This gives good performance across all shapes and removes any chance of tuning overhead. Empirically, `GROUP_LOAD=64` seems to be the sweet spot.

Differential Revision: D61816830
1 parent e31151e commit 8de5079
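As a rough illustration of the shifted 1D indexing the summary describes (hypothetical helper name and plain-Python arithmetic; the actual kernel does this inside Triton), each flat input index is offset by the padding accumulated from all preceding rows, so every row begins on a group boundary:

```python
def padded_offset(flat_idx: int, row_size: int, group_size: int) -> int:
    """Map a flat index over the unpadded input to its position in the
    padded output, where each row is padded up to a multiple of group_size.

    Hypothetical sketch of the indexing scheme, not FBGEMM's actual kernel.
    """
    # Round each row's length up to the next multiple of group_size.
    padded_row_size = ((row_size + group_size - 1) // group_size) * group_size
    row, col = divmod(flat_idx, row_size)
    # Shift by the total padding inserted at the end of all earlier rows.
    return row * padded_row_size + col
```

For example, with `row_size=5` and `group_size=4`, flat index 5 (row 1, column 0) lands at padded position 8, the start of the second padded row, without any 2D index computation in the hot loop.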

File tree

4 files changed: +155 −210 lines


fbgemm_gpu/fbgemm_gpu/triton/common.py

Lines changed: 22 additions & 19 deletions
@@ -8,6 +8,26 @@
 # pyre-unsafe
 from enum import IntEnum
 
+import torch
+
+
+# LUTS need to be allocated ahead of time and copied to GPU to avoid expensive copies later.
+if torch.version.cuda:
+    lut_device = "cuda"
+else:
+    lut_device = "cpu"
+
+E2M1_LUT = torch.tensor(
+    [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6],
+    dtype=torch.float32,
+    device=lut_device,
+)
+E3M0_LUT = torch.tensor(
+    [0, 0.25, 0.5, 1, 2, 4, 8, 16, -0, -0.25, -0.5, -1, -2, -4, -8, -16],
+    dtype=torch.float32,
+    device=lut_device,
+)
+
 
 class RoundingMode(IntEnum):
     """Rounding options for quantization."""
@@ -47,26 +67,9 @@ def get_mx4_lookup_table(ebits, mbits):
         The lookup table for the specified mx4 format.
     """
     if ebits == 2 and mbits == 1:
-        return [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6]
+        return E2M1_LUT
     elif ebits == 3 and mbits == 0:
-        return [
-            0,
-            0.25,
-            0.5,
-            1,
-            2,
-            4,
-            8,
-            16,
-            -0,
-            -0.25,
-            -0.5,
-            -1,
-            -2,
-            -4,
-            -8,
-            -16,
-        ]
+        return E3M0_LUT
    else:
        raise NotImplementedError(
            f"MX4 with ebits={ebits} and mbits={mbits} not supported."
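The E2M1 entries in the new LUT follow directly from the format's bit layout (1 sign bit, 2 exponent bits, 1 mantissa bit). A pure-Python sketch (a hypothetical helper for illustration, not part of fbgemm_gpu, which uses the precomputed tensor LUTs above) that regenerates the table:

```python
def e2m1_value(code: int) -> float:
    """Decode a 4-bit E2M1 code: bit 3 = sign, bits 2..1 = exponent, bit 0 = mantissa.

    Hypothetical helper showing where the LUT entries come from.
    """
    sign = -1.0 if code & 0b1000 else 1.0
    exp = (code >> 1) & 0b11
    mantissa = code & 0b1
    if exp == 0:
        # Subnormal: no implicit leading 1, so the value is mantissa * 0.5.
        return sign * mantissa * 0.5
    # Normal: implicit leading 1, exponent bias of 1.
    return sign * (1.0 + 0.5 * mantissa) * 2.0 ** (exp - 1)

# [e2m1_value(c) for c in range(16)] reproduces the E2M1_LUT values:
# [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6]
```

Precomputing these 16 values as a tensor on the target device, as the diff does, avoids rebuilding (and re-copying) the table on every quantization call.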
