Skip to content

Commit d715a1b

Browse files
Jiyuan Zhang and facebook-github-bot
authored and committed
add dynamic quantize gemm benchmark [step 2: fp16->int8 quantize] (pytorch#2295)
Summary: Add an FX kernel benchmark for dynamic quantized gemm step 2; use the `quantize_step` parameter to differentiate the stages; use separate Net modules for step 2 vs. step 1. Differential Revision: D52136852
1 parent 2a7b3ab commit d715a1b

File tree

1 file changed

+26
-0
lines changed

1 file changed

+26
-0
lines changed

fbgemm_gpu/src/qlinear_channelwise/qlinear_channelwise_mtia.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,23 @@ static at::Tensor qlinear_channelwise(
2424
return x;
2525
}
2626

27+
static at::Tensor qlinear_quant(
28+
at::Tensor x,
29+
at::Tensor weight,
30+
at::Tensor bias,
31+
at::Tensor input_scale,
32+
at::Tensor weight_scale,
33+
at::Tensor weight_zero_point,
34+
at::Tensor relu) {
35+
assert(x.options().dtype() == at::kHalf);
36+
assert(weight.options().dtype() == at::kQInt8);
37+
assert(bias.options().dtype() == at::kFloat);
38+
assert(input_scale.options().dtype() == at::kFloat);
39+
assert(weight_scale.options().dtype() == at::kFloat);
40+
assert(weight_zero_point.options().dtype() == at::kQUInt8);
41+
return x;
42+
}
43+
2744
TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
2845
m.def(
2946
"qlinear_channelwise(Tensor x, Tensor weight, Tensor "
@@ -32,4 +49,13 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
3249
m.impl(
3350
"qlinear_channelwise",
3451
torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(qlinear_channelwise)));
52+
53+
m.def(
54+
"qlinear_quant(Tensor x, Tensor weight, Tensor "
55+
"bias, Tensor input_scale, Tensor weight_scale, Tensor "
56+
"weight_zero_point, Tensor relu) -> Tensor");
57+
58+
m.impl(
59+
"qlinear_quant",
60+
torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(qlinear_quant)));
3561
}

0 commit comments

Comments (0)