Skip to content

Commit fe42aed

Browse files
zjing14 authored and facebook-github-bot committed
A hotfix for FBGEMM fp8 rowwise with irregular gemm sizes
Summary: - Hotfix for T219165899 reported by pranavsh, which is caused by some instances requiring the K size to be a multiple of `KTile`. - Added a fallback for GEMM cases where K is not a multiple of the maximum KTile (256). Reviewed By: jianyuh Differential Revision: D71863248
1 parent 74db0ac commit fe42aed

File tree

2 files changed

+33
-5
lines changed

2 files changed

+33
-5
lines changed

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/fp8_rowwise_gemm.hip

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -422,8 +422,11 @@ RowwiseKernel rowwise_nk_lookup(int M, const NKLookupTableType& table) {
422422
RowwiseKernel rowwise_heuristic_dispatch(int M, int N, int K) {
423423
// Apply shape heuristics to find a suitable kernel implementation.
424424

425-
//Fallback of irregular data types
426-
if(!((N % 8 == 0) && (K % 16 == 0)))
425+
//Fallback for irregular data types: some instances require K to be a multiple
426+
//of K Tile.
427+
//To-Do: Need a systemic solution for various restrictions from different
428+
//instances.
429+
if(!((N % 8 == 0) && (K % 256 == 0)))
427430
return fp8_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x4x1x16_4x4x1_1x1_intrawave_v1;
428431

429432
if (M < 64 && N < 2048 && K < 2048) {

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x4x1x16_4x4x1_1x1_intrawave_v1.hip

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ fp8_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x4x1x16_4x4x1_1x1_intrawave_v1
2020
int N = WQ.size(0);
2121
int K = WQ.size(1);
2222

23-
if ((K % 16 == 0) && (N % 4 == 0)) {
23+
if ((K % 256 == 0) && (N % 4 == 0)) {
2424
using DeviceGemmInstance = DeviceGemmHelper<
2525
64,
2626
16,
@@ -42,6 +42,31 @@ fp8_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x4x1x16_4x4x1_1x1_intrawave_v1
4242
16,
4343
16>;
4444

45+
// Run kernel instance.
46+
return f8f8bf16_rowwise_impl<DeviceGemmInstance>(
47+
XQ, WQ, x_scale, w_scale, Y);
48+
} else if ((K % 16 == 0) && (N % 4 == 0)) {
49+
using DeviceGemmInstance = DeviceGemmHelper<
50+
64,
51+
16,
52+
16,
53+
256,
54+
16,
55+
16,
56+
1,
57+
1,
58+
S<16, 4, 1>,
59+
S<16, 4, 1>,
60+
S<1, 16, 1, 4>,
61+
S<4, 4, 1>,
62+
1,
63+
1,
64+
ck::BlockGemmPipelineScheduler::Intrawave,
65+
ck::BlockGemmPipelineVersion::v1,
66+
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
67+
16,
68+
16>;
69+
4570
// Run kernel instance.
4671
return f8f8bf16_rowwise_impl<DeviceGemmInstance>(
4772
XQ, WQ, x_scale, w_scale, Y);
@@ -63,7 +88,7 @@ fp8_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x4x1x16_4x4x1_1x1_intrawave_v1
6388
1,
6489
ck::BlockGemmPipelineScheduler::Intrawave,
6590
ck::BlockGemmPipelineVersion::v1,
66-
ck::tensor_operation::device::GemmSpecialization::Default,
91+
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
6792
8,
6893
8>;
6994

@@ -88,7 +113,7 @@ fp8_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x4x1x16_4x4x1_1x1_intrawave_v1
88113
1,
89114
ck::BlockGemmPipelineScheduler::Intrawave,
90115
ck::BlockGemmPipelineVersion::v1,
91-
ck::tensor_operation::device::GemmSpecialization::Default,
116+
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
92117
2,
93118
2>;
94119

0 commit comments

Comments (0)