Skip to content

Commit 540db1f

Browse files
sryap authored and facebook-github-bot committed
Add a workaround for stochastic rounding for AMD GPUs (pytorch#997)
Summary: X-link: pytorch#3908 Pull Request resolved: facebookresearch/FBGEMM#997 This diff contains a workaround for the stochastic rounding issue for the AMD GPUs. Problem: `quantize_store` calls `nearest_rounding_vector` instead of `stochastic_rounding_vector` when stochastic rounding is used because the `StochasticRoundingRNGState` pointer is a nullptr (https://fburl.com/code/kna14icj) We found that the `WeightRow` constructor also gets a null `StochasticRoundingRNGState` pointer (https://fburl.com/code/vyq53lia) When `WeightRow` is instantiated, we confirm that `stochastic_rounding` is true. `WeightRow` should receive `&state`, but instead it receives a nullptr. (https://fburl.com/code/o3kxgt4z) We suspect that the compiler might have optimized out the `StochasticRoundingRNGState` since it is only passed to `WeightRow` and not utilized anywhere else in the caller kernel. Workaround: We move the `StochasticRoundingRNGState` storage inside the `WeightRow` struct and pass a boolean to the `WeightRow` constructor instead. Reviewed By: q10, yinbinm, jianyuh, xw285cornell, yoyoyocmu, joebos Differential Revision: D72201618 fbshipit-source-id: a2bc7f004ac5183c84eb0501ada6d848ebca17e1
1 parent b57ac02 commit 540db1f

File tree

5 files changed

+17
-19
lines changed

5 files changed

+17
-19
lines changed

fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,12 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
9797
}
9898
{%- endfor %}
9999

100-
StochasticRoundingRNGState state;
101100
auto weight_row_template =
102101
WeightRow<emb_t, cache_t, at::acc_type<cache_t, true>>(
103102
weights,
104103
cache_weights,
105104
D,
106-
stochastic_rounding ? &state : nullptr,
105+
stochastic_rounding,
107106
&stochastic_rounding_philox_args,
108107
threadIdx.x + run_id * blockDim.x);
109108

fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -120,22 +120,20 @@ struct WeightRow {
120120
: row_(row),
121121
cache_row_(cache_row),
122122
dim_(dim),
123-
stoc_rounding_state_(nullptr) {}
123+
stoc_rounding_state_ptr_(nullptr) {}
124124

125125
// Constructor for stochastic rounding
126126
DEVICE_INLINE WeightRow(
127127
emb_t* row,
128128
cache_t* cache_row,
129129
int dim,
130-
StochasticRoundingRNGState* stoc_rounding_state,
130+
bool stochastic_rounding,
131131
const at::PhiloxCudaState* stochastic_rounding_philox_args,
132132
const uint64_t salt_value)
133133
: row_(row), cache_row_(cache_row), dim_(dim) {
134-
// Set the internal stoc_rounding_state_
135-
stoc_rounding_state_ = stoc_rounding_state;
136-
134+
stoc_rounding_state_ptr_ = nullptr;
137135
if constexpr (!std::is_same_v<emb_t, float>) {
138-
if (stoc_rounding_state != nullptr) {
136+
if (stochastic_rounding) {
139137
const auto stochastic_rounding_seeds =
140138
at::cuda::philox::unpack(*stochastic_rounding_philox_args);
141139

@@ -145,15 +143,18 @@ struct WeightRow {
145143
// The salt value should be different for every *run* and every
146144
// *thread*.
147145
salt_value,
148-
stoc_rounding_state);
146+
&stoc_rounding_state_);
147+
// Store the pointer here to avoid an if-else cond during load/store
148+
stoc_rounding_state_ptr_ = &stoc_rounding_state_;
149149
}
150150
}
151151
}
152152

153153
emb_t* row_;
154154
cache_t* cache_row_;
155155
int dim_;
156-
StochasticRoundingRNGState* stoc_rounding_state_;
156+
StochasticRoundingRNGState stoc_rounding_state_;
157+
StochasticRoundingRNGState* stoc_rounding_state_ptr_;
157158

158159
// Load from cache if resident; else load from embedding
159160
DEVICE_INLINE Vec4T<dst_t> load(const int32_t d, const float2 qparams) const {
@@ -169,9 +170,9 @@ struct WeightRow {
169170
DEVICE_INLINE void
170171
store(const Vec4T<dst_t>& v, const int32_t d, const float2 qparams) {
171172
if (cache_row_) {
172-
quantize_store(cache_row_ + d, v, stoc_rounding_state_, qparams);
173+
quantize_store(cache_row_ + d, v, stoc_rounding_state_ptr_, qparams);
173174
} else {
174-
quantize_store(row_ + d, v, stoc_rounding_state_, qparams);
175+
quantize_store(row_ + d, v, stoc_rounding_state_ptr_, qparams);
175176
}
176177
}
177178

@@ -201,7 +202,7 @@ struct WeightRow {
201202
} else {
202203
// Does 2-step conversion: cache_t -> FP32 -> weight_t
203204
const auto cache_slice = load(d, qparams);
204-
quantize_store(row_ + d, cache_slice, stoc_rounding_state_, qparams);
205+
quantize_store(row_ + d, cache_slice, stoc_rounding_state_ptr_, qparams);
205206
}
206207
}
207208

@@ -236,7 +237,7 @@ struct WeightRow {
236237
// Does 2-step conversion: weight_t -> FP32 -> cache_t
237238
for (int32_t d = lane_id * 4; d < dim_length; d += num_lanes * 4) {
238239
const auto slice = load(d, qparams);
239-
quantize_store(dst_row + d, slice, stoc_rounding_state_, qparams);
240+
quantize_store(dst_row + d, slice, stoc_rounding_state_ptr_, qparams);
240241
}
241242
}
242243
}

fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,11 @@ __global__ __launch_bounds__(kCacheMaxThreads) void lfu_cache_insert_kernel(
116116
if constexpr (std::is_same_v<emb_t, uint8_t>) {
117117
D_emb += kINT8QparamsBytes;
118118
}
119-
StochasticRoundingRNGState state;
120119
auto weight_row = WeightRow<emb_t, cache_t, cache_t>(
121120
&weights[weights_offset_current + idx_current * D_emb + 0],
122121
&lxu_cache_weights[cache_set * kWarpSize + insert_slot][0],
123122
D_current,
124-
stochastic_rounding ? &state : nullptr,
123+
stochastic_rounding,
125124
&stochastic_rounding_philox_args,
126125
(blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
127126
threadIdx.x) *

fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,11 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel(
123123
D_emb += kINT8QparamsBytes;
124124
}
125125

126-
StochasticRoundingRNGState state;
127126
auto weight_row = WeightRow<emb_t, cache_t, cache_t>(
128127
&weights[weights_offset_current + idx_current * D_emb + 0],
129128
&lxu_cache_weights[cache_set * kWarpSize + insert_slot][0],
130129
D_current,
131-
stochastic_rounding ? &state : nullptr,
130+
stochastic_rounding,
132131
&stochastic_rounding_philox_args,
133132
stoc_rounding_salt + l);
134133

fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ __global__ __launch_bounds__(kMaxThreads) void lxu_cache_flush_kernel(
6060
&weights[weights_offset_current + idx_current * D_emb + 0],
6161
&lxu_cache_weights[b][0],
6262
D_current,
63-
stochastic_rounding ? &state : nullptr,
63+
stochastic_rounding,
6464
&stochastic_rounding_philox_args,
6565
blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
6666
threadIdx.x);

0 commit comments

Comments (0)