Fix CUDA kernel index data type in deeplearning/fbgemm/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu +10 #3845

Closed · wants to merge 1 commit
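
The diff below applies one mechanical pattern: explicitly typed CUDA thread/block index variables (int, int32_t) are replaced with auto, so each variable takes the unsigned type of the built-in index expression instead of narrowing it to a signed 32-bit integer. A minimal before/after sketch of the pattern, using a hypothetical scale_kernel that is not part of this PR:

// Before: blockIdx.x, blockDim.x, and threadIdx.x are unsigned int, so the
// product-and-sum is unsigned and is then implicitly converted to signed int.
__global__ void scale_kernel_before(float* data, float factor, int size) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < size) {
    data[idx] *= factor;
  }
}

// After: auto deduces unsigned int, matching the type of the built-in
// variables and avoiding the implicit unsigned-to-signed conversion.
__global__ void scale_kernel_after(float* data, float factor, int size) {
  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < size) {
    data[idx] *= factor;
  }
}
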
10 changes: 5 additions & 5 deletions fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu
@@ -66,23 +66,23 @@ __device__ half float_to_sto_half_assemblefloat(float w, uint8_t rand) {
}

__global__ void convert_float_to_half_direct(half* dst, float* src, int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_direct(src[idx]);
}
}

__global__ void
convert_float_to_half_bitcarry(half* dst, float* src, int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_bitcarry(src[idx]);
}
}

__global__ void
convert_float_to_half_shortrand(half* dst, float* src, uint8_t* r, int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_shortrand(src[idx], r[idx]);
}
@@ -93,7 +93,7 @@ __global__ void convert_float_to_half_assemblefloat(
float* src,
uint8_t* r,
int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_assemblefloat(src[idx], r[idx]);
}
@@ -128,7 +128,7 @@ void gen_8bit_random(uint8_t* d_random_number, int test_size) {
}

__global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
const char val = d_flush[idx];
if (do_write * val) {
d_flush2[idx] = val;
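
For reference, the deduced type of these linear thread indices is unsigned int, since threadIdx, blockIdx, and blockDim expose unsigned int components; with a non-negative size argument the idx < size guard therefore behaves as before, just without the intermediate conversion to signed int. A compile-time check of the deduction (illustrative only, not part of the PR):

#include <type_traits>

__global__ void check_index_type() {
  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
  // The built-ins are unsigned int, so the deduced type is const unsigned int.
  static_assert(
      std::is_same<decltype(idx), const unsigned int>::value,
      "linear thread index deduces to unsigned int");
}
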
@@ -30,7 +30,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru
pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
dense_indices) {
// uint32_t capacity = hash_table.size(0);
- const int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
const int32_t t = b_t / B;
const int32_t b = b_t % B;
if (b_t >= B * T) {
@@ -46,7 +46,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru

if (capacity == 0) {
// No pruning applied on the indices associated with this table.
- for (int32_t l = threadIdx.x; l < L; l += blockDim.x) {
+ for (auto l = threadIdx.x; l < L; l += blockDim.x) {
dense_indices[indices_start + l] = indices[indices_start + l];
}
return;
@@ -115,7 +115,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru
const int32_t T,
pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
dense_indices) {
- const int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
const int32_t t = b_t / B;
const int32_t b = b_t % B;
if (b_t >= B * T) {
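
These pruned-lookup kernels linearize the (table, sample) pair into a single index b_t and recover the table index t and the within-batch index b by division and modulo against the batch size B. A host-side sketch of that decomposition, with illustrative values for B and T (not taken from the PR):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t B = 128;  // samples per table (assumed value)
  const int32_t T = 4;    // number of embedding tables (assumed value)
  // b_t mirrors the unsigned type deduced from blockIdx.x * blockDim.y + threadIdx.y.
  for (uint32_t b_t = 0; b_t < static_cast<uint32_t>(B * T); ++b_t) {
    const int32_t t = b_t / B;  // which table
    const int32_t b = b_t % B;  // which sample within that table's batch
    assert(t >= 0 && t < T);
    assert(b >= 0 && b < B);
  }
  return 0;
}
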
@@ -57,7 +57,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
{% else %}
const int32_t B = (offsets.size(0) - 1) / T;
{% endif %}
- const int32_t bb_t = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto bb_t = blockIdx.x * blockDim.y + threadIdx.y;
if (bb_t >= fd_B.D() * T) {
return;
}
@@ -321,7 +321,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
// We shift back by 4/8/16 elements to remove the first 4 Bytes (which is garbage due to
// the scale/shift handling).
// Reason: to avoid divergence the first thread in the warp computes garbage.
- const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ const auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
if (output_d >= 0 && output_d < D) {
const int num_valid_outputs = min(static_cast<int>(D - output_d), static_cast<int>({{ (32 // emb_weight_type.bit_width) }}));
@@ -337,7 +337,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
float2 qparams;
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
VecNT<{{ (32 // emb_weight_type.bit_width) }}, PrimitiveType::{{ emb_weight_type.primitive_type }}> acc(v{% if emb_weight_type.primitive_type == "INT" %}, shift_scale {% elif emb_weight_type.enum_name == "FP8" %}, exponent_bits, exponent_bias {% endif %});
if (output_d >= 0 && output_d < D) {
@@ -348,7 +348,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
qparams = warp_find_qparams(thread_local_min, thread_local_max);
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ const auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
if (output_d >= 0 && output_d < D) {
const int num_valid_outputs = min(static_cast<int>(D - output_d), static_cast<int>({{ (32 // emb_weight_type.bit_width) }}));
@@ -368,7 +368,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
{% if not nobag %}
// In case of PackedMode, computes the packed bag index during store stage w.r.t.
// the real number of uints in the rows.
- const int32_t packed_bag_store_idx = PackedMode ? threadIdx.x / uints_per_row : 0;
+ const auto packed_bag_store_idx = PackedMode ? threadIdx.x / uints_per_row : 0;

#pragma unroll OutputRowsPerThread
for (uint32_t i = 0; i < OutputRowsPerThread; ++i) {
@@ -378,7 +378,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
if constexpr (std::is_same_v<output_t, float> || std::is_same_v<output_t, at::Half> || std::is_same_v<output_t, at::BFloat16>) {
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
if constexpr (PackedMode) {
// Offset global output_d index with the size of outputs per bag w.r.t. current
// packed bag index
@@ -408,7 +408,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
float2 qparams;
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
accumulators[i][j].mul(inv_L);
if (output_d >= 0 && output_d < D) {
thread_local_max = max(thread_local_max, float{{ (32 // emb_weight_type.bit_width) }}_max(accumulators[i][j].acc));
@@ -421,7 +421,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
const int output_D_end = output_D_start + D;
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ const auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
if (output_d >= 0 && output_d < D) {
const int num_valid_outputs = min(static_cast<int>(D - output_d), static_cast<int>({{ (32 // emb_weight_type.bit_width) }}));
accumulators[i][j].store(&output[b][output_D_start + output_d], qparams, num_valid_outputs);
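
One subtlety with auto in expressions like kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding: because j and threadIdx.x are unsigned, the whole expression is unsigned, so a result that would have been negative wraps to a large value instead. The output_d < D upper-bound check still rejects it (for a non-negative D), while the output_d >= 0 half of the guard becomes trivially true. A self-contained host-side model of that arithmetic; all types and values here are assumptions for illustration, not taken from the templates:

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kWarpSize = 32;         // assumed
  constexpr int kOutputsPerThread = 4;  // assumed
  const uint32_t j = 0;
  const unsigned int thread_x = 0;      // stands in for threadIdx.x
  const int32_t D_padding = 4;          // assumed
  const int32_t D = 64;                 // assumed

  // Mixed signed/unsigned arithmetic: the deduced type is unsigned int,
  // so 0 - D_padding wraps around instead of becoming -4.
  const auto output_d =
      kWarpSize * j * kOutputsPerThread + thread_x * kOutputsPerThread - D_padding;

  // "output_d >= 0" would now always hold; the upper bound still filters
  // the wrapped value because it is far larger than D.
  if (output_d < static_cast<unsigned int>(D)) {
    std::printf("in range: %u\n", output_d);
  } else {
    std::printf("out of range (wrapped): %u\n", output_d);
  }
  return 0;
}
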
@@ -119,7 +119,7 @@ DEVICE_INLINE void compute_grad_sum_{{ kdesc }}(
}

for (int32_t sl = sl_start; sl < sl_end; sl += kThreadGroupSize) {
- int32_t sl_j = sl + threadIdx.x;
+ auto sl_j = sl + threadIdx.x;
{%- if not nobag %}
const auto b_t = sl_j < sl_end
? reinterpret_cast<const uint32_t*>(
@@ -158,7 +158,7 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel(
{% endif %}
) {
int32_t T = D_offsets.size(0) - 1;
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
[[maybe_unused]] int32_t b;
int32_t t;
const auto total_B = offsets.size(0) - 1;
@@ -194,13 +194,13 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel(
grad_t* shifted_grad_output_mean = &grad_output_mean[grad_outer_offset][grad_offset];

if (L != 0) {
- for (int32_t d = threadIdx.x; d * 4 < D; d += blockDim.x) {
+ for (auto d = threadIdx.x; d * 4 < D; d += blockDim.x) {
Vec4T<grad_t> grad_out_vec(&shifted_grad_output[d * 4]);
grad_out_vec.mul_(1.0 / L);
grad_out_vec.store(&shifted_grad_output_mean[d * 4]);
}
} else {
- for (int32_t d = threadIdx.x; d * 4 < D; d += blockDim.x) {
+ for (auto d = threadIdx.x; d * 4 < D; d += blockDim.x) {
Vec4T<grad_t> grad_out_vec(&shifted_grad_output[d * 4]);
grad_out_vec.store(&shifted_grad_output_mean[d * 4]);
}
@@ -98,7 +98,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
constexpr int32_t kVecWidth = 4;

int32_t T = D_offsets.size(0) - 1;
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
if (b_t >= offsets.size(0) - 1) {
return;
}
@@ -173,7 +173,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
}

for (int32_t l_start = 0; l_start < L; l_start += kWarpSize) {
- int32_t l = l_start + threadIdx.x;
+ auto l = l_start + threadIdx.x;
index_t idx = l < L ? indices[indices_start + l] : 0;
{%- if not dense %}
const auto {{ locs_or_addrs_idx }} =
@@ -173,8 +173,8 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
constexpr auto kIsInt8 = std::is_same<emb_t, uint8_t>::value;
int32_t T = weights_offsets.size(0);
const int32_t num_long_runs = num_long_run_ids[0];
- const int32_t warp_id = threadIdx.y;
- const int32_t lane_id = threadIdx.x;
+ const auto warp_id = threadIdx.y;
+ const auto lane_id = threadIdx.x;

// Copy value to max_vecs to make max_vecs_per_thread known at compile time
// when kUseVecBlocking == false
@@ -187,7 +187,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
{%- if is_gwd_kernel %}
const float weight_decay_base = 1 - learning_rate * weight_decay;
{%- endif %}
- for (int32_t long_run_id = blockIdx.x; long_run_id < num_long_runs; long_run_id += gridDim.x) {
+ for (auto long_run_id = blockIdx.x; long_run_id < num_long_runs; long_run_id += gridDim.x) {
// The first thread block in the really long run has run_id in long_run_ids
// and the rest have the negative of its offset (see find_long_segments kernel).
int32_t cta_rank_on_current_run = 0;
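
The cta-per-row backward kernel walks the long-run list with a grid-stride loop; with auto, long_run_id takes the unsigned int type of blockIdx.x, and the comparison against the signed run count stays well-defined as long as that count is non-negative. A minimal standalone sketch of the same grid-stride shape; the kernel name and the placeholder work are illustrative, not from the PR:

#include <cstdint>

__global__ void long_run_loop(const int32_t* num_long_run_ids, int32_t* out) {
  const int32_t num_long_runs = num_long_run_ids[0];
  // long_run_id deduces to unsigned int, the type of blockIdx.x.
  for (auto long_run_id = blockIdx.x; long_run_id < num_long_runs;
       long_run_id += gridDim.x) {
    // Placeholder work: record which block handled each long run.
    out[long_run_id] = static_cast<int32_t>(blockIdx.x);
  }
}
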
@@ -145,7 +145,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
{%- else %}
int32_t T = weights_offsets.size(0);
{%- endif %}
- const int32_t start_run_id = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto start_run_id = blockIdx.x * blockDim.y + threadIdx.y;
{%- if is_gwd_kernel %}
const float weight_decay_base = 1 - learning_rate * weight_decay;
{%- endif %}
@@ -68,7 +68,7 @@ batch_index_select_dim0_codegen_forward_small_kernel(
pta::PackedTensorAccessor64<output_t, {{ "1" if is_index_select else "2" }}, at::RestrictPtrTraits> output
) {
int32_t T = weights_offsets.size(0);
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
{%- if not is_index_select %}
if (b_t >= offsets.size(0) - 1) {
return;
@@ -128,12 +128,12 @@ batch_index_select_dim0_codegen_forward_small_kernel(
D_emb += kINT8QparamsBytes;
}

- const int32_t group_start = threadIdx.x / kThreadGroupSize * kThreadGroupSize;
+ const auto group_start = threadIdx.x / kThreadGroupSize * kThreadGroupSize;
const int32_t group_end = group_start + kThreadGroupSize;
- const int32_t d = threadIdx.x % kThreadGroupSize * 4;
+ const auto d = threadIdx.x % kThreadGroupSize * 4;

for (int32_t l_start = 0; l_start < L; l_start += kWarpSize) {
- int32_t l = l_start + threadIdx.x;
+ auto l = l_start + threadIdx.x;
int64_t idx = l < L ? indices[indices_start + l] : 0;
{%- if not dense %}
const {{ locs_or_addrs_type }} {{ locs_or_addrs_idx }} =
@@ -148,7 +148,7 @@ using namespace fbgemm_gpu;

{%- else %}
for (int32_t i = 0; i < D; i += kThreadGroupSize * VEC_WIDTH) {
- const int32_t d = i + threadIdx.x * VEC_WIDTH;
+ const auto d = i + threadIdx.x * VEC_WIDTH;
if (d < D) {
// Since there is no pooling, simply copy the weights to output
const auto weights_slice = weights_row.load(d, qparams);
@@ -253,7 +253,7 @@ using namespace fbgemm_gpu;

{%- else %}
for (int32_t i = 0; i < D; i += kThreadGroupSize * VEC_WIDTH) {
- const int32_t d = i + threadIdx.x * VEC_WIDTH;
+ const auto d = i + threadIdx.x * VEC_WIDTH;
if (d < D) {
// Since there is no pooling, simply copy the weights to output
const auto weights_slice = weights_row.load(d, qparams);
@@ -328,7 +328,7 @@ using namespace fbgemm_gpu;
// Iterate over each kThreadGroupSize-sized subset of L indices in the bag
for (int32_t l_start = 0; l_start < L; l_start += kThreadGroupSize) {
// Determine the L index that this thread will load data from in cooperative load
- int32_t l = l_start + threadIdx.x;
+ auto l = l_start + threadIdx.x;

{%- if dense or lxu_miss_rate != "cache_conflict_miss_rate::zero" %}
// Cooperatively load the indices
@@ -682,7 +682,7 @@ batch_index_select_dim0_codegen_forward_kernel(
{%- endif %}

// Determine the linearized warp ID, and exit early if needed
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
{%- if not is_index_select %}
if (b_t >= offsets.size(0) - 1) {
return;