Fix CUDA kernel index data type in deeplearning/fbgemm/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu +10 #3845

Closed · wants to merge 1 commit
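
The diff below applies one mechanical pattern: explicitly typed CUDA thread/block index variables (int, int32_t) are replaced with auto, so each variable takes the unsigned type of the built-in index expression instead of narrowing it to a signed 32-bit integer. A minimal before/after sketch of the pattern, using a hypothetical scale_kernel that is not part of this PR:

// Before: blockIdx.x, blockDim.x, and threadIdx.x are unsigned int, so the
// product-and-sum is unsigned and is then implicitly converted to signed int.
__global__ void scale_kernel_before(float* data, float factor, int size) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < size) {
    data[idx] *= factor;
  }
}

// After: auto deduces unsigned int, matching the type of the built-in
// variables and avoiding the implicit unsigned-to-signed conversion.
__global__ void scale_kernel_after(float* data, float factor, int size) {
  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < size) {
    data[idx] *= factor;
  }
}
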
10 changes: 5 additions & 5 deletions fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu
@@ -66,23 +66,23 @@ __device__ half float_to_sto_half_assemblefloat(float w, uint8_t rand) {
}

__global__ void convert_float_to_half_direct(half* dst, float* src, int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_direct(src[idx]);
}
}

__global__ void
convert_float_to_half_bitcarry(half* dst, float* src, int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_bitcarry(src[idx]);
}
}

__global__ void
convert_float_to_half_shortrand(half* dst, float* src, uint8_t* r, int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_shortrand(src[idx], r[idx]);
}
@@ -93,7 +93,7 @@ __global__ void convert_float_to_half_assemblefloat(
float* src,
uint8_t* r,
int size) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
dst[idx] = float_to_sto_half_assemblefloat(src[idx], r[idx]);
}
@@ -128,7 +128,7 @@ void gen_8bit_random(uint8_t* d_random_number, int test_size) {
}

__global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
const char val = d_flush[idx];
if (do_write * val) {
d_flush2[idx] = val;
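
For reference, the deduced type of these linear thread indices is unsigned int, since threadIdx, blockIdx, and blockDim expose unsigned int components; with a non-negative size argument the idx < size guard therefore behaves as before, just without the intermediate conversion to signed int. A compile-time check of the deduction (illustrative only, not part of the PR):

#include <type_traits>

__global__ void check_index_type() {
  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
  // The built-ins are unsigned int, so the deduced type is const unsigned int.
  static_assert(
      std::is_same<decltype(idx), const unsigned int>::value,
      "linear thread index deduces to unsigned int");
}
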
@@ -30,7 +30,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru
pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
dense_indices) {
// uint32_t capacity = hash_table.size(0);
- const int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
const int32_t t = b_t / B;
const int32_t b = b_t % B;
if (b_t >= B * T) {
@@ -46,7 +46,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru

if (capacity == 0) {
// No pruning applied on the indices associated with this table.
- for (int32_t l = threadIdx.x; l < L; l += blockDim.x) {
+ for (auto l = threadIdx.x; l < L; l += blockDim.x) {
dense_indices[indices_start + l] = indices[indices_start + l];
}
return;
@@ -115,7 +115,7 @@ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pru
const int32_t T,
pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
dense_indices) {
- const int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
const int32_t t = b_t / B;
const int32_t b = b_t % B;
if (b_t >= B * T) {
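
These pruned-lookup kernels linearize the (table, sample) pair into a single index b_t and recover the table index t and the within-batch index b by division and modulo against the batch size B. A host-side sketch of that decomposition, with illustrative values for B and T (not taken from the PR):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t B = 128;  // samples per table (assumed value)
  const int32_t T = 4;    // number of embedding tables (assumed value)
  // b_t mirrors the unsigned type deduced from blockIdx.x * blockDim.y + threadIdx.y.
  for (uint32_t b_t = 0; b_t < static_cast<uint32_t>(B * T); ++b_t) {
    const int32_t t = b_t / B;  // which table
    const int32_t b = b_t % B;  // which sample within that table's batch
    assert(t >= 0 && t < T);
    assert(b >= 0 && b < B);
  }
  return 0;
}
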
@@ -57,7 +57,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
{% else %}
const int32_t B = (offsets.size(0) - 1) / T;
{% endif %}
- const int32_t bb_t = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto bb_t = blockIdx.x * blockDim.y + threadIdx.y;
if (bb_t >= fd_B.D() * T) {
return;
}
@@ -321,7 +321,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
// We shift back by 4/8/16 elements to remove the first 4 Bytes (which is garbage due to
// the scale/shift handling).
// Reason: to avoid divergence the first thread in the warp computes garbage.
- const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ const auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
if (output_d >= 0 && output_d < D) {
const int num_valid_outputs = min(static_cast<int>(D - output_d), static_cast<int>({{ (32 // emb_weight_type.bit_width) }}));
@@ -337,7 +337,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
float2 qparams;
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
VecNT<{{ (32 // emb_weight_type.bit_width) }}, PrimitiveType::{{ emb_weight_type.primitive_type }}> acc(v{% if emb_weight_type.primitive_type == "INT" %}, shift_scale {% elif emb_weight_type.enum_name == "FP8" %}, exponent_bits, exponent_bias {% endif %});
if (output_d >= 0 && output_d < D) {
@@ -348,7 +348,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
qparams = warp_find_qparams(thread_local_min, thread_local_max);
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ const auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
scalar_t v = reinterpret_cast<const scalar_t*>(row)[kWarpSize * j + threadIdx.x];
if (output_d >= 0 && output_d < D) {
const int num_valid_outputs = min(static_cast<int>(D - output_d), static_cast<int>({{ (32 // emb_weight_type.bit_width) }}));
@@ -368,7 +368,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
{% if not nobag %}
// In case of PackedMode, computes the packed bag index during store stage w.r.t.
// the real number of uints in the rows.
- const int32_t packed_bag_store_idx = PackedMode ? threadIdx.x / uints_per_row : 0;
+ const auto packed_bag_store_idx = PackedMode ? threadIdx.x / uints_per_row : 0;

#pragma unroll OutputRowsPerThread
for (uint32_t i = 0; i < OutputRowsPerThread; ++i) {
@@ -378,7 +378,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
if constexpr (std::is_same_v<output_t, float> || std::is_same_v<output_t, at::Half> || std::is_same_v<output_t, at::BFloat16>) {
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
if constexpr (PackedMode) {
// Offset global output_d index with the size of outputs per bag w.r.t. current
// packed bag index
@@ -408,7 +408,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
float2 qparams;
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
accumulators[i][j].mul(inv_L);
if (output_d >= 0 && output_d < D) {
thread_local_max = max(thread_local_max, float{{ (32 // emb_weight_type.bit_width) }}_max(accumulators[i][j].acc));
@@ -421,7 +421,7 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
const int output_D_end = output_D_start + D;
#pragma unroll AccumulateStoreRequests
for (uint32_t j = 0; j < AccumulateStoreRequests; ++j) {
- const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
+ const auto output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding;
if (output_d >= 0 && output_d < D) {
const int num_valid_outputs = min(static_cast<int>(D - output_d), static_cast<int>({{ (32 // emb_weight_type.bit_width) }}));
accumulators[i][j].store(&output[b][output_D_start + output_d], qparams, num_valid_outputs);
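
One subtlety with auto in expressions like kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding: because j and threadIdx.x are unsigned, the whole expression is unsigned, so a result that would have been negative wraps to a large value instead. The output_d < D upper-bound check still rejects it (for a non-negative D), while the output_d >= 0 half of the guard becomes trivially true. A self-contained host-side model of that arithmetic; all types and values here are assumptions for illustration, not taken from the templates:

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kWarpSize = 32;         // assumed
  constexpr int kOutputsPerThread = 4;  // assumed
  const uint32_t j = 0;
  const unsigned int thread_x = 0;      // stands in for threadIdx.x
  const int32_t D_padding = 4;          // assumed
  const int32_t D = 64;                 // assumed

  // Mixed signed/unsigned arithmetic: the deduced type is unsigned int,
  // so 0 - D_padding wraps around instead of becoming -4.
  const auto output_d =
      kWarpSize * j * kOutputsPerThread + thread_x * kOutputsPerThread - D_padding;

  // "output_d >= 0" would now always hold; the upper bound still filters
  // the wrapped value because it is far larger than D.
  if (output_d < static_cast<unsigned int>(D)) {
    std::printf("in range: %u\n", output_d);
  } else {
    std::printf("out of range (wrapped): %u\n", output_d);
  }
  return 0;
}
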
@@ -119,7 +119,7 @@ DEVICE_INLINE void compute_grad_sum_{{ kdesc }}(
}

for (int32_t sl = sl_start; sl < sl_end; sl += kThreadGroupSize) {
- int32_t sl_j = sl + threadIdx.x;
+ auto sl_j = sl + threadIdx.x;
{%- if not nobag %}
const auto b_t = sl_j < sl_end
? reinterpret_cast<const uint32_t*>(
@@ -158,7 +158,7 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel(
{% endif %}
) {
int32_t T = D_offsets.size(0) - 1;
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
[[maybe_unused]] int32_t b;
int32_t t;
const auto total_B = offsets.size(0) - 1;
@@ -194,13 +194,13 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel(
grad_t* shifted_grad_output_mean = &grad_output_mean[grad_outer_offset][grad_offset];

if (L != 0) {
- for (int32_t d = threadIdx.x; d * 4 < D; d += blockDim.x) {
+ for (auto d = threadIdx.x; d * 4 < D; d += blockDim.x) {
Vec4T<grad_t> grad_out_vec(&shifted_grad_output[d * 4]);
grad_out_vec.mul_(1.0 / L);
grad_out_vec.store(&shifted_grad_output_mean[d * 4]);
}
} else {
- for (int32_t d = threadIdx.x; d * 4 < D; d += blockDim.x) {
+ for (auto d = threadIdx.x; d * 4 < D; d += blockDim.x) {
Vec4T<grad_t> grad_out_vec(&shifted_grad_output[d * 4]);
grad_out_vec.store(&shifted_grad_output_mean[d * 4]);
}
@@ -98,7 +98,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
constexpr int32_t kVecWidth = 4;

int32_t T = D_offsets.size(0) - 1;
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
if (b_t >= offsets.size(0) - 1) {
return;
}
@@ -173,7 +173,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
}

for (int32_t l_start = 0; l_start < L; l_start += kWarpSize) {
- int32_t l = l_start + threadIdx.x;
+ auto l = l_start + threadIdx.x;
index_t idx = l < L ? indices[indices_start + l] : 0;
{%- if not dense %}
const auto {{ locs_or_addrs_idx }} =
@@ -173,8 +173,8 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
constexpr auto kIsInt8 = std::is_same<emb_t, uint8_t>::value;
int32_t T = weights_offsets.size(0);
const int32_t num_long_runs = num_long_run_ids[0];
- const int32_t warp_id = threadIdx.y;
- const int32_t lane_id = threadIdx.x;
+ const auto warp_id = threadIdx.y;
+ const auto lane_id = threadIdx.x;

// Copy value to max_vecs to make max_vecs_per_thread known at compile time
// when kUseVecBlocking == false
@@ -187,7 +187,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
{%- if is_gwd_kernel %}
const float weight_decay_base = 1 - learning_rate * weight_decay;
{%- endif %}
- for (int32_t long_run_id = blockIdx.x; long_run_id < num_long_runs; long_run_id += gridDim.x) {
+ for (auto long_run_id = blockIdx.x; long_run_id < num_long_runs; long_run_id += gridDim.x) {
// The first thread block in the really long run has run_id in long_run_ids
// and the rest have the negative of its offset (see find_long_segments kernel).
int32_t cta_rank_on_current_run = 0;
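
The cta-per-row backward kernel walks the long-run list with a grid-stride loop; with auto, long_run_id takes the unsigned int type of blockIdx.x, and the comparison against the signed run count stays well-defined as long as that count is non-negative. A minimal standalone sketch of the same grid-stride shape; the kernel name and the placeholder work are illustrative, not from the PR:

#include <cstdint>

__global__ void long_run_loop(const int32_t* num_long_run_ids, int32_t* out) {
  const int32_t num_long_runs = num_long_run_ids[0];
  // long_run_id deduces to unsigned int, the type of blockIdx.x.
  for (auto long_run_id = blockIdx.x; long_run_id < num_long_runs;
       long_run_id += gridDim.x) {
    // Placeholder work: record which block handled each long run.
    out[long_run_id] = static_cast<int32_t>(blockIdx.x);
  }
}
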
@@ -145,7 +145,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
{%- else %}
int32_t T = weights_offsets.size(0);
{%- endif %}
- const int32_t start_run_id = blockIdx.x * blockDim.y + threadIdx.y;
+ const auto start_run_id = blockIdx.x * blockDim.y + threadIdx.y;
{%- if is_gwd_kernel %}
const float weight_decay_base = 1 - learning_rate * weight_decay;
{%- endif %}
@@ -68,7 +68,7 @@ batch_index_select_dim0_codegen_forward_small_kernel(
pta::PackedTensorAccessor64<output_t, {{ "1" if is_index_select else "2" }}, at::RestrictPtrTraits> output
) {
int32_t T = weights_offsets.size(0);
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
{%- if not is_index_select %}
if (b_t >= offsets.size(0) - 1) {
return;
@@ -128,12 +128,12 @@ batch_index_select_dim0_codegen_forward_small_kernel(
D_emb += kINT8QparamsBytes;
}

- const int32_t group_start = threadIdx.x / kThreadGroupSize * kThreadGroupSize;
+ const auto group_start = threadIdx.x / kThreadGroupSize * kThreadGroupSize;
const int32_t group_end = group_start + kThreadGroupSize;
- const int32_t d = threadIdx.x % kThreadGroupSize * 4;
+ const auto d = threadIdx.x % kThreadGroupSize * 4;

for (int32_t l_start = 0; l_start < L; l_start += kWarpSize) {
- int32_t l = l_start + threadIdx.x;
+ auto l = l_start + threadIdx.x;
int64_t idx = l < L ? indices[indices_start + l] : 0;
{%- if not dense %}
const {{ locs_or_addrs_type }} {{ locs_or_addrs_idx }} =
@@ -148,7 +148,7 @@ using namespace fbgemm_gpu;

{%- else %}
for (int32_t i = 0; i < D; i += kThreadGroupSize * VEC_WIDTH) {
- const int32_t d = i + threadIdx.x * VEC_WIDTH;
+ const auto d = i + threadIdx.x * VEC_WIDTH;
if (d < D) {
// Since there is no pooling, simply copy the weights to output
const auto weights_slice = weights_row.load(d, qparams);
@@ -253,7 +253,7 @@ using namespace fbgemm_gpu;

{%- else %}
for (int32_t i = 0; i < D; i += kThreadGroupSize * VEC_WIDTH) {
- const int32_t d = i + threadIdx.x * VEC_WIDTH;
+ const auto d = i + threadIdx.x * VEC_WIDTH;
if (d < D) {
// Since there is no pooling, simply copy the weights to output
const auto weights_slice = weights_row.load(d, qparams);
@@ -328,7 +328,7 @@ using namespace fbgemm_gpu;
// Iterate over each kThreadGroupSize-sized subset of L indices in the bag
for (int32_t l_start = 0; l_start < L; l_start += kThreadGroupSize) {
// Determine the L index that this thread will load data from in cooperative load
- int32_t l = l_start + threadIdx.x;
+ auto l = l_start + threadIdx.x;

{%- if dense or lxu_miss_rate != "cache_conflict_miss_rate::zero" %}
// Cooperatively load the indices
@@ -682,7 +682,7 @@ batch_index_select_dim0_codegen_forward_kernel(
{%- endif %}

// Determine the linearized warp ID, and exit early if needed
- int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
+ auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
{%- if not is_index_select %}
if (b_t >= offsets.size(0) - 1) {
return;