Skip to content

Commit 6eac042

Browse files
junpeng.li (junpeng0715)
authored and committed
modify GPU data type to fit int64
1 parent b32427a commit 6eac042

File tree

5 files changed

+18
-13
lines changed

5 files changed

+18
-13
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,11 @@ if(USE_SWIG)
488488
COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com
489489
)
490490
else()
491+
add_custom_command(
492+
TARGET _lightgbm_swig_swig_compilation
493+
POST_BUILD
494+
COMMAND sed -i 's/long long const/int64_t const/g' java/lightgbmlibJAVA_wrap.cxx
495+
)
491496
add_custom_command(
492497
TARGET _lightgbm_swig
493498
POST_BUILD

src/io/train_share_states.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ void MultiValBinWrapper::HistMerge(std::vector<hist_t,
7575
const data_size_t end = std::min<data_size_t>(start + bin_block_size, num_bin_);
7676
for (data_size_t tid = 1; tid < n_data_block_; ++tid) {
7777
auto src_ptr = hist_buf->data() + static_cast<size_t>(num_bin_aligned_) * 2 * (tid - 1);
78-
for (int i = start * 2; i < end * 2; ++i) {
78+
for (data_size_t i = start * 2; i < end * 2; ++i) {
7979
dst[i] += src_ptr[i];
8080
}
8181
}

src/treelearner/cuda_tree_learner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ class CUDATreeLearner: public SerialTreeLearner {
235235
std::vector<float> kernel_time_; // measure histogram kernel time
236236
std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
237237
int num_gpu_;
238-
int allocated_num_data_; // allocated data instances
238+
data_size_t allocated_num_data_; // allocated data instances
239239
pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu
240240
};
241241

src/treelearner/gpu_tree_learner.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
233233
num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
234234
// leave some safe margin for prefetching
235235
// 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
236-
int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
236+
data_size_t allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
237237
// clear sparse/dense maps
238238
dense_feature_group_map_.clear();
239239
device_bin_mults_.clear();
@@ -391,7 +391,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
391391
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
392392
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
393393
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
394-
for (int j = 0; j < num_data_; ++j) {
394+
for (data_size_t j = 0; j < num_data_; ++j) {
395395
host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
396396
|((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
397397
host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
@@ -409,13 +409,13 @@ void GPUTreeLearner::AllocateGPUMemory() {
409409
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
410410
// Dense bin
411411
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
412-
for (int j = 0; j < num_data_; ++j) {
412+
for (data_size_t j = 0; j < num_data_; ++j) {
413413
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
414414
}
415415
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
416416
// Dense 4-bit bin
417417
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
418-
for (int j = 0; j < num_data_; ++j) {
418+
for (data_size_t j = 0; j < num_data_; ++j) {
419419
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
420420
}
421421
} else {
@@ -452,7 +452,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
452452
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
453453
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
454454
#pragma omp parallel for schedule(static)
455-
for (int j = 0; j < num_data_; ++j) {
455+
for (data_size_t j = 0; j < num_data_; ++j) {
456456
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
457457
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
458458
<< ((i & 1) << 2));
@@ -465,14 +465,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
465465
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
466466
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
467467
#pragma omp parallel for schedule(static)
468-
for (int j = 0; j < num_data_; ++j) {
468+
for (data_size_t j = 0; j < num_data_; ++j) {
469469
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
470470
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
471471
}
472472
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
473473
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
474474
#pragma omp parallel for schedule(static)
475-
for (int j = 0; j < num_data_; ++j) {
475+
for (data_size_t j = 0; j < num_data_; ++j) {
476476
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
477477
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
478478
}
@@ -486,15 +486,15 @@ void GPUTreeLearner::AllocateGPUMemory() {
486486
// fill the leftover features
487487
if (dword_features_ == 8) {
488488
#pragma omp parallel for schedule(static)
489-
for (int j = 0; j < num_data_; ++j) {
489+
for (data_size_t j = 0; j < num_data_; ++j) {
490490
for (int i = k; i < dword_features_; ++i) {
491491
// fill this empty feature with some "random" value
492492
host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
493493
}
494494
}
495495
} else if (dword_features_ == 4) {
496496
#pragma omp parallel for schedule(static)
497-
for (int j = 0; j < num_data_; ++j) {
497+
for (data_size_t j = 0; j < num_data_; ++j) {
498498
for (int i = k; i < dword_features_; ++i) {
499499
// fill this empty feature with some "random" value
500500
host4[j].s[i] = (uint8_t)j;

src/treelearner/ocl/histogram256.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ typedef uint acc_int_type;
7171
#define AMD_USE_DS_ADD_F32 0
7272
#endif
7373

74-
typedef uint data_size_t;
74+
typedef signed long int data_size_t;
7575
typedef float score_t;
7676

7777

@@ -439,7 +439,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
439439
R""()
440440
*/
441441
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
442-
for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
442+
for (data_size_t i = subglobal_tid; i < num_data; i += subglobal_size) {
443443
// prefetch the next iteration variables
444444
// we don't need boundary check because we have made the buffer larger
445445
stat1_next = ordered_gradients[i + subglobal_size];

0 commit comments

Comments (0)