Skip to content

Commit 6eac042

Browse files
junpeng.li (junpeng0715)
authored and committed
modify GPU data type to fit int64
1 parent b32427a commit 6eac042

File tree

5 files changed

+18
-13
lines changed

5 files changed

+18
-13
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,11 @@ if(USE_SWIG)
488488
COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com
489489
)
490490
else()
491+
add_custom_command(
492+
TARGET _lightgbm_swig_swig_compilation
493+
POST_BUILD
494+
COMMAND sed -i 's/long long const/int64_t const/g' java/lightgbmlibJAVA_wrap.cxx
495+
)
491496
add_custom_command(
492497
TARGET _lightgbm_swig
493498
POST_BUILD

src/io/train_share_states.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ void MultiValBinWrapper::HistMerge(std::vector<hist_t,
7575
const data_size_t end = std::min<data_size_t>(start + bin_block_size, num_bin_);
7676
for (data_size_t tid = 1; tid < n_data_block_; ++tid) {
7777
auto src_ptr = hist_buf->data() + static_cast<size_t>(num_bin_aligned_) * 2 * (tid - 1);
78-
for (int i = start * 2; i < end * 2; ++i) {
78+
for (data_size_t i = start * 2; i < end * 2; ++i) {
7979
dst[i] += src_ptr[i];
8080
}
8181
}

src/treelearner/cuda_tree_learner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ class CUDATreeLearner: public SerialTreeLearner {
235235
std::vector<float> kernel_time_; // measure histogram kernel time
236236
std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
237237
int num_gpu_;
238-
int allocated_num_data_; // allocated data instances
238+
data_size_t allocated_num_data_; // allocated data instances
239239
pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu
240240
};
241241

src/treelearner/gpu_tree_learner.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
233233
num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
234234
// leave some safe margin for prefetching
235235
// 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
236-
int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
236+
data_size_t allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
237237
// clear sparse/dense maps
238238
dense_feature_group_map_.clear();
239239
device_bin_mults_.clear();
@@ -391,7 +391,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
391391
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
392392
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
393393
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
394-
for (int j = 0; j < num_data_; ++j) {
394+
for (data_size_t j = 0; j < num_data_; ++j) {
395395
host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
396396
|((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
397397
host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
@@ -409,13 +409,13 @@ void GPUTreeLearner::AllocateGPUMemory() {
409409
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
410410
// Dense bin
411411
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
412-
for (int j = 0; j < num_data_; ++j) {
412+
for (data_size_t j = 0; j < num_data_; ++j) {
413413
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
414414
}
415415
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
416416
// Dense 4-bit bin
417417
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
418-
for (int j = 0; j < num_data_; ++j) {
418+
for (data_size_t j = 0; j < num_data_; ++j) {
419419
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
420420
}
421421
} else {
@@ -452,7 +452,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
452452
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
453453
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
454454
#pragma omp parallel for schedule(static)
455-
for (int j = 0; j < num_data_; ++j) {
455+
for (data_size_t j = 0; j < num_data_; ++j) {
456456
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
457457
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
458458
<< ((i & 1) << 2));
@@ -465,14 +465,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
465465
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
466466
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
467467
#pragma omp parallel for schedule(static)
468-
for (int j = 0; j < num_data_; ++j) {
468+
for (data_size_t j = 0; j < num_data_; ++j) {
469469
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
470470
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
471471
}
472472
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
473473
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
474474
#pragma omp parallel for schedule(static)
475-
for (int j = 0; j < num_data_; ++j) {
475+
for (data_size_t j = 0; j < num_data_; ++j) {
476476
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
477477
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
478478
}
@@ -486,15 +486,15 @@ void GPUTreeLearner::AllocateGPUMemory() {
486486
// fill the leftover features
487487
if (dword_features_ == 8) {
488488
#pragma omp parallel for schedule(static)
489-
for (int j = 0; j < num_data_; ++j) {
489+
for (data_size_t j = 0; j < num_data_; ++j) {
490490
for (int i = k; i < dword_features_; ++i) {
491491
// fill this empty feature with some "random" value
492492
host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
493493
}
494494
}
495495
} else if (dword_features_ == 4) {
496496
#pragma omp parallel for schedule(static)
497-
for (int j = 0; j < num_data_; ++j) {
497+
for (data_size_t j = 0; j < num_data_; ++j) {
498498
for (int i = k; i < dword_features_; ++i) {
499499
// fill this empty feature with some "random" value
500500
host4[j].s[i] = (uint8_t)j;

src/treelearner/ocl/histogram256.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ typedef uint acc_int_type;
7171
#define AMD_USE_DS_ADD_F32 0
7272
#endif
7373

74-
typedef uint data_size_t;
74+
typedef signed long int data_size_t;
7575
typedef float score_t;
7676

7777

@@ -439,7 +439,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
439439
R""()
440440
*/
441441
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
442-
for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
442+
for (data_size_t i = subglobal_tid; i < num_data; i += subglobal_size) {
443443
// prefetch the next iteration variables
444444
// we don't need boundary check because we have made the buffer larger
445445
stat1_next = ordered_gradients[i + subglobal_size];

0 commit comments

Comments (0)