@@ -233,7 +233,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
   // leave some safe margin for prefetching
   // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
-  int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
+  data_size_t allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
   // clear sparse/dense maps
   dense_feature_group_map_.clear();
   device_bin_mults_.clear();
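
Note on the hunk above: num_dense_feature4_ is a plain ceiling division (n dense feature groups packed dword_features_ per tuple), and the allocation is padded because every one of the 256 work-items in each of up to 2^kMaxLogWorkgroupsPerFeature workgroups may prefetch one tuple past the end of the data. A minimal sketch of the arithmetic, assuming kMaxLogWorkgroupsPerFeature is 10 as declared in gpu_tree_learner.h (an assumption; check the header for the actual constant):

    #include <cstdint>
    #include <cstdio>

    typedef int32_t data_size_t;  // LightGBM's row-count type (include/LightGBM/meta.h)

    int main() {
      const int kMaxLogWorkgroupsPerFeature = 10;  // assumed value from gpu_tree_learner.h
      data_size_t num_data = 1000000;
      // 256 work-items per workgroup, one prefetched tuple each:
      data_size_t allocated_num_data = num_data + 256 * (1 << kMaxLogWorkgroupsPerFeature);
      // prints: allocated 1262144 tuples for 1000000 rows (margin = 262144)
      std::printf("allocated %d tuples for %d rows (margin = %d)\n",
                  (int)allocated_num_data, (int)num_data,
                  (int)(allocated_num_data - num_data));
      return 0;
    }
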
@@ -391,7 +391,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
         *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
         *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
         *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
-      for (int j = 0; j < num_data_; ++j) {
+      for (data_size_t j = 0; j < num_data_; ++j) {
         host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
                        |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
         host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
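
In the 8-feature (dword_features_ == 8) path above, two 4-bit bin values share each byte of the tuple: the even feature lands in the low nibble of s[k] and the odd feature in the high nibble, via the `<< 4`. A toy illustration of the packing, with hypothetical bin values:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t low = 0x3, high = 0xA;  // two hypothetical 4-bit bin values
      uint8_t packed = (uint8_t)(low | (high << 4));
      std::printf("packed = 0x%02X\n", packed);                         // 0xA3
      std::printf("low = %d, high = %d\n", packed & 0xF, packed >> 4);  // 3, 10
      return 0;
    }
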
@@ -409,13 +409,13 @@ void GPUTreeLearner::AllocateGPUMemory() {
       if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
         // Dense bin
         DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
-        for (int j = 0; j < num_data_; ++j) {
+        for (data_size_t j = 0; j < num_data_; ++j) {
           host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
         }
       } else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
         // Dense 4-bit bin
         DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
-        for (int j = 0; j < num_data_; ++j) {
+        for (data_size_t j = 0; j < num_data_; ++j) {
           host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
         }
       } else {
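
Throughout these loops the raw bin value is scaled by a per-feature multiplier and offset by `(j + s_idx) & (mult - 1)`. Since the device_bin_mults_ entries are powers of two, the mask yields a value in [0, mult), so the stored bin stays inside its scaled slot while consecutive rows land on different sub-bins; my reading (not stated in the commit) is that this spreads atomic histogram updates across counters. A small sketch with hypothetical values, using j in place of j + s_idx:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint8_t raw_bin = 3;  // hypothetical value from RawGet(j)
      const int mult = 4;         // hypothetical power-of-two device_bin_mults_ entry
      for (int j = 0; j < 4; ++j) {
        // rows map into distinct sub-bins of slot [12, 16)
        uint8_t stored = (uint8_t)(raw_bin * mult + (j & (mult - 1)));
        std::printf("row %d -> stored bin %d\n", j, stored);  // 12, 13, 14, 15
      }
      return 0;
    }
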
@@ -452,7 +452,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
       if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
         DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
         #pragma omp parallel for schedule(static)
-        for (int j = 0; j < num_data_; ++j) {
+        for (data_size_t j = 0; j < num_data_; ++j) {
           host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                                 + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
                                 << ((i & 1) << 2));
@@ -465,14 +465,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
       if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
         DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
         #pragma omp parallel for schedule(static)
-        for (int j = 0; j < num_data_; ++j) {
+        for (data_size_t j = 0; j < num_data_; ++j) {
           host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
         }
       } else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
         DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
         #pragma omp parallel for schedule(static)
-        for (int j = 0; j < num_data_; ++j) {
+        for (data_size_t j = 0; j < num_data_; ++j) {
           host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
         }
@@ -486,15 +486,15 @@ void GPUTreeLearner::AllocateGPUMemory() {
     // fill the leftover features
     if (dword_features_ == 8) {
       #pragma omp parallel for schedule(static)
-      for (int j = 0; j < num_data_; ++j) {
+      for (data_size_t j = 0; j < num_data_; ++j) {
         for (int i = k; i < dword_features_; ++i) {
           // fill this empty feature with some "random" value
           host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
         }
       }
     } else if (dword_features_ == 4) {
       #pragma omp parallel for schedule(static)
-      for (int j = 0; j < num_data_; ++j) {
+      for (data_size_t j = 0; j < num_data_; ++j) {
         for (int i = k; i < dword_features_; ++i) {
           // fill this empty feature with some "random" value
           host4[j].s[i] = (uint8_t)j;
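
The commit makes one change, repeated in every hunk: loop indices that run up to num_data_ are declared data_size_t instead of int, so the induction variable matches the type of its bound. To the best of my knowledge data_size_t is a typedef for int32_t in include/LightGBM/meta.h, so this is about type consistency rather than width: it avoids implicit signed conversions in the comparison and keeps the loops correct if the typedef is ever widened. A minimal before/after sketch under that assumption:

    #include <cstdint>

    typedef int32_t data_size_t;  // LightGBM's row-index type (assumed, from meta.h)

    void fill(uint8_t* host, data_size_t num_data) {
      // Before: for (int j = 0; j < num_data; ++j)  -- compiles today, but the
      // index type silently diverges from the count type if data_size_t changes.
      for (data_size_t j = 0; j < num_data; ++j) {
        host[j] = (uint8_t)j;  // same pattern as the leftover-feature fill above
      }
    }
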