Skip to content

Commit 4776d74

Browse files
committed
PR version
1 parent 2630328 commit 4776d74

9 files changed

+366
-707
lines changed
Lines changed: 51 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,161 +1,146 @@
11
#ifndef PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
22
#define PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
33

4-
54
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
65

76
namespace ALPAKA_ACCELERATOR_NAMESPACE {
87

98
namespace cms::alpakatools{
109
namespace warp {
1110

12-
template <typename TAcc>
13-
ALPAKA_FN_HOST_ACC inline void syncWarpThreads_mask(TAcc const& acc, unsigned mask) {
11+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
12+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void syncWarpThreads_mask(TAcc const& acc, unsigned mask) {
13+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
1414
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
1515
// Alpaka CUDA backend
16-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
17-
__syncwarp(mask); // Synchronize all threads within a warp
18-
}
16+
__syncwarp(mask); // Synchronize all threads within a subset of lanes in the warp
1917
#endif
2018
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
2119
// Alpaka HIP backend
22-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
23-
__builtin_amdgcn_wave_barrier();
24-
}
20+
__builtin_amdgcn_wave_barrier();
2521
#endif
22+
#endif
2623
// No-op for CPU accelerators
2724
}
2825

29-
template <typename TAcc>
30-
ALPAKA_FN_HOST_ACC inline unsigned ballot_mask(TAcc const& acc, unsigned mask, int pred ) {
26+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
27+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned ballot_mask(TAcc const& acc, unsigned mask, int pred ) {
3128
unsigned res{0};
29+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
3230
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
3331
// Alpaka CUDA backend
34-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
35-
res = __ballot_sync(mask, pred); // Synchronize all threads within a warp
36-
}
32+
res = __ballot_sync(mask, pred); // Synchronize all threads within a warp
3733
#endif
3834
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
3935
// Alpaka HIP backend
40-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
41-
// HIP equivalent for warp ballot
42-
}
36+
// HIP equivalent for warp ballot
4337
#endif
38+
#endif
4439
return res;
4540
}
4641

47-
template <typename TAcc, typename T>
48-
ALPAKA_FN_HOST_ACC inline T shfl_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
42+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
43+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
4944
T res{};
45+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
5046
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
5147
// Alpaka CUDA backend
52-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
53-
res = __shfl_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
54-
}
48+
res = __shfl_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
5549
#endif
5650
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
5751
// Alpaka HIP backend
58-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
59-
// HIP equivalent for warp __shfl_down_sync
60-
}
52+
// TODO: HIP equivalent for warp __shfl_sync (nothing is implemented in this branch; res keeps its default value)
6153
#endif
54+
#endif
6255
return res;
6356
}
6457

65-
template <typename TAcc, typename T>
66-
ALPAKA_FN_HOST_ACC inline T shfl_down_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
58+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
59+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_down_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
6760
T res{};
61+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
6862
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
6963
// Alpaka CUDA backend
70-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
71-
res = __shfl_down_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
72-
}
64+
res = __shfl_down_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
7365
#endif
7466
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
7567
// Alpaka HIP backend
76-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
77-
// HIP equivalent for warp __shfl_down_sync
78-
}
68+
// HIP equivalent for warp __shfl_down_sync
7969
#endif
70+
#endif
8071
return res;
8172
}
8273

83-
template <typename TAcc, typename T>
84-
ALPAKA_FN_HOST_ACC inline T shfl_up_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
74+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
75+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_up_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
8576
T res{};
77+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
8678
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
8779
// Alpaka CUDA backend
88-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
89-
res = __shfl_up_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
90-
}
80+
res = __shfl_up_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
9181
#endif
9282
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
9383
// Alpaka HIP backend
94-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
95-
// HIP equivalent for warp __shfl_up_sync
96-
}
84+
// HIP equivalent for warp __shfl_up_sync
9785
#endif
86+
#endif
9887
return res;
9988
}
10089

101-
template <typename TAcc, typename T>
102-
ALPAKA_FN_HOST_ACC inline T match_any_mask(TAcc const& acc, unsigned mask, T var) {
90+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
91+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T match_any_mask(TAcc const& acc, unsigned mask, T var) {
10392
T res{};
93+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
10494
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
10595
// Alpaka CUDA backend
106-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
107-
res = __match_any_sync(mask, var); // Synchronize all threads within a warp
108-
}
96+
#if __CUDA_ARCH__ >= 700
97+
res = __match_any_sync(mask, var); // Synchronize all threads within a warp
98+
#else
99+
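// __CUDA_ARCH__ < 700: __match_any_sync is not used here and no fallback is implemented, so res keeps its default value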
100+
#endif
109101
#endif
110102
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
111103
// Alpaka HIP backend
112-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
113-
// HIP equivalent for warp __match_any_sync
114-
}
104+
// HIP equivalent for warp __match_any_sync
115105
#endif
106+
#endif
116107
return res;
117108
}
118109

119110
} // end of warp exp
120111

121112
// reverse the bit order of a (32-bit) unsigned integer.
122-
template <typename TAcc>
123-
ALPAKA_FN_HOST_ACC inline unsigned brev(TAcc const& acc, unsigned mask) {
113+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
114+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned brev(TAcc const& acc, unsigned mask) {
124115
unsigned res{0};
116+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
125117
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
126118
// Alpaka CUDA backend
127-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
128-
res = __brev(mask);
129-
}
119+
res = __brev(mask);
130120
#endif
131121
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
132122
// Alpaka HIP backend
133-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
134-
135-
}
136123
#endif
124+
#endif
137125
return res;
138126
}
139127

140128
// count the number of leading zeros in a 32-bit unsigned integer
141-
template <typename TAcc>
142-
ALPAKA_FN_HOST_ACC inline unsigned clz(TAcc const& acc, unsigned mask) {
129+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
130+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned clz(TAcc const& acc, unsigned mask) {
143131
unsigned res{0};
132+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
144133
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
145134
// Alpaka CUDA backend
146-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
147-
res = __clz(mask);
148-
}
135+
res = __clz(mask);
149136
#endif
150137
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
151138
// Alpaka HIP backend
152-
if constexpr (alpaka::isAccelerator<TAcc>::value) {
153-
154-
}
155139
#endif
140+
#endif
156141
return res;
157142
}
158143

159-
}// end of alpaka
144+
}// end of alpakatools
160145
} // end of alpaka namespace
161146
#endif

RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterizerHelper.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
* warp-exclusive sum computations.
1111
*/
1212

13+
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
14+
15+
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
1316

1417
namespace ALPAKA_ACCELERATOR_NAMESPACE {
1518

@@ -26,9 +29,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
2629
* @return Index of least significant 1 bit (0-based), or -1 if x == 0 (with an unsigned int return type that value wraps to UINT_MAX).
2730
*/
2831
template< typename TAcc >
29-
ALPAKA_FN_HOST_ACC inline int get_ls1b_idx(TAcc const& acc, const unsigned int x) {
30-
const int pos = static_cast<int>(alpaka::ffs(acc, x));
31-
return pos - 1;
32+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int get_ls1b_idx(TAcc const& acc, const int x) {
33+
const int pos = alpaka::ffs(acc, x);
34+
return static_cast<unsigned int>(pos - 1);
3235
}
3336

3437
/**
@@ -43,7 +46,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
4346
*/
4447

4548
template< typename TAcc >
46-
ALPAKA_FN_HOST_ACC inline unsigned int erase_ls1b(TAcc const& acc, const unsigned int x) {
49+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int erase_ls1b(TAcc const& acc, const unsigned int x) {
4750
return (x & (x-1));
4851
}
4952

@@ -59,7 +62,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
5962
*/
6063

6164
template< typename TAcc >
62-
ALPAKA_FN_HOST_ACC inline int get_ms1b_idx(TAcc const& acc, const unsigned int x) {
65+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int get_ms1b_idx(TAcc const& acc, const unsigned int x) {
6366
constexpr unsigned int size = sizeof(unsigned int)-1;
6467
const int pos = size - cms::alpakatools::clz(acc, x);
6568
return pos - 1;
@@ -80,15 +83,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
8083
*/
8184

8285
template <typename TAcc, bool accum = true, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
83-
ALPAKA_FN_ACC inline unsigned int warp_exclusive_sum(TAcc const& acc, const unsigned int mask, unsigned int val, const unsigned int lane_idx) {
86+
ALPAKA_FN_ACC ALPAKA_FN_INLINE unsigned int warp_exclusive_sum(TAcc const& acc, const unsigned int mask, unsigned int val, const unsigned int lane_idx) {
8487
if ( mask == 0x0 ) return 0;
8588

8689
const unsigned int w_extent = alpaka::warp::getSize(acc);
8790
//
8891
unsigned int local_offset = 0;
8992
//
9093
CMS_UNROLL_LOOP
91-
for (int j = 1; j < w_extent; j *= 2) {
94+
for (unsigned int j = 1; j < w_extent; j *= 2) {
9295
const auto n = warp::shfl_up_mask(acc, mask, val, j, w_extent);
9396
if (lane_idx >= j) local_offset += n;
9497
}
@@ -99,8 +102,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
99102
return local_offset;
100103
} else {
101104
// Compute the lowest and the highest valid lane index in the mask:
102-
const auto low_lane_idx = get_ls1b_idx(acc, mask);
103-
const auto high_lane_idx = get_ms1b_idx(acc, mask);
105+
const unsigned low_lane_idx = get_ls1b_idx(acc, mask);
106+
const unsigned high_lane_idx = get_ms1b_idx(acc, mask);
104107

105108
// send last lane value (total tile offset) to lane idx = low_lane_idx:
106109
const unsigned active_mask = 1 | (1 << high_lane_idx);

0 commit comments

Comments
 (0)