|
#ifndef PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
#define PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h

#include <type_traits>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
|
6 | 5 |
|
7 | 6 | namespace ALPAKA_ACCELERATOR_NAMESPACE {
|
8 | 7 |
|
9 | 8 | namespace cms::alpakatools{
|
10 | 9 | namespace warp {
|
11 | 10 |
|
12 |
| - template <typename TAcc> |
13 |
| - ALPAKA_FN_HOST_ACC inline void syncWarpThreads_mask(TAcc const& acc, unsigned mask) { |
| 11 | + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 12 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void syncWarpThreads_mask(TAcc const& acc, unsigned mask) { |
| 13 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
14 | 14 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
15 | 15 | // Alpaka CUDA backend
|
16 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
17 |
| - __syncwarp(mask); // Synchronize all threads within a warp |
18 |
| - } |
| 16 | + __syncwarp(mask); // Synchronize all threads within a subset of lanes in the warp |
19 | 17 | #endif
|
20 | 18 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
21 | 19 | // Alpaka HIP backend
|
22 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
23 |
| - __builtin_amdgcn_wave_barrier(); |
24 |
| - } |
| 20 | + __builtin_amdgcn_wave_barrier(); |
25 | 21 | #endif
|
| 22 | +#endif |
26 | 23 | // No-op for CPU accelerators
|
27 | 24 | }
|
28 | 25 |
|
29 |
| - template <typename TAcc> |
30 |
| - ALPAKA_FN_HOST_ACC inline unsigned ballot_mask(TAcc const& acc, unsigned mask, int pred ) { |
| 26 | + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 27 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned ballot_mask(TAcc const& acc, unsigned mask, int pred ) { |
31 | 28 | unsigned res{0};
|
| 29 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
32 | 30 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
33 | 31 | // Alpaka CUDA backend
|
34 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
35 |
| - res = __ballot_sync(mask, pred); // Synchronize all threads within a warp |
36 |
| - } |
| 32 | + res = __ballot_sync(mask, pred); // Synchronize all threads within a warp |
37 | 33 | #endif
|
38 | 34 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
39 | 35 | // Alpaka HIP backend
|
40 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
41 |
| - // HIP equivalent for warp ballot |
42 |
| - } |
| 36 | + // HIP equivalent for warp ballot |
43 | 37 | #endif
|
| 38 | +#endif |
44 | 39 | return res;
|
45 | 40 | }
|
46 | 41 |
|
47 |
| - template <typename TAcc, typename T> |
48 |
| - ALPAKA_FN_HOST_ACC inline T shfl_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) { |
| 42 | + template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 43 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) { |
49 | 44 | T res{};
|
| 45 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
50 | 46 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
51 | 47 | // Alpaka CUDA backend
|
52 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
53 |
| - res = __shfl_sync(mask, var, srcLane, width); // Synchronize all threads within a warp |
54 |
| - } |
| 48 | + res = __shfl_sync(mask, var, srcLane, width); // Synchronize all threads within a warp |
55 | 49 | #endif
|
56 | 50 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
57 | 51 | // Alpaka HIP backend
|
58 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
59 |
| - // HIP equivalent for warp __shfl_down_sync |
60 |
| - } |
| 52 | + // HIP equivalent for warp __shfl_down_sync |
61 | 53 | #endif
|
| 54 | +#endif |
62 | 55 | return res;
|
63 | 56 | }
|
64 | 57 |
|
65 |
| - template <typename TAcc, typename T> |
66 |
| - ALPAKA_FN_HOST_ACC inline T shfl_down_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) { |
| 58 | + template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 59 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_down_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) { |
67 | 60 | T res{};
|
| 61 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
68 | 62 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
69 | 63 | // Alpaka CUDA backend
|
70 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
71 |
| - res = __shfl_down_sync(mask, var, srcLane, width); // Synchronize all threads within a warp |
72 |
| - } |
| 64 | + res = __shfl_down_sync(mask, var, srcLane, width); // Synchronize all threads within a warp |
73 | 65 | #endif
|
74 | 66 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
75 | 67 | // Alpaka HIP backend
|
76 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
77 |
| - // HIP equivalent for warp __shfl_down_sync |
78 |
| - } |
| 68 | + // HIP equivalent for warp __shfl_down_sync |
79 | 69 | #endif
|
| 70 | +#endif |
80 | 71 | return res;
|
81 | 72 | }
|
82 | 73 |
|
83 |
| - template <typename TAcc, typename T> |
84 |
| - ALPAKA_FN_HOST_ACC inline T shfl_up_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) { |
| 74 | + template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 75 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_up_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) { |
85 | 76 | T res{};
|
| 77 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
86 | 78 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
87 | 79 | // Alpaka CUDA backend
|
88 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
89 |
| - res = __shfl_up_sync(mask, var, srcLane, width); // Synchronize all threads within a warp |
90 |
| - } |
| 80 | + res = __shfl_up_sync(mask, var, srcLane, width); // Synchronize all threads within a warp |
91 | 81 | #endif
|
92 | 82 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
93 | 83 | // Alpaka HIP backend
|
94 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
95 |
| - // HIP equivalent for warp __shfl_up_sync |
96 |
| - } |
| 84 | + // HIP equivalent for warp __shfl_up_sync |
97 | 85 | #endif
|
| 86 | +#endif |
98 | 87 | return res;
|
99 | 88 | }
|
100 | 89 |
|
101 |
| - template <typename TAcc, typename T> |
102 |
| - ALPAKA_FN_HOST_ACC inline T match_any_mask(TAcc const& acc, unsigned mask, T var) { |
| 90 | + template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 91 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T match_any_mask(TAcc const& acc, unsigned mask, T var) { |
103 | 92 | T res{};
|
| 93 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
104 | 94 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
105 | 95 | // Alpaka CUDA backend
|
106 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
107 |
| - res = __match_any_sync(mask, var); // Synchronize all threads within a warp |
108 |
| - } |
| 96 | +#if __CUDA_ARCH__ >= 700 |
| 97 | + res = __match_any_sync(mask, var); // Synchronize all threads within a warp |
| 98 | +#else |
| 99 | + // old version |
| 100 | +#endif |
109 | 101 | #endif
|
110 | 102 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
111 | 103 | // Alpaka HIP backend
|
112 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
113 |
| - // HIP equivalent for warp __match_any_sync |
114 |
| - } |
| 104 | + // HIP equivalent for warp __match_any_sync |
115 | 105 | #endif
|
| 106 | +#endif |
116 | 107 | return res;
|
117 | 108 | }
|
118 | 109 |
|
119 | 110 | } // end of warp exp
|
120 | 111 |
|
121 | 112 | // reverse the bit order of a (32-bit) unsigned integer.
|
122 |
| - template <typename TAcc> |
123 |
| - ALPAKA_FN_HOST_ACC inline unsigned brev(TAcc const& acc, unsigned mask) { |
| 113 | + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 114 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned brev(TAcc const& acc, unsigned mask) { |
124 | 115 | unsigned res{0};
|
| 116 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
125 | 117 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
126 | 118 | // Alpaka CUDA backend
|
127 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
128 |
| - res = __brev(mask); |
129 |
| - } |
| 119 | + res = __brev(mask); |
130 | 120 | #endif
|
131 | 121 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
132 | 122 | // Alpaka HIP backend
|
133 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
134 |
| - |
135 |
| - } |
136 | 123 | #endif
|
| 124 | +#endif |
137 | 125 | return res;
|
138 | 126 | }
|
139 | 127 |
|
140 | 128 | // count the number of leading zeros in a 32-bit unsigned integer
|
141 |
| - template <typename TAcc> |
142 |
| - ALPAKA_FN_HOST_ACC inline unsigned clz(TAcc const& acc, unsigned mask) { |
| 129 | + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> |
| 130 | + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned clz(TAcc const& acc, unsigned mask) { |
143 | 131 | unsigned res{0};
|
| 132 | +#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__) |
144 | 133 | #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
|
145 | 134 | // Alpaka CUDA backend
|
146 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
147 |
| - res = __clz(mask); |
148 |
| - } |
| 135 | + res = __clz(mask); |
149 | 136 | #endif
|
150 | 137 | #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
|
151 | 138 | // Alpaka HIP backend
|
152 |
| - if constexpr (alpaka::isAccelerator<TAcc>::value) { |
153 |
| - |
154 |
| - } |
155 | 139 | #endif
|
| 140 | +#endif |
156 | 141 | return res;
|
157 | 142 | }
|
158 | 143 |
|
159 |
| - }// end of alpaka |
| 144 | + }// end of alpakatools |
160 | 145 | } // end of alpaka namespace
|
161 | 146 | #endif
|
0 commit comments