Skip to content

Commit 4f67aa1

Browse files
committed
relocated files, added missing sm_6x intrinsics
1 parent 953a8ae commit 4f67aa1

File tree

7 files changed

+350
-9
lines changed

7 files changed

+350
-9
lines changed

RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterECLCC.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringEdgeVarsDeviceCollection.h"
88
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringVarsDeviceCollection.h"
99

10-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
10+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
1111

1212
/*
1313
ECL-CC code: ECL-CC is a connected components graph algorithm. The CUDA
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#ifndef PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
2+
#define PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
3+
4+
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
5+
6+
7+
namespace cms::alpakatools{
8+
namespace warp {
9+
10+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
11+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void syncWarpThreads_mask(TAcc const& acc, unsigned mask) {
12+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
13+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
14+
// Alpaka CUDA backend
15+
__syncwarp(mask); // Synchronize all threads within a subset of lanes in the warp
16+
#endif
17+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
18+
// Alpaka HIP backend
19+
__builtin_amdgcn_wave_barrier();
20+
#endif
21+
#endif
22+
// No-op for CPU accelerators
23+
}
24+
25+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
26+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned ballot_mask(TAcc const& acc, unsigned mask, int pred ) {
27+
unsigned res{0};
28+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
29+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
30+
// Alpaka CUDA backend
31+
res = __ballot_sync(mask, pred); // Synchronize all threads within a warp
32+
#endif
33+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
34+
// Alpaka HIP backend
35+
// HIP equivalent for warp ballot
36+
#endif
37+
#endif
38+
return res;
39+
}
40+
41+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
42+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
43+
T res{};
44+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
45+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
46+
// Alpaka CUDA backend
47+
res = __shfl_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
48+
#endif
49+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
50+
// Alpaka HIP backend
51+
// HIP equivalent for warp __shfl_down_sync
52+
#endif
53+
#endif
54+
return res;
55+
}
56+
57+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
58+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_down_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
59+
T res{};
60+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
61+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
62+
// Alpaka CUDA backend
63+
res = __shfl_down_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
64+
#endif
65+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
66+
// Alpaka HIP backend
67+
// HIP equivalent for warp __shfl_down_sync
68+
#endif
69+
#endif
70+
return res;
71+
}
72+
73+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
74+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T shfl_up_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
75+
T res{};
76+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
77+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
78+
// Alpaka CUDA backend
79+
res = __shfl_up_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
80+
#endif
81+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
82+
// Alpaka HIP backend
83+
// HIP equivalent for warp __shfl_up_sync
84+
#endif
85+
#endif
86+
return res;
87+
}
88+
89+
template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
90+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T match_any_mask(TAcc const& acc, unsigned mask, T val) {
91+
T res{};
92+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
93+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
94+
// Alpaka CUDA backend
95+
#if __CUDA_ARCH__ >= 700
96+
res = __match_any_sync(mask, val); // Synchronize all threads within a warp
97+
#else
98+
const unsigned int w_extent = alpaka::warp::getSize(acc);
99+
unsigned int match = 0;
100+
for (int iter_lane_idx = 0; iter_lane_idx < w_extent; ++iter_lane_idx) {
101+
T iter_val = __shfl_sync(mask, val, iter_lane_idx, w_extent);
102+
const unsigned int iter_lane_mask = 1 << iter_lane_idx;
103+
if (iter_val == val) match |= iter_lane_mask;
104+
}
105+
res = match & mask;
106+
107+
__syncwarp(mask);
108+
#endif
109+
#endif
110+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
111+
// Alpaka HIP backend
112+
// HIP equivalent for warp __match_any_sync
113+
#endif
114+
#endif
115+
return res;
116+
}
117+
118+
} // end of warp exp
119+
120+
// reverse the bit order of a (32-bit) unsigned integer.
121+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
122+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned brev(TAcc const& acc, unsigned mask) {
123+
unsigned res{0};
124+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
125+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
126+
// Alpaka CUDA backend
127+
res = __brev(mask);
128+
#endif
129+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
130+
// Alpaka HIP backend
131+
#endif
132+
#endif
133+
return res;
134+
}
135+
136+
// count the number of leading zeros in a 32-bit unsigned integer
137+
template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
138+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned clz(TAcc const& acc, unsigned mask) {
139+
unsigned res{0};
140+
#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
141+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
142+
// Alpaka CUDA backend
143+
res = __clz(mask);
144+
#endif
145+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
146+
// Alpaka HIP backend
147+
#endif
148+
#endif
149+
return res;
150+
}
151+
152+
}// end of alpakatools
153+
#endif
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#ifndef PFClusterProducer_plugins_alpaka_PFMultiDepthClusterizerHelper_h
2+
#define PFClusterProducer_plugins_alpaka_PFMultiDepthClusterizerHelper_h
3+
4+
/**
5+
* @file PFMultiDepthClusterizerHelper.h
6+
* @brief Warp-level utility functions for particle flow multi-depth clustering.
7+
*
8+
* This header provides basic warp-synchronous operations used in clustering algorithms,
9+
* including bitwise manipulations (least/most significant set bits) and masked
10+
* warp-exclusive sum computations.
11+
*/
12+
13+
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
14+
15+
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
16+
17+
namespace ALPAKA_ACCELERATOR_NAMESPACE {
18+
19+
using namespace cms::alpakatools;
20+
21+
/**
22+
* @brief Returns the position of the least significant set bit in a mask.
23+
*
24+
* @tparam TAcc Alpaka accelerator type.
25+
*
26+
* @param acc Alpaka accelerator instance.
27+
* @param x Input bitmask.
28+
*
29+
* @return Index of least significant 1 bit (0-based). (or -1 if x == 0).
30+
*/
31+
template< typename TAcc >
32+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int get_ls1b_idx(TAcc const& acc, const int x) {
33+
const int pos = alpaka::ffs(acc, x);
34+
return static_cast<unsigned int>(pos - 1);
35+
}
36+
37+
/**
38+
* @brief Clears the least significant set bit in a mask.
39+
*
40+
* @tparam TAcc Alpaka accelerator type.
41+
*
42+
* @param acc Alpaka accelerator instance.
43+
* @param x Input bitmask.
44+
*
45+
* @return Bitmask with least significant 1 bit cleared.
46+
*/
47+
48+
template< typename TAcc >
49+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int erase_ls1b(TAcc const& acc, const unsigned int x) {
50+
return (x & (x-1));
51+
}
52+
53+
/**
54+
* @brief Returns the position of the most significant set bit in a mask.
55+
*
56+
* @tparam TAcc Alpaka accelerator type.
57+
*
58+
* @param acc Alpaka accelerator instance.
59+
* @param x Input bitmask.
60+
*
61+
* @return Index of most significant 1 bit (0-based). (or -1 if x == 0)
62+
*/
63+
64+
template< typename TAcc >
65+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int get_ms1b_idx(TAcc const& acc, const unsigned int x) {
66+
constexpr unsigned int size = sizeof(unsigned int)-1;
67+
const int pos = size - cms::alpakatools::clz(acc, x);
68+
return pos - 1;
69+
}
70+
71+
/**
72+
* @brief Performs warp-level exclusive prefix sum under a custom lane mask.
73+
*
74+
* @tparam TAcc Alpaka accelerator type.
75+
* @tparam accum If true, broadcast total accumulated value to lowest active lane.
76+
*
77+
* @param acc Alpaka accelerator instance.
78+
* @param mask Active lane mask.
79+
* @param val Value to include in the prefix sum.
80+
* @param lane_idx Current thread's lane index.
81+
*
82+
* @return Exclusive prefix sum value for the current lane.
83+
*/
84+
85+
template <typename TAcc, bool all = true, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
86+
ALPAKA_FN_ACC ALPAKA_FN_INLINE unsigned int warp_exclusive_sum(TAcc const& acc, const unsigned int mask, unsigned int val, const unsigned int lane_idx) {
87+
if ( mask == 0x0 ) return 0;
88+
89+
const unsigned int w_extent = alpaka::warp::getSize(acc);
90+
//
91+
unsigned int local_offset = 0;
92+
//
93+
CMS_UNROLL_LOOP
94+
for (unsigned int j = 1; j < w_extent; j *= 2) {
95+
const auto n = warp::shfl_up_mask(acc, mask, val, j, w_extent);
96+
if (lane_idx >= j) local_offset += n;
97+
}
98+
//
99+
warp::syncWarpThreads_mask(acc, mask);
100+
101+
if constexpr (!all) {
102+
return local_offset;
103+
} else {
104+
// Compute the lowest and the highest valid lane index in the mask:
105+
const unsigned low_lane_idx = get_ls1b_idx(acc, mask);
106+
const unsigned high_lane_idx = get_ms1b_idx(acc, mask);
107+
108+
// send last lane value (total tile offset) to lane idx = low_lane_idx:
109+
const unsigned active_mask = 1 | (1 << high_lane_idx);
110+
const unsigned x = warp::shfl_mask(acc, active_mask, local_offset + val, high_lane_idx, w_extent);
111+
//
112+
if (lane_idx == low_lane_idx) local_offset = x;
113+
114+
warp::syncWarpThreads_mask(acc, mask);
115+
}
116+
return local_offset;
117+
}
118+
/**
119+
* @brief Returns logical index for a given physical lane index based on custom lane mask.
120+
*
121+
* @tparam TAcc Alpaka accelerator type.
122+
*
123+
* @param acc Alpaka accelerator instance.
124+
* @param mask Input bitmask.
125+
* @param lane_idx imput phys. lane index
126+
*
127+
* @return Index of the lane in the mask
128+
*/
129+
130+
template< typename TAcc >
131+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int get_logical_lane_idx(TAcc const& acc, const unsigned int mask, const unsigned int lane_idx) {
132+
const auto lane_mask = mask & ((1 << lane_idx) - 1);
133+
return alpaka::popcount(acc, lane_mask); // Count 1s below current lane
134+
}
135+
136+
/**
137+
* @brief Returns logical index for a given physical lane index based on custom lane mask.
138+
*
139+
* @tparam TAcc Alpaka accelerator type.
140+
*
141+
* @param acc Alpaka accelerator instance.
142+
* @param mask Input bitmask.
143+
* @param lane_idx imput phys. lane index
144+
*
145+
* @return Index of the lane in the mask
146+
*/
147+
148+
template< typename TAcc >
149+
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE unsigned int get_high_neighbor_logical_lane_idx(TAcc const& acc, const unsigned int active_mask, const unsigned int custom_mask, const unsigned int lane_idx) {
150+
// Zero out all bits <= lid
151+
const auto zeroed_lowbit_mask = custom_mask & (active_mask << (lane_idx+1));
152+
// Just in case if the mask is exactly zero (may happen!):
153+
return zeroed_lowbit_mask == 0x0 ? lane_idx : get_ls1b_idx(acc, zeroed_lowbit_mask);
154+
}
155+
156+
/**
157+
* @brief generic warp reduction
158+
*
159+
* @tparam TAcc Alpaka accelerator type.
160+
*
161+
* @param acc Alpaka accelerator instance.
162+
* @param mask Input bitmask.
163+
* @param in imput value to reduce
164+
* @param f reducer
165+
*
166+
* @return return reduced value (propagated to all lanes in the mask by default)
167+
*/
168+
169+
template< typename TAcc, typename reduce_t, typename reducer_t >
170+
ALPAKA_FN_ACC ALPAKA_FN_INLINE reduce_t warp_reduce(TAcc const& acc, unsigned int mask, reduce_t const in, const reducer_t f, bool all = true)
171+
{
172+
unsigned int const warpExtent = alpaka::warp::getSize(acc);
173+
//
174+
reduce_t result = static_cast<reduce_t>(0);
175+
176+
for (unsigned int offset = warpExtent / 2; offset > 0; offset /= 2) {
177+
result = f(result, warp::shfl_down_mask(acc, mask, result, offset, warpExtent));
178+
}
179+
180+
if (all) result = warp::shfl_mask(acc, mask, result, 0, warpExtent);
181+
182+
return result;
183+
}
184+
185+
}
186+
187+
#endif
188+

RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthConstructLinks.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringEdgeVarsDeviceCollection.h"
1111
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringVarsDeviceCollection.h"
12-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
13-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterizerHelper.h"
12+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
13+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterizerHelper.h"
1414

1515
/**
1616
* @brief Warp-based link construction kernel for Particle Flow (PF) multi-depth clustering.

RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthECLCCEpilogue.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
#include "HeterogeneousCore/AlpakaMath/interface/deltaPhi.h"
99
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringEdgeVarsDeviceCollection.h"
1010
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringVarsDeviceCollection.h"
11-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
12-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterizerHelper.h"
11+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
12+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterizerHelper.h"
1313

1414
/**
1515
* @file PFMultiDepthECLCCEpilogue.h

RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthECLCCPrologue.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
#include "HeterogeneousCore/AlpakaMath/interface/deltaPhi.h"
99
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringEdgeVarsDeviceCollection.h"
1010
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringVarsDeviceCollection.h"
11-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
12-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterizerHelper.h"
11+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
12+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterizerHelper.h"
1313

1414
/**
1515
* @brief Warp-based construction of adjacency graph for multi-depth particle flow clusters.

RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthShowerShape.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringEdgeVarsDeviceCollection.h"
1111
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusteringVarsDeviceCollection.h"
12-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
13-
#include "RecoParticleFlow/PFClusterProducer/interface/alpaka/PFMultiDepthClusterizerHelper.h"
12+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterWarpIntrinsics.h"
13+
#include "RecoParticleFlow/PFClusterProducer/plugins/alpaka/PFMultiDepthClusterizerHelper.h"
1414

1515
namespace ALPAKA_ACCELERATOR_NAMESPACE {
1616

0 commit comments

Comments
 (0)