Skip to content

Commit 2630328

Browse files
committed
added PFMultiDepth* files, recent version (migrated from local Alpaka-mdpf-update4 branch)
1 parent 7a8aff4 commit 2630328

16 files changed

+2803
-0
lines changed

DataFormats/ParticleFlowReco/interface/PFClusterSoA.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ namespace reco {
1818
SOA_COLUMN(float, y),
1919
SOA_COLUMN(float, z),
2020
SOA_COLUMN(int, topoRHCount),
21+
//SOA_COLUMN(int, adjacencyList),//for multi-depth clusterizer
22+
//SOA_COLUMN(int, nDegree),//for multi-depth clusterizer
2123
SOA_SCALAR(int, nTopos),
2224
SOA_SCALAR(int, nSeeds),
2325
SOA_SCALAR(int, nRHFracs),
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#ifndef RecoParticleFlow_PFRecHitProducer_interface_PFMultiDepthClusteringEdgeVarsSoA_h
2+
#define RecoParticleFlow_PFRecHitProducer_interface_PFMultiDepthClusteringEdgeVarsSoA_h
3+
4+
#include "DataFormats/SoATemplate/interface/SoACommon.h"
5+
#include "DataFormats/SoATemplate/interface/SoALayout.h"
6+
#include "DataFormats/SoATemplate/interface/SoAView.h"
7+
8+
namespace reco {
9+
10+
// SoA layout with two integer columns; judging by the column names they presumably hold
// an adjacency index and an adjacency list for clusters (TODO confirm against the producer).
// NOTE(review): the per-column comments quote two different required lengths
// (nClusters+1 and 2*nClusters); confirm at the allocation site which size the collection is built with.
GENERATE_SOA_LAYOUT(PFMultiDepthClusteringEdgeVarsSoALayout,
11+
SOA_COLUMN(int, mdpf_adjacencyIndex), // needs nClusters+1 allocation
12+
SOA_COLUMN(int, mdpf_adjacencyList) // needs 2*nClusters allocation
13+
)
14+
15+
// Layout instantiated with its default template arguments.
using PFMultiDepthClusteringEdgeVarsSoA = PFMultiDepthClusteringEdgeVarsSoALayout<>;
16+
} // namespace reco
17+
18+
#endif
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#ifndef RecoParticleFlow_PFClusterProducer_interface_PFMultiDepthClusteringVarsSoA_h
2+
#define RecoParticleFlow_PFClusterProducer_interface_PFMultiDepthClusteringVarsSoA_h
3+
4+
#include "DataFormats/SoATemplate/interface/SoACommon.h"
5+
#include "DataFormats/SoATemplate/interface/SoALayout.h"
6+
#include "DataFormats/SoATemplate/interface/SoAView.h"
7+
8+
namespace reco {
9+
10+
// SoA layout of per-element variables for the multi-depth clustering step:
// thirteen columns (float/int/double) followed by two integer scalars.
GENERATE_SOA_LAYOUT(PFMultiDepthClusteringVarsSoALayout,
11+
SOA_COLUMN(float, depth),
12+
SOA_COLUMN(int, seedRHIdx),
13+
SOA_COLUMN(int, rhfracSize),
14+
SOA_COLUMN(int, rhfracOffset),
15+
// etaRMS2 / phiRMS2 are stored in double precision, unlike the float eta/phi columns below.
SOA_COLUMN(double, etaRMS2),
16+
SOA_COLUMN(double, phiRMS2),
17+
SOA_COLUMN(float, energy),
18+
SOA_COLUMN(float, eta),
19+
SOA_COLUMN(float, phi),
20+
SOA_COLUMN(int, mdpf_topoId),
21+
SOA_COLUMN(int, mdpf_component), //list of component contents (always start with component root id)
22+
SOA_COLUMN(int, mdpf_componentIndex),
23+
// NOTE(review): declared as int although the `energy` column above is float — confirm the intended type.
SOA_COLUMN(int, mdpf_componentEnergy),
24+
SOA_SCALAR(int, mdpf_nTopos),
25+
SOA_SCALAR(int, size)
26+
)
27+
28+
// Layout instantiated with its default template arguments.
using PFMultiDepthClusteringVarsSoA = PFMultiDepthClusteringVarsSoALayout<>;
29+
} // namespace reco
30+
31+
#endif
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#ifndef PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
2+
#define PFClusterProducer_plugins_alpaka_PFMultiDepthClusterWarpIntrinsics_h
3+
4+
5+
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
6+
7+
namespace ALPAKA_ACCELERATOR_NAMESPACE {
8+
9+
namespace cms::alpakatools{
10+
namespace warp {
11+
12+
template <typename TAcc>
13+
ALPAKA_FN_HOST_ACC inline void syncWarpThreads_mask(TAcc const& acc, unsigned mask) {
14+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
15+
// Alpaka CUDA backend
16+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
17+
__syncwarp(mask); // Synchronize all threads within a warp
18+
}
19+
#endif
20+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
21+
// Alpaka HIP backend
22+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
23+
__builtin_amdgcn_wave_barrier();
24+
}
25+
#endif
26+
// No-op for CPU accelerators
27+
}
28+
29+
template <typename TAcc>
30+
ALPAKA_FN_HOST_ACC inline unsigned ballot_mask(TAcc const& acc, unsigned mask, int pred ) {
31+
unsigned res{0};
32+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
33+
// Alpaka CUDA backend
34+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
35+
res = __ballot_sync(mask, pred); // Synchronize all threads within a warp
36+
}
37+
#endif
38+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
39+
// Alpaka HIP backend
40+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
41+
// HIP equivalent for warp ballot
42+
}
43+
#endif
44+
return res;
45+
}
46+
47+
template <typename TAcc, typename T>
48+
ALPAKA_FN_HOST_ACC inline T shfl_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
49+
T res{};
50+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
51+
// Alpaka CUDA backend
52+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
53+
res = __shfl_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
54+
}
55+
#endif
56+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
57+
// Alpaka HIP backend
58+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
59+
// HIP equivalent for warp __shfl_down_sync
60+
}
61+
#endif
62+
return res;
63+
}
64+
65+
template <typename TAcc, typename T>
66+
ALPAKA_FN_HOST_ACC inline T shfl_down_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
67+
T res{};
68+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
69+
// Alpaka CUDA backend
70+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
71+
res = __shfl_down_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
72+
}
73+
#endif
74+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
75+
// Alpaka HIP backend
76+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
77+
// HIP equivalent for warp __shfl_down_sync
78+
}
79+
#endif
80+
return res;
81+
}
82+
83+
template <typename TAcc, typename T>
84+
ALPAKA_FN_HOST_ACC inline T shfl_up_mask(TAcc const& acc, unsigned mask, T var, int srcLane, int width ) {
85+
T res{};
86+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
87+
// Alpaka CUDA backend
88+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
89+
res = __shfl_up_sync(mask, var, srcLane, width); // Synchronize all threads within a warp
90+
}
91+
#endif
92+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
93+
// Alpaka HIP backend
94+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
95+
// HIP equivalent for warp __shfl_up_sync
96+
}
97+
#endif
98+
return res;
99+
}
100+
101+
template <typename TAcc, typename T>
102+
ALPAKA_FN_HOST_ACC inline T match_any_mask(TAcc const& acc, unsigned mask, T var) {
103+
T res{};
104+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
105+
// Alpaka CUDA backend
106+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
107+
res = __match_any_sync(mask, var); // Synchronize all threads within a warp
108+
}
109+
#endif
110+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
111+
// Alpaka HIP backend
112+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
113+
// HIP equivalent for warp __match_any_sync
114+
}
115+
#endif
116+
return res;
117+
}
118+
119+
}  // namespace warp
120+
121+
// reverse the bit order of a (32-bit) unsigned integer.
122+
template <typename TAcc>
123+
ALPAKA_FN_HOST_ACC inline unsigned brev(TAcc const& acc, unsigned mask) {
124+
unsigned res{0};
125+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
126+
// Alpaka CUDA backend
127+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
128+
res = __brev(mask);
129+
}
130+
#endif
131+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
132+
// Alpaka HIP backend
133+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
134+
135+
}
136+
#endif
137+
return res;
138+
}
139+
140+
// count the number of leading zeros in a 32-bit unsigned integer
141+
template <typename TAcc>
142+
ALPAKA_FN_HOST_ACC inline unsigned clz(TAcc const& acc, unsigned mask) {
143+
unsigned res{0};
144+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
145+
// Alpaka CUDA backend
146+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
147+
res = __clz(mask);
148+
}
149+
#endif
150+
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
151+
// Alpaka HIP backend
152+
if constexpr (alpaka::isAccelerator<TAcc>::value) {
153+
154+
}
155+
#endif
156+
return res;
157+
}
158+
159+
}  // namespace cms::alpakatools
160+
}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
161+
#endif
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef RecoParticleFlow_PFRecHitProducer_interface_alpaka_PFMultiDepthClusteringEdgeVarsDeviceCollection_h
2+
#define RecoParticleFlow_PFRecHitProducer_interface_alpaka_PFMultiDepthClusteringEdgeVarsDeviceCollection_h
3+
4+
#include "DataFormats/Portable/interface/alpaka/PortableCollection.h"
5+
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
6+
7+
#include "RecoParticleFlow/PFClusterProducer/interface/PFMultiDepthClusteringEdgeVarsSoA.h"
8+
9+
namespace ALPAKA_ACCELERATOR_NAMESPACE::reco {
10+
11+
// PortableCollection alias over ::reco::PFMultiDepthClusteringEdgeVarsSoA,
// declared inside ALPAKA_ACCELERATOR_NAMESPACE::reco.
using PFMultiDepthClusteringEdgeVarsDeviceCollection = PortableCollection<::reco::PFMultiDepthClusteringEdgeVarsSoA>;
12+
// needs 2*nClusters allocation
13+
14+
} // namespace ALPAKA_ACCELERATOR_NAMESPACE::reco
15+
16+
#endif
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#ifndef RecoParticleFlow_PFRecHitProducer_interface_alpaka_PFMultiDepthClusteringVarsDeviceCollection_h
2+
#define RecoParticleFlow_PFRecHitProducer_interface_alpaka_PFMultiDepthClusteringVarsDeviceCollection_h
3+
4+
#include "DataFormats/Portable/interface/alpaka/PortableCollection.h"
5+
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
6+
7+
#include "RecoParticleFlow/PFClusterProducer/interface/PFMultiDepthClusteringVarsSoA.h"
8+
9+
namespace ALPAKA_ACCELERATOR_NAMESPACE::reco {
10+
11+
// PortableCollection alias over ::reco::PFMultiDepthClusteringVarsSoA,
// declared inside ALPAKA_ACCELERATOR_NAMESPACE::reco.
using PFMultiDepthClusteringVarsDeviceCollection = PortableCollection<::reco::PFMultiDepthClusteringVarsSoA>;
12+
} // namespace ALPAKA_ACCELERATOR_NAMESPACE::reco
13+
14+
#endif
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#ifndef RecoParticleFlow_PFRecHitProducer_interface_alpaka_PFMultiDepthClusteringVarsHostCollection_h
2+
#define RecoParticleFlow_PFRecHitProducer_interface_alpaka_PFMultiDepthClusteringVarsHostCollection_h
3+
4+
#include "RecoParticleFlow/PFClusterProducer/interface/PFMultiDepthClusteringVarsSoA.h"
5+
#include "DataFormats/Portable/interface/PortableHostCollection.h"
6+
7+
namespace reco {
8+
// PortableHostCollection alias over ::reco::PFMultiDepthClusteringVarsSoA (host-side counterpart).
using PFMultiDepthClusteringVarsHostCollection = PortableHostCollection<::reco::PFMultiDepthClusteringVarsSoA>;
9+
} // namespace reco
10+
11+
#endif
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#ifndef PFClusterProducer_plugins_alpaka_PFMultiDepthClusterizerHelper_h
2+
#define PFClusterProducer_plugins_alpaka_PFMultiDepthClusterizerHelper_h
3+
4+
/**
5+
* @file PFMultiDepthClusterizerHelper.h
6+
* @brief Warp-level utility functions for particle flow multi-depth clustering.
7+
*
8+
* This header provides basic warp-synchronous operations used in clustering algorithms,
9+
* including bitwise manipulations (least/most significant set bits) and masked
10+
* warp-exclusive sum computations.
11+
*/
12+
13+
14+
namespace ALPAKA_ACCELERATOR_NAMESPACE {
15+
16+
using namespace cms::alpakatools;
17+
18+
/**
19+
* @brief Returns the position of the least significant set bit in a mask.
20+
*
21+
* @tparam TAcc Alpaka accelerator type.
22+
*
23+
* @param acc Alpaka accelerator instance.
24+
* @param x Input bitmask.
25+
*
26+
* @return Index of least significant 1 bit (0-based). (or -1 if x == 0).
27+
*/
28+
template< typename TAcc >
29+
ALPAKA_FN_HOST_ACC inline int get_ls1b_idx(TAcc const& acc, const unsigned int x) {
30+
const int pos = static_cast<int>(alpaka::ffs(acc, x));
31+
return pos - 1;
32+
}
33+
34+
/**
35+
* @brief Clears the least significant set bit in a mask.
36+
*
37+
* @tparam TAcc Alpaka accelerator type.
38+
*
39+
* @param acc Alpaka accelerator instance.
40+
* @param x Input bitmask.
41+
*
42+
* @return Bitmask with least significant 1 bit cleared.
43+
*/
44+
45+
template< typename TAcc >
46+
ALPAKA_FN_HOST_ACC inline unsigned int erase_ls1b(TAcc const& acc, const unsigned int x) {
47+
return (x & (x-1));
48+
}
49+
50+
/**
51+
* @brief Returns the position of the most significant set bit in a mask.
52+
*
53+
* @tparam TAcc Alpaka accelerator type.
54+
*
55+
* @param acc Alpaka accelerator instance.
56+
* @param x Input bitmask.
57+
*
58+
* @return Index of most significant 1 bit (0-based). (or -1 if x == 0)
59+
*/
60+
61+
template< typename TAcc >
62+
ALPAKA_FN_HOST_ACC inline int get_ms1b_idx(TAcc const& acc, const unsigned int x) {
63+
constexpr unsigned int size = sizeof(unsigned int)-1;
64+
const int pos = size - cms::alpakatools::clz(acc, x);
65+
return pos - 1;
66+
}
67+
68+
/**
69+
* @brief Performs warp-level exclusive prefix sum under a custom lane mask.
70+
*
71+
* @tparam TAcc Alpaka accelerator type.
72+
* @tparam accum If true, broadcast total accumulated value to lowest active lane.
73+
*
74+
* @param acc Alpaka accelerator instance.
75+
* @param mask Active lane mask.
76+
* @param val Value to include in the prefix sum.
77+
* @param lane_idx Current thread's lane index.
78+
*
79+
* @return Exclusive prefix sum value for the current lane.
80+
*/
81+
82+
template <typename TAcc, bool accum = true, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
83+
ALPAKA_FN_ACC inline unsigned int warp_exclusive_sum(TAcc const& acc, const unsigned int mask, unsigned int val, const unsigned int lane_idx) {
84+
if ( mask == 0x0 ) return 0;
85+
86+
const unsigned int w_extent = alpaka::warp::getSize(acc);
87+
//
88+
unsigned int local_offset = 0;
89+
//
90+
CMS_UNROLL_LOOP
91+
for (int j = 1; j < w_extent; j *= 2) {
92+
const auto n = warp::shfl_up_mask(acc, mask, val, j, w_extent);
93+
if (lane_idx >= j) local_offset += n;
94+
}
95+
//
96+
warp::syncWarpThreads_mask(acc, mask);
97+
98+
if constexpr (!accum) {
99+
return local_offset;
100+
} else {
101+
// Compute the lowest and the highest valid lane index in the mask:
102+
const auto low_lane_idx = get_ls1b_idx(acc, mask);
103+
const auto high_lane_idx = get_ms1b_idx(acc, mask);
104+
105+
// send last lane value (total tile offset) to lane idx = low_lane_idx:
106+
const unsigned active_mask = 1 | (1 << high_lane_idx);
107+
const unsigned x = warp::shfl_mask(acc, active_mask, local_offset + val, high_lane_idx, w_extent);
108+
//
109+
if (lane_idx == low_lane_idx) local_offset = x;
110+
111+
warp::syncWarpThreads_mask(acc, mask);
112+
}
113+
return local_offset;
114+
}
115+
116+
}
117+
118+
#endif
119+

0 commit comments

Comments
 (0)