GPUCACell.h
#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
//
// Author: Felice Pantaleo, CERN
//
// #define ONLY_TRIPLETS_IN_HOLE
#include <cmath>
#include <cstdint>
#include <limits>

#include <cuda_runtime.h>
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
#include "CAConstants.h"
class GPUCACell {
public:
using PtrAsInt = unsigned long long;
static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit;
static constexpr auto invalidHitId = std::numeric_limits<caConstants::hindex_type>::max();
using OuterHitOfCellContainer = caConstants::OuterHitOfCellContainer;
using OuterHitOfCell = caConstants::OuterHitOfCell;
using CellNeighbors = caConstants::CellNeighbors;
using CellTracks = caConstants::CellTracks;
using CellNeighborsVector = caConstants::CellNeighborsVector;
using CellTracksVector = caConstants::CellTracksVector;
using Hits = TrackingRecHit2DSOAView;
using hindex_type = Hits::hindex_type;
using TmpTuple = cms::cuda::VecArray<uint32_t, 6>;
using HitContainer = pixelTrack::HitContainer;
using Quality = pixelTrack::Quality;
static constexpr auto bad = pixelTrack::Quality::bad;
enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 };
GPUCACell() = default;
__device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors,
CellTracksVector& cellTracks,
Hits const& hh,
int layerPairId,
hindex_type innerHitId,
hindex_type outerHitId) {
theInnerHitId = innerHitId;
theOuterHitId = outerHitId;
theLayerPairId_ = layerPairId;
theStatus_ = 0;
theFishboneId = invalidHitId;
// optimization that depends on access pattern
theInnerZ = hh.zGlobal(innerHitId);
theInnerR = hh.rGlobal(innerHitId);
// link to default empty
theOuterNeighbors = &cellNeighbors[0];
theTracks = &cellTracks[0];
assert(outerNeighbors().empty());
assert(tracks().empty());
}
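// addOuterNeighbor and addTrack share the same lazy-allocation pattern: every cell is
// initialized to point at the shared empty container in slot 0; on the first insertion a
// private container is claimed via extend(), and an atomicCAS installs it only if the
// pointer still refers to the shared default (if another thread won the race, the claimed
// slot is simply wasted). push_back() then returns the stored index, or a negative value
// on overflow; -1 is returned if no container could be claimed at all.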
__device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) {
// use smart cache
if (outerNeighbors().empty()) {
auto i = cellNeighbors.extend(); // maybe wasted....
if (i > 0) {
cellNeighbors[i].reset();
__threadfence();
#ifdef __CUDACC__
auto zero = (PtrAsInt)(&cellNeighbors[0]);
atomicCAS((PtrAsInt*)(&theOuterNeighbors),
zero,
(PtrAsInt)(&cellNeighbors[i])); // if the CAS fails we cannot give "i" back...
#else
theOuterNeighbors = &cellNeighbors[i];
#endif
} else
return -1;
}
__threadfence();
return outerNeighbors().push_back(t);
}
__device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) {
if (tracks().empty()) {
auto i = cellTracks.extend(); // maybe wasted....
if (i > 0) {
cellTracks[i].reset();
__threadfence();
#ifdef __CUDACC__
auto zero = (PtrAsInt)(&cellTracks[0]);
atomicCAS((PtrAsInt*)(&theTracks), zero, (PtrAsInt)(&cellTracks[i])); // if the CAS fails we cannot give "i" back...
#else
theTracks = &cellTracks[i];
#endif
} else
return -1;
}
__threadfence();
return tracks().push_back(t);
}
__device__ __forceinline__ CellTracks& tracks() { return *theTracks; }
__device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; }
__device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; }
__device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; }
__device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); }
__device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); }
__device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); }
__device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); }
__device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; }
// { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; }
__device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); }
__device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; }
// { return hh.rGlobal(theInnerHitId); } // { return theInnerR; }
__device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); }
__device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); }
__device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); }
__device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); }
__device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); }
constexpr unsigned int inner_hit_id() const { return theInnerHitId; }
constexpr unsigned int outer_hit_id() const { return theOuterHitId; }
__device__ void print_cell() const {
printf("printing cell: on layerPair: %d, innerHitId: %d, outerHitId: %d \n",
theLayerPairId_,
theInnerHitId,
theOuterHitId);
}
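// Decide whether otherCell (whose outer hit is this cell's inner hit) can be connected to
// this cell: the r-z alignment cut uses the barrel or forward theta cut depending on the
// detector of the shared hit, and the transverse dca cut is chosen according to whether
// the innermost hit of the would-be triplet lies on BPIX1.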
__device__ bool check_alignment(Hits const& hh,
GPUCACell const& otherCell,
const float ptmin,
const float hardCurvCut,
const float caThetaCutBarrel,
const float caThetaCutForward,
const float dcaCutInnerTriplet,
const float dcaCutOuterTriplet) const {
// detIndex of the layerStart for the Phase1 Pixel Detector:
// [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID]
// [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856]
auto ri = inner_r(hh);
auto zi = inner_z(hh);
auto ro = outer_r(hh);
auto zo = outer_z(hh);
auto r1 = otherCell.inner_r(hh);
auto z1 = otherCell.inner_z(hh);
auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex;
bool aligned = areAlignedRZ(r1,
z1,
ri,
zi,
ro,
zo,
ptmin,
isBarrel ? caThetaCutBarrel : caThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts
return (aligned && dcaCut(hh,
otherCell,
otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
: dcaCutOuterTriplet,
hardCurvCut)); // FIXME tune cuts
}
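// r-z compatibility of two doublets sharing a hit: (r1,z1) is the inner hit of the inner
// cell, (ri,zi) the shared hit and (ro,zo) the outer hit of this cell. The fabs() term is
// twice the area of the triangle spanned by the three hits in the (r,z) plane, i.e. a
// measure of how far the shared hit deviates from the straight line through the other two.
// The inequality is written multiplicatively to avoid divisions; effectively it requires
// the deviation per unit radial span to stay below thetaCut/ptmin, the 1/ptmin scaling
// presumably leaving room for multiple scattering at the lowest pT of interest.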
__device__ __forceinline__ static bool areAlignedRZ(
float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) {
float radius_diff = std::abs(r1 - ro);
float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo);
float pMin = ptmin * std::sqrt(distance_13_squared);  // this needs to be divided by radius_diff later
float tan_12_13_half_mul_distance_13_squared = fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri));
return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff;
}
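// Transverse-plane cut for the triplet made of otherCell's inner hit and the two hits of
// this cell: fit a circle through the three points, reject the candidate if its curvature
// exceeds maxCurv (a hard pT cut), and otherwise require the distance of closest approach
// to the beam line to be within the region radius plus tolerance. dca0() appears to be the
// dca scaled by the curvature, hence the |curvature()| factor on the right-hand side.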
__device__ inline bool dcaCut(Hits const& hh,
GPUCACell const& otherCell,
const float region_origin_radius_plus_tolerance,
const float maxCurv) const {
auto x1 = otherCell.inner_x(hh);
auto y1 = otherCell.inner_y(hh);
auto x2 = inner_x(hh);
auto y2 = inner_y(hh);
auto x3 = outer_x(hh);
auto y3 = outer_y(hh);
CircleEq<float> eq(x1, y1, x2, y2, x3, y3);
if (eq.curvature() > maxCurv)
return false;
return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature());
}
__device__ __forceinline__ static bool dcaCutH(float x1,
float y1,
float x2,
float y2,
float x3,
float y3,
const float region_origin_radius_plus_tolerance,
const float maxCurv) {
CircleEq<float> eq(x1, y1, x2, y2, x3, y3);
if (eq.curvature() > maxCurv)
return false;
return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature());
}
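// hole0 / hole4 are only used when ONLY_TRIPLETS_IN_HOLE is defined: a triplet is kept
// only if the missing fourth hit would fall where no hit can exist. The hit phi is mapped
// onto a ladder index, the candidate is extrapolated linearly in (r,z) to the BPIX1
// (hole0) or BPIX4 (hole4) ladder radius, and the extrapolated z is tested against the
// gaps between modules; hole4 additionally accepts z positions beyond the ladder ends but
// before the first endcap disk.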
__device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const {
using caConstants::first_ladder_bpx0;
using caConstants::max_ladder_bpx0;
using caConstants::module_length_bpx0;
using caConstants::module_tolerance_bpx0;
int p = innerCell.inner_iphi(hh);
if (p < 0)
p += std::numeric_limits<unsigned short>::max();
p = (max_ladder_bpx0 * p) / std::numeric_limits<unsigned short>::max();
p %= max_ladder_bpx0;
auto il = first_ladder_bpx0 + p;
auto r0 = hh.averageGeometry().ladderR[il];
auto ri = innerCell.inner_r(hh);
auto zi = innerCell.inner_z(hh);
auto ro = outer_r(hh);
auto zo = outer_z(hh);
auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri);
auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]);
auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0);
auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0);
return gap;
}
__device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const {
using caConstants::first_ladder_bpx4;
using caConstants::max_ladder_bpx4;
using caConstants::module_length_bpx4;
using caConstants::module_tolerance_bpx4;
int p = outer_iphi(hh);
if (p < 0)
p += std::numeric_limits<unsigned short>::max();
p = (max_ladder_bpx4 * p) / std::numeric_limits<unsigned short>::max();
p %= max_ladder_bpx4;
auto il = first_ladder_bpx4 + p;
auto r4 = hh.averageGeometry().ladderR[il];
auto ri = innerCell.inner_r(hh);
auto zi = innerCell.inner_z(hh);
auto ro = outer_r(hh);
auto zo = outer_z(hh);
auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri);
auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]);
auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4);
auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4);
auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0];
auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1];
return gap || holeP || holeN;
}
// Trying to free the track building process from hardcoded layers: the graph is visited
// purely through the neighborhood connections between cells.
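// find_ntuplets performs a depth-first visit: the current cell index is pushed onto
// tmpNtuplet, the recursion follows every surviving outer neighbor, and once a leaf is
// reached the collected inner hits (plus the final outer hit and up to two fishbone hits)
// are stored through bulkFill if the candidate is long enough. The DEPTH parameter only
// bounds the template recursion at compile time; the <0> specialization below traps.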
template <int DEPTH>
__device__ inline void find_ntuplets(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::cuda::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
// the building process for a track ends if:
//  - the cell has no outer neighbor, or
//  - none of its neighbors survived (e.g. killed by the early fishbone);
// the ntuplet is then saved if the number of hits it contains is greater
// than a threshold
auto doubletId = this - cells;
tmpNtuplet.push_back_unsafe(doubletId);
assert(tmpNtuplet.size() <= 4);
bool last = true;
for (unsigned int otherCell : outerNeighbors()) {
if (cells[otherCell].isKilled())
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets<DEPTH - 1>(
hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) {
#ifdef ONLY_TRIPLETS_IN_HOLE
// triplets are accepted only if they point to the hole
if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) ||
((!startAt0) && hole0(hh, cells[tmpNtuplet[0]])))
#endif
{
hindex_type hits[8];
auto nh = 0U;
constexpr int maxFB = 2; // for the time being let's limit this
int nfb = 0;
for (auto c : tmpNtuplet) {
hits[nh++] = cells[c].theInnerHitId;
if (nfb < maxFB && cells[c].hasFishbone()) {
++nfb;
hits[nh++] = cells[c].theFishboneId; // the fishbone hit is always farther out than the inner hit
}
}
assert(nh < caConstants::maxHitsOnTrack);
hits[nh] = theOuterHitId;
auto it = foundNtuplets.bulkFill(apc, hits, nh + 1);
if (it >= 0) { // a negative value means overflow...
for (auto c : tmpNtuplet)
cells[c].addTrack(it, cellTracks);
quality[it] = bad; // initialize to bad
}
}
}
}
tmpNtuplet.pop_back();
assert(tmpNtuplet.size() < 4);
}
// Cell status management
__device__ __forceinline__ void kill() { theStatus_ |= uint16_t(StatusBit::kKilled); }
__device__ __forceinline__ bool isKilled() const { return theStatus_ & uint16_t(StatusBit::kKilled); }
__device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; }
__device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); }
__device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); }
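// Record at most one extra ("fishbone") hit per cell, resolving concurrent writers
// deterministically: the CAS loop keeps the candidate whose z is farther from the inner
// hit, so the outcome does not depend on thread scheduling.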
__device__ __forceinline__ void setFishbone(hindex_type id, float z, Hits const& hh) {
// make it deterministic: keep the hit farther apart (in z) from the inner hit
auto old = theFishboneId;
while (
old !=
atomicCAS(&theFishboneId,
old,
(invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh.zGlobal(old) - theInnerZ)) ? id : old))
old = theFishboneId;
}
__device__ __forceinline__ auto fishboneId() const { return theFishboneId; }
__device__ __forceinline__ bool hasFishbone() const { return theFishboneId != invalidHitId; }
private:
CellNeighbors* theOuterNeighbors;
CellTracks* theTracks;
int16_t theLayerPairId_;
uint16_t theStatus_; // tbd
float theInnerZ;
float theInnerR;
hindex_type theInnerHitId;
hindex_type theOuterHitId;
hindex_type theFishboneId;
};
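// Full specialization terminating the compile-time recursion of find_ntuplets: reaching
// depth 0 at run time would mean a candidate deeper than supported, so it traps/aborts.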
template <>
__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::cuda::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
#ifdef __CUDA_ARCH__
__trap();
#else
abort();
#endif
}
#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h