#ifndef ALPAKAWORKDIVHELPER_H
#define ALPAKAWORKDIVHELPER_H

#include <algorithm>
#include <cstdint>
#include <utility>

#include "AlpakaCore/alpakaConfig.h"

using namespace alpaka_common;

namespace cms {
  namespace alpakatools {

    /*
     * Creates the accelerator-dependent workdiv.
     */
    template <typename T_Dim>
    WorkDiv<T_Dim> make_workdiv(const Vec<T_Dim>& blocksPerGrid, const Vec<T_Dim>& threadsPerBlockOrElementsPerThread) {
      // On the GPU:
      // threadsPerBlockOrElementsPerThread is the number of threads per block.
      // Each thread looks at a single element: elementsPerThread is always 1.
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
      const Vec<T_Dim>& elementsPerThread = Vec<T_Dim>::ones();
      return WorkDiv<T_Dim>(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread);
#else
      // On the CPU:
      // Run serially with a single thread per block: threadsPerBlock is always 1.
      // threadsPerBlockOrElementsPerThread is the number of elements per thread.
      const Vec<T_Dim>& threadsPerBlock = Vec<T_Dim>::ones();
      return WorkDiv<T_Dim>(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread);
#endif
    }

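    // A minimal usage sketch (hypothetical, not part of this header), assuming the
    // Vec1/WorkDiv1 aliases from AlpakaCore/alpakaConfig.h and an accelerator type
    // Acc1 plus a queue defined elsewhere; MyKernel, data and size are placeholders:
    //
    //   const WorkDiv1 workDiv = cms::alpakatools::make_workdiv(Vec1::all(32u), Vec1::all(256u));
    //   alpaka::queue::enqueue(queue, alpaka::kernel::createTaskKernel<Acc1>(workDiv, MyKernel{}, data, size));
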
    /*
     * 1D helper that is true for a single element per block
     * (should only be needed for debug / printout).
     */
    template <typename T_Acc>
    ALPAKA_FN_ACC bool once_per_block_1D(const T_Acc& acc, uint32_t i) {
      const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
      return (i % blockDimension == 0);
    }

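    // Hypothetical sketch: inside a kernel, print a diagnostic exactly once per
    // block, for the first element index i of that block:
    //
    //   if (cms::alpakatools::once_per_block_1D(acc, i)) {
    //     printf("first element of this block: %u\n", i);
    //   }
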
    /*
     * Computes the range of the element indices, local to the block.
     * Warning: the max index is not truncated by the max number of elements of interest.
     */
    template <typename T_Acc, typename T_Dim = alpaka::dim::Dim<T_Acc>>
    ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_block(const T_Acc& acc,
                                                                                 const Vec<T_Dim>& elementIdxShift) {
      Vec<T_Dim> firstElementIdxVec = Vec<T_Dim>::zeros();
      Vec<T_Dim> endElementIdxUncutVec = Vec<T_Dim>::zeros();

      // Loop on all grid dimensions.
      for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) {
        // Take into account the thread index in block.
        const uint32_t threadIdxLocal(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
        const uint32_t threadDimension(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[dimIndex]);

        // Compute the element indices in block.
        // Only relevant for the CPU: on the GPU, threadDimension = 1, hence
        // elementIdx = firstElementIdx = threadIdx + elementIdxShift.
        const uint32_t firstElementIdxLocal = threadIdxLocal * threadDimension;
        const uint32_t firstElementIdx = firstElementIdxLocal + elementIdxShift[dimIndex];  // Add the shift!
        const uint32_t endElementIdxUncut = firstElementIdx + threadDimension;

        firstElementIdxVec[dimIndex] = firstElementIdx;
        endElementIdxUncutVec[dimIndex] = endElementIdxUncut;
      }

      // Return element indices, shifted by elementIdxShift.
      return {firstElementIdxVec, endElementIdxUncutVec};
    }

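    // Hypothetical 1D sketch: fetch this thread's element range within the block
    // and loop over it; the end index is not clipped, so clip it manually against a
    // (placeholder) numElements:
    //
    //   const auto& [firstElementIdx, endElementIdx] =
    //       cms::alpakatools::element_index_range_in_block(acc, Vec1::all(0u));
    //   for (uint32_t i = firstElementIdx[0u]; i < std::min(endElementIdx[0u], numElements); ++i) {
    //     // ... process element i ...
    //   }
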
    /*
     * Computes the range of the element indices, local to the block.
     * Truncated by the max number of elements of interest.
     */
    template <typename T_Acc, typename T_Dim>
    ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_block_truncated(
        const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements, const Vec<T_Dim>& elementIdxShift) {
      // Check dimension.
      static_assert(alpaka::dim::Dim<T_Acc>::value == T_Dim::value,
                    "Accelerator and maxNumberOfElements need to have the same dimension.");
      auto&& [firstElementIdxLocalVec, endElementIdxLocalVec] = element_index_range_in_block(acc, elementIdxShift);

      // Truncate.
      for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) {
        endElementIdxLocalVec[dimIndex] = std::min(endElementIdxLocalVec[dimIndex], maxNumberOfElements[dimIndex]);
      }

      // Return element indices, shifted by elementIdxShift and truncated by maxNumberOfElements.
      return {firstElementIdxLocalVec, endElementIdxLocalVec};
    }

    /*
     * Computes the range of the element indices in the grid.
     * Warning: the max index is not truncated by the max number of elements of interest.
     */
    template <typename T_Acc, typename T_Dim = alpaka::dim::Dim<T_Acc>>
    ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_grid(const T_Acc& acc,
                                                                                Vec<T_Dim>& elementIdxShift) {
      // Loop on all grid dimensions.
      for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) {
        // Take into account the block index in grid.
        const uint32_t blockIdxInGrid(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex]);
        const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex]);

        // Shift to get global indices in the grid (instead of local to the block).
        elementIdxShift[dimIndex] += blockIdxInGrid * blockDimension;
      }

      // Return element indices, shifted by elementIdxShift.
      return element_index_range_in_block(acc, elementIdxShift);
    }

    /*
     * Computes the range of the element indices in the grid.
     * Truncated by the max number of elements of interest.
     */
    template <typename T_Acc, typename T_Dim>
    ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_grid_truncated(
        const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements, Vec<T_Dim>& elementIdxShift) {
      // Check dimension.
      static_assert(alpaka::dim::Dim<T_Acc>::value == T_Dim::value,
                    "Accelerator and maxNumberOfElements need to have the same dimension.");
      auto&& [firstElementIdxGlobalVec, endElementIdxGlobalVec] = element_index_range_in_grid(acc, elementIdxShift);

      // Truncate.
      for (typename T_Dim::value_type dimIndex(0); dimIndex < T_Dim::value; ++dimIndex) {
        endElementIdxGlobalVec[dimIndex] = std::min(endElementIdxGlobalVec[dimIndex], maxNumberOfElements[dimIndex]);
      }

      // Return element indices, shifted by elementIdxShift and truncated by maxNumberOfElements.
      return {firstElementIdxGlobalVec, endElementIdxGlobalVec};
    }

    /*
     * Computes the range of the element indices in the grid, with no shift.
     * Truncated by the max number of elements of interest.
     */
    template <typename T_Acc, typename T_Dim>
    ALPAKA_FN_ACC std::pair<Vec<T_Dim>, Vec<T_Dim>> element_index_range_in_grid_truncated(
        const T_Acc& acc, const Vec<T_Dim>& maxNumberOfElements) {
      Vec<T_Dim> elementIdxShift = Vec<T_Dim>::zeros();
      return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift);
    }

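    // Hypothetical 1D sketch: compute this thread's global, already-truncated
    // element range and loop over it (numElements is a placeholder):
    //
    //   const auto& [first, end] =
    //       cms::alpakatools::element_index_range_in_grid_truncated(acc, Vec1::all(numElements));
    //   for (uint32_t i = first[0u]; i < end[0u]; ++i) {
    //     // ... process element i ...
    //   }
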
    /*********************************************
     *    1D HELPERS, LOOP ON ALL CPU ELEMENTS
     ********************************************/

    /*
     * Loop on all (CPU) elements.
     * The loop over elements is only relevant for the CPU; on the GPU,
     * elementIdx = firstElementIdx = threadIdx + shift.
     * Indices are local to the BLOCK.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc,
                                                                    const uint32_t maxNumberOfElements,
                                                                    const uint32_t elementIdxShift,
                                                                    const Func func) {
      const auto& [firstElementIdx, endElementIdx] = cms::alpakatools::element_index_range_in_block_truncated(
          acc, Vec1::all(maxNumberOfElements), Vec1::all(elementIdxShift));

      for (uint32_t elementIdx = firstElementIdx[0u]; elementIdx < endElementIdx[0u]; ++elementIdx) {
        func(elementIdx);
      }
    }

    /*
     * Overload for elementIdxShift = 0.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_block(const T_Acc& acc,
                                                                    const uint32_t maxNumberOfElements,
                                                                    const Func func) {
      const uint32_t elementIdxShift = 0;
      cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func);
    }

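    // Hypothetical sketch: have each thread process its own block-local elements
    // (buffer and numElements are placeholders):
    //
    //   cms::alpakatools::for_each_element_in_thread_1D_index_in_block(acc, numElements, [&](uint32_t i) {
    //     buffer[i] = 0.f;
    //   });
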
    /*
     * Loop on all (CPU) elements.
     * The loop over elements is only relevant for the CPU; on the GPU,
     * elementIdx = firstElementIdx = threadIdx + shift.
     * Indices are expressed in the GRID frame of reference.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc,
                                                                   const uint32_t maxNumberOfElements,
                                                                   uint32_t elementIdxShift,
                                                                   const Func func) {
      // Take into account the block index in grid to compute the element indices.
      const uint32_t blockIdxInGrid(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
      const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
      elementIdxShift += blockIdxInGrid * blockDimension;

      for_each_element_in_thread_1D_index_in_block(acc, maxNumberOfElements, elementIdxShift, func);
    }

    /*
     * Overload for elementIdxShift = 0.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_in_thread_1D_index_in_grid(const T_Acc& acc,
                                                                   const uint32_t maxNumberOfElements,
                                                                   const Func func) {
      const uint32_t elementIdxShift = 0;
      cms::alpakatools::for_each_element_in_thread_1D_index_in_grid(acc, maxNumberOfElements, elementIdxShift, func);
    }

    /******************************************************************************
     * 1D HELPERS, LOOP ON ALL CPU ELEMENTS, AND ELEMENT/THREAD STRIDED ACCESS
     ******************************************************************************/

    /*
     * (CPU) Loop on all elements + (CPU/GPU) strided access.
     * The loop over elements is only relevant for the CPU; on the GPU,
     * elementIdx = firstElementIdx = threadIdx + shift.
     * Strides over the full problem size, in steps of the BLOCK size.
     * Indices are local to the BLOCK.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc,
                                                        const uint32_t maxNumberOfElements,
                                                        const uint32_t elementIdxShift,
                                                        const Func func) {
      // Get thread / element indices in block.
      const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
          cms::alpakatools::element_index_range_in_block(acc, Vec1::all(elementIdxShift));

      // Stride = block size.
      const uint32_t blockDimension(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);

      // Strided access.
      for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
           threadIdx < maxNumberOfElements;
           threadIdx += blockDimension, endElementIdx += blockDimension) {
        // (CPU) Loop on all elements.
        for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
          func(i);
        }
      }
    }

    /*
     * Overload for elementIdxShift = 0.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_1D_block_stride(const T_Acc& acc,
                                                        const uint32_t maxNumberOfElements,
                                                        const Func func) {
      const uint32_t elementIdxShift = 0;
      cms::alpakatools::for_each_element_1D_block_stride(acc, maxNumberOfElements, elementIdxShift, func);
    }

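    // Hypothetical sketch: all threads of one block cooperatively cover numElements
    // items with a block-size stride (numElements and blockBuffer are placeholders):
    //
    //   cms::alpakatools::for_each_element_1D_block_stride(acc, numElements, [&](uint32_t i) {
    //     blockBuffer[i] = 0;
    //   });
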
    /*
     * (CPU) Loop on all elements + (CPU/GPU) strided access.
     * The loop over elements is only relevant for the CPU; on the GPU,
     * elementIdx = firstElementIdx = threadIdx + shift.
     * Strides over the full problem size, in steps of the GRID size.
     * Indices are expressed in the GRID frame of reference.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc,
                                                       const uint32_t maxNumberOfElements,
                                                       const uint32_t elementIdxShift,
                                                       const Func func) {
      Vec1 elementIdxShiftVec = Vec1::all(elementIdxShift);

      // Get thread / element indices in grid.
      const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
          cms::alpakatools::element_index_range_in_grid(acc, elementIdxShiftVec);

      // Stride = grid size.
      const uint32_t gridDimension(alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]);

      // Strided access.
      for (uint32_t threadIdx = firstElementIdxNoStride[0u], endElementIdx = endElementIdxNoStride[0u];
           threadIdx < maxNumberOfElements;
           threadIdx += gridDimension, endElementIdx += gridDimension) {
        // (CPU) Loop on all elements.
        for (uint32_t i = threadIdx; i < std::min(endElementIdx, maxNumberOfElements); ++i) {
          func(i);
        }
      }
    }

    /*
     * Overload for elementIdxShift = 0.
     */
    template <typename T_Acc, typename Func>
    ALPAKA_FN_ACC void for_each_element_1D_grid_stride(const T_Acc& acc,
                                                       const uint32_t maxNumberOfElements,
                                                       const Func func) {
      const uint32_t elementIdxShift = 0;
      cms::alpakatools::for_each_element_1D_grid_stride(acc, maxNumberOfElements, elementIdxShift, func);
    }

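    // Hypothetical end-to-end sketch: a kernel functor that covers the whole problem
    // with a grid-strided loop (plus the element loop on CPU); SquareKernel, data and
    // numElements are placeholders:
    //
    //   struct SquareKernel {
    //     template <typename T_Acc>
    //     ALPAKA_FN_ACC void operator()(const T_Acc& acc, float* data, uint32_t numElements) const {
    //       cms::alpakatools::for_each_element_1D_grid_stride(acc, numElements, [&](uint32_t i) {
    //         data[i] *= data[i];
    //       });
    //     }
    //   };
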
  }  // namespace alpakatools
}  // namespace cms

#endif  // ALPAKAWORKDIVHELPER_H