Skip to content

Commit

Permalink
Merge pull request #32499 from fwyzard/CMS_UNROLL_LOOP
Browse files Browse the repository at this point in the history
Add and use preprocessors macros for loop unrolling
  • Loading branch information
cmsbuild authored Dec 16, 2020
2 parents 1aabdf6 + dca2f32 commit 24ed956
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 52 deletions.
60 changes: 31 additions & 29 deletions DataFormats/CaloRecHit/interface/MultifitComputations.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

#include <Eigen/Dense>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"

namespace calo {
namespace multifit {

Expand Down Expand Up @@ -73,7 +75,7 @@ namespace calo {
L(0, 0) = sqrtm_0_0;
using T = typename MatrixType1::base_type;

#pragma unroll
CMS_UNROLL_LOOP
for (int i = 1; i < MatrixType1::stride; i++) {
T sumsq{0};
for (int j = 0; j < i; j++) {
Expand Down Expand Up @@ -199,13 +201,13 @@ namespace calo {
constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;

#pragma unroll
CMS_UNROLL_LOOP
for (int icol = 0; icol < NPULSES; icol++) {
float reg_b[NSAMPLES];
float reg_L[NSAMPLES];

// preload a column and load column 0 of cholesky
#pragma unroll
// preload a column and load column 0 of cholesky
CMS_UNROLL_LOOP
for (int i = 0; i < NSAMPLES; i++) {
#ifdef __CUDA_ARCH__
// load through the read-only cache
Expand All @@ -220,16 +222,16 @@ namespace calo {
auto x_prev = reg_b[0] / reg_L[0];
A(0, icol) = x_prev;

// iterate
#pragma unroll
// iterate
CMS_UNROLL_LOOP
for (int iL = 1; iL < NSAMPLES; iL++) {
// update accum
#pragma unroll
// update accum
CMS_UNROLL_LOOP
for (int counter = iL; counter < NSAMPLES; counter++)
reg_b[counter] -= x_prev * reg_L[counter];

// load the next column of cholesky
#pragma unroll
// load the next column of cholesky
CMS_UNROLL_LOOP
for (int counter = iL; counter < NSAMPLES; counter++)
reg_L[counter] = matrixL(counter, iL);

Expand All @@ -251,8 +253,8 @@ namespace calo {
float reg_b_tmp[NSAMPLES];
float reg_L[NSAMPLES];

// preload a column and load column 0 of cholesky
#pragma unroll
// preload a column and load column 0 of cholesky
CMS_UNROLL_LOOP
for (int i = 0; i < NSAMPLES; i++) {
reg_b_tmp[i] = inputAmplitudesView(i);
reg_L[i] = matrixL(i, 0);
Expand All @@ -262,16 +264,16 @@ namespace calo {
auto x_prev = reg_b_tmp[0] / reg_L[0];
reg_b[0] = x_prev;

// iterate
#pragma unroll
// iterate
CMS_UNROLL_LOOP
for (int iL = 1; iL < NSAMPLES; iL++) {
// update accum
#pragma unroll
// update accum
CMS_UNROLL_LOOP
for (int counter = iL; counter < NSAMPLES; counter++)
reg_b_tmp[counter] -= x_prev * reg_L[counter];

// load the next column of cholesky
#pragma unroll
// load the next column of cholesky
CMS_UNROLL_LOOP
for (int counter = iL; counter < NSAMPLES; counter++)
reg_L[counter] = matrixL(counter, iL);

Expand Down Expand Up @@ -300,13 +302,13 @@ namespace calo {
float results[NPULSES];

// preload results and permute according to the pulse offsets /////////////// ??? this is not done in ECAL
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NPULSES; counter++) {
results[counter] = resultAmplitudesVector[counter];
}

// load accum
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
accum[counter] = -inputAmplitudesView(counter);

Expand All @@ -315,16 +317,16 @@ namespace calo {
float pm_col[NSAMPLES];

// preload a column of pulse matrix
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
#ifdef __CUDA_ARCH__
pm_col[counter] = __ldg(&pulseMatrixView.coeffRef(counter, icol));
#else
pm_col[counter] = pulseMatrixView.coeffRef(counter, icol);
#endif

// accum
#pragma unroll
// accum
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
accum[counter] += results[icol] * pm_col[counter];
}
Expand All @@ -342,7 +344,7 @@ namespace calo {
float accumSum = 0;

// preload a column and load column 0 of cholesky
#pragma unroll
CMS_UNROLL_LOOP
for (int i = 0; i < NSAMPLES; i++) {
reg_L[i] = matrixL(i, 0);
}
Expand All @@ -352,15 +354,15 @@ namespace calo {
accumSum += x_prev * x_prev;

// iterate
#pragma unroll
CMS_UNROLL_LOOP
for (int iL = 1; iL < NSAMPLES; iL++) {
// update accum
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = iL; counter < NSAMPLES; counter++)
accum[counter] -= x_prev * reg_L[counter];

// load the next column of cholesky
#pragma unroll
// load the next column of cholesky
CMS_UNROLL_LOOP
for (int counter = iL; counter < NSAMPLES; counter++)
reg_L[counter] = matrixL(counter, iL);

Expand Down Expand Up @@ -417,7 +419,7 @@ namespace calo {
auto const icol_real = pulseOffsets(icol);
auto const atb = Atb(icol_real);
float sum = 0;
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NPULSES; counter++)
sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
: AtA(icol_real, counter) * solution(counter);
Expand Down
51 changes: 51 additions & 0 deletions FWCore/Utilities/interface/CMSUnrollLoop.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#ifndef FWCore_Utilities_interface_CMSUnrollLoop_h
#define FWCore_Utilities_interface_CMSUnrollLoop_h

// convert the macro argument to a null-terminated quoted string
#define STRINGIFY_(ARG) #ARG
#define STRINGIFY(ARG) STRINGIFY_(ARG)

#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
// CUDA or HIP device compiler

#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(unroll))
#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N))
#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1))

#define CMS_DEVICE_UNROLL_LOOP _Pragma(STRINGIFY(unroll))
#define CMS_DEVICE_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N))
#define CMS_DEVICE_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1))

#else // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__)

// any host compiler
#define CMS_DEVICE_UNROLL_LOOP
#define CMS_DEVICE_UNROLL_LOOP_COUNT(N)
#define CMS_DEVICE_UNROLL_LOOP_DISABLE

#if defined(__clang__)
// clang host compiler

#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(clang loop unroll(enable)))
#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(clang loop unroll_count(N)))
#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(clang loop unroll(disable)))

#elif defined(__GNUC__)
// GCC host compiler

#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(GCC ivdep))
#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(GCC unroll N)) _Pragma(STRINGIFY(GCC ivdep))
#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(GCC unroll 1))

#else
// unsupported or unknown compiler

#define CMS_UNROLL_LOOP
#define CMS_UNROLL_LOOP_COUNT(N)
#define CMS_UNROLL_LOOP_DISABLE

#endif // defined(__clang__) || defined(__GNUC__) || ...

#endif // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__)

#endif // FWCore_Utilities_interface_CMSUnrollLoop_h
5 changes: 3 additions & 2 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <cstdint>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

Expand All @@ -13,7 +14,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re
// ci and co may be the same
auto x = ci[i];
auto laneId = threadIdx.x & 0x1f;
#pragma unroll
CMS_UNROLL_LOOP
for (int offset = 1; offset < 32; offset <<= 1) {
auto y = __shfl_up_sync(mask, x, offset);
if (laneId >= offset)
Expand All @@ -26,7 +27,7 @@ template <typename T>
__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) {
auto x = c[i];
auto laneId = threadIdx.x & 0x1f;
#pragma unroll
CMS_UNROLL_LOOP
for (int offset = 1; offset < 32; offset <<= 1) {
auto y = __shfl_up_sync(mask, x, offset);
if (laneId >= offset)
Expand Down
3 changes: 2 additions & 1 deletion HeterogeneousCore/CUDAUtilities/interface/radixSort.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <cstdint>
#include <type_traits>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

template <typename T>
Expand Down Expand Up @@ -124,7 +125,7 @@ __device__ __forceinline__ void radixSortImpl(
if (threadIdx.x < sb) {
auto x = c[threadIdx.x];
auto laneId = threadIdx.x & 0x1f;
#pragma unroll
CMS_UNROLL_LOOP
for (int offset = 1; offset < 32; offset <<= 1) {
auto y = __shfl_up_sync(0xffffffff, x, offset);
if (laneId >= offset)
Expand Down
40 changes: 20 additions & 20 deletions RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#include <Eigen/Dense>

#include "DataFormats/CaloRecHit/interface/MultifitComputations.h"

// needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
#include "FWCore/Utilities/interface/CMSUnrollLoop.h"

// TODO reuse some of the HCAL constats from
//#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
// ?

#include "SimpleAlgoGPU.h"
#include "KernelHelpers.h"
Expand Down Expand Up @@ -669,7 +669,7 @@ namespace hcal {
Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
#pragma unroll
CMS_UNROLL_LOOP
for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
auto const resultAmplitude = resultAmplitudesVector(ipulse);
if (resultAmplitude == 0)
Expand All @@ -681,15 +681,15 @@ namespace hcal {

// preload a column
float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++) {
pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse));
pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse));
pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse));
}

auto const ampl2 = resultAmplitude * resultAmplitude;
#pragma unroll
CMS_UNROLL_LOOP
for (int col = 0; col < NSAMPLES; col++) {
auto const valueP_col = pmpcol[col];
auto const valueM_col = pmmcol[col];
Expand All @@ -701,8 +701,8 @@ namespace hcal {
auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
covarianceMatrix(col, col) += ampl2 * tmp_value;

// FIXME: understand if this actually gets unrolled
#pragma unroll
// FIXME: understand if this actually gets unrolled
CMS_UNROLL_LOOP
for (int row = col + 1; row < NSAMPLES; row++) {
float const valueP_row = pmpcol[row]; //pulseMatrixP(j, ipulseReal);
float const value_row = pmcol[row]; //pulseMatrix(j, ipulseReal);
Expand Down Expand Up @@ -805,7 +805,7 @@ namespace hcal {
int const soi = soiSamples[gch];
*/
calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
#pragma unroll
CMS_UNROLL_LOOP
for (int i = 0; i < NPULSES; ++i)
pulseOffsets(i) = i;
// pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
Expand Down Expand Up @@ -854,10 +854,10 @@ namespace hcal {
// shared memory
float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
covarianceMatrixStorage[counter] = averagePedestalWidth2;
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++)
covarianceMatrix(counter, counter) += __ldg(&noiseTermsView.coeffRef(counter));

Expand Down Expand Up @@ -907,36 +907,36 @@ namespace hcal {
//float AtAStorage[MapSymM<float, NPULSES>::total];
calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
calo::multifit::ColumnVector<NPULSES> Atb;
#pragma unroll
CMS_UNROLL_LOOP
for (int icol = 0; icol < NPULSES; icol++) {
float reg_ai[NSAMPLES];

// load column icol
#pragma unroll
// load column icol
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
reg_ai[counter] = A(counter, icol);

// compute diagonal
float sum = 0.f;
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
sum += reg_ai[counter] * reg_ai[counter];

// store
AtA(icol, icol) = sum;

// go thru the other columns
#pragma unroll
// go thru the other columns
CMS_UNROLL_LOOP
for (int j = icol + 1; j < NPULSES; j++) {
// load column j
float reg_aj[NSAMPLES];
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
reg_aj[counter] = A(counter, j);

// accum
float sum = 0.f;
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
sum += reg_aj[counter] * reg_ai[counter];

Expand All @@ -947,7 +947,7 @@ namespace hcal {

// Atb accum
float sum_atb = 0;
#pragma unroll
CMS_UNROLL_LOOP
for (int counter = 0; counter < NSAMPLES; counter++)
sum_atb += reg_ai[counter] * reg_b[counter];

Expand Down Expand Up @@ -1010,7 +1010,7 @@ namespace hcal {
auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
/*
#pragma unroll
CMS_UNROLL_LOOP
for (int i=0; i<NPULSES; i++)
if (pulseOffsets[i] == soi)
// NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
Expand Down

0 comments on commit 24ed956

Please sign in to comment.