Merge pull request #32499 from fwyzard/CMS_UNROLL_LOOP

Add and use preprocessors macros for loop unrolling
cms-sw · Dec 16, 2020 · 24ed956 · 24ed956
2 parents 1aabdf6 + dca2f32
commit 24ed956
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 52 deletions.
diff --git a/DataFormats/CaloRecHit/interface/MultifitComputations.h b/DataFormats/CaloRecHit/interface/MultifitComputations.h
@@ -7,6 +7,8 @@
 
 #include <Eigen/Dense>
 
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
+
 namespace calo {
   namespace multifit {
 
@@ -73,7 +75,7 @@ namespace calo {
       L(0, 0) = sqrtm_0_0;
       using T = typename MatrixType1::base_type;
 
-#pragma unroll
+      CMS_UNROLL_LOOP
       for (int i = 1; i < MatrixType1::stride; i++) {
         T sumsq{0};
         for (int j = 0; j < i; j++) {
@@ -199,13 +201,13 @@ namespace calo {
       constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
       constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
 
-#pragma unroll
+      CMS_UNROLL_LOOP
       for (int icol = 0; icol < NPULSES; icol++) {
         float reg_b[NSAMPLES];
         float reg_L[NSAMPLES];
 
-// preload a column and load column 0 of cholesky
-#pragma unroll
+        // preload a column and load column 0 of cholesky
+        CMS_UNROLL_LOOP
         for (int i = 0; i < NSAMPLES; i++) {
 #ifdef __CUDA_ARCH__
           // load through the read-only cache
@@ -220,16 +222,16 @@ namespace calo {
         auto x_prev = reg_b[0] / reg_L[0];
         A(0, icol) = x_prev;
 
-// iterate
-#pragma unroll
+        // iterate
+        CMS_UNROLL_LOOP
         for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
+          // update accum
+          CMS_UNROLL_LOOP
           for (int counter = iL; counter < NSAMPLES; counter++)
             reg_b[counter] -= x_prev * reg_L[counter];
 
-// load the next column of cholesky
-#pragma unroll
+          // load the next column of cholesky
+          CMS_UNROLL_LOOP
           for (int counter = iL; counter < NSAMPLES; counter++)
             reg_L[counter] = matrixL(counter, iL);
 
@@ -251,8 +253,8 @@ namespace calo {
       float reg_b_tmp[NSAMPLES];
       float reg_L[NSAMPLES];
 
-// preload a column and load column 0 of cholesky
-#pragma unroll
+      // preload a column and load column 0 of cholesky
+      CMS_UNROLL_LOOP
       for (int i = 0; i < NSAMPLES; i++) {
         reg_b_tmp[i] = inputAmplitudesView(i);
         reg_L[i] = matrixL(i, 0);
@@ -262,16 +264,16 @@ namespace calo {
       auto x_prev = reg_b_tmp[0] / reg_L[0];
       reg_b[0] = x_prev;
 
-// iterate
-#pragma unroll
+      // iterate
+      CMS_UNROLL_LOOP
       for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
+        // update accum
+        CMS_UNROLL_LOOP
         for (int counter = iL; counter < NSAMPLES; counter++)
           reg_b_tmp[counter] -= x_prev * reg_L[counter];
 
-// load the next column of cholesky
-#pragma unroll
+        // load the next column of cholesky
+        CMS_UNROLL_LOOP
         for (int counter = iL; counter < NSAMPLES; counter++)
           reg_L[counter] = matrixL(counter, iL);
 
@@ -300,13 +302,13 @@ namespace calo {
         float results[NPULSES];
 
         // preload results and permute according to the pulse offsets /////////////// ??? this is not done in ECAL
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < NPULSES; counter++) {
           results[counter] = resultAmplitudesVector[counter];
         }
 
         // load accum
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < NSAMPLES; counter++)
           accum[counter] = -inputAmplitudesView(counter);
 
@@ -315,16 +317,16 @@ namespace calo {
           float pm_col[NSAMPLES];
 
           // preload a column of pulse matrix
-#pragma unroll
+          CMS_UNROLL_LOOP
           for (int counter = 0; counter < NSAMPLES; counter++)
 #ifdef __CUDA_ARCH__
             pm_col[counter] = __ldg(&pulseMatrixView.coeffRef(counter, icol));
 #else
             pm_col[counter] = pulseMatrixView.coeffRef(counter, icol);
 #endif
 
-            // accum
-#pragma unroll
+          // accum
+          CMS_UNROLL_LOOP
           for (int counter = 0; counter < NSAMPLES; counter++)
             accum[counter] += results[icol] * pm_col[counter];
         }
@@ -342,7 +344,7 @@ namespace calo {
         float accumSum = 0;
 
         // preload a column and load column 0 of cholesky
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int i = 0; i < NSAMPLES; i++) {
           reg_L[i] = matrixL(i, 0);
         }
@@ -352,15 +354,15 @@ namespace calo {
         accumSum += x_prev * x_prev;
 
         // iterate
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int iL = 1; iL < NSAMPLES; iL++) {
           // update accum
-#pragma unroll
+          CMS_UNROLL_LOOP
           for (int counter = iL; counter < NSAMPLES; counter++)
             accum[counter] -= x_prev * reg_L[counter];
 
-            // load the next column of cholesky
-#pragma unroll
+          // load the next column of cholesky
+          CMS_UNROLL_LOOP
           for (int counter = iL; counter < NSAMPLES; counter++)
             reg_L[counter] = matrixL(counter, iL);
 
@@ -417,7 +419,7 @@ namespace calo {
             auto const icol_real = pulseOffsets(icol);
             auto const atb = Atb(icol_real);
             float sum = 0;
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (int counter = 0; counter < NPULSES; counter++)
               sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
                                          : AtA(icol_real, counter) * solution(counter);

diff --git a/FWCore/Utilities/interface/CMSUnrollLoop.h b/FWCore/Utilities/interface/CMSUnrollLoop.h
@@ -0,0 +1,51 @@
+#ifndef FWCore_Utilities_interface_CMSUnrollLoop_h
+#define FWCore_Utilities_interface_CMSUnrollLoop_h
+
+// convert the macro argument to a null-terminated quoted string
+#define STRINGIFY_(ARG) #ARG
+#define STRINGIFY(ARG) STRINGIFY_(ARG)
+
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+// CUDA or HIP device compiler
+
+#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(unroll))
+#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N))
+#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1))
+
+#define CMS_DEVICE_UNROLL_LOOP _Pragma(STRINGIFY(unroll))
+#define CMS_DEVICE_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N))
+#define CMS_DEVICE_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1))
+
+#else  // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__)
+
+// any host compiler
+#define CMS_DEVICE_UNROLL_LOOP
+#define CMS_DEVICE_UNROLL_LOOP_COUNT(N)
+#define CMS_DEVICE_UNROLL_LOOP_DISABLE
+
+#if defined(__clang__)
+// clang host compiler
+
+#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(clang loop unroll(enable)))
+#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(clang loop unroll_count(N)))
+#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(clang loop unroll(disable)))
+
+#elif defined(__GNUC__)
+// GCC host compiler
+
+#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(GCC ivdep))
+#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(GCC unroll N)) _Pragma(STRINGIFY(GCC ivdep))
+#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(GCC unroll 1))
+
+#else
+// unsupported or unknown compiler
+
+#define CMS_UNROLL_LOOP
+#define CMS_UNROLL_LOOP_COUNT(N)
+#define CMS_UNROLL_LOOP_DISABLE
+
+#endif  // defined(__clang__) || defined(__GNUC__) || ...
+
+#endif  // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__)
+
+#endif  // FWCore_Utilities_interface_CMSUnrollLoop_h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -3,6 +3,7 @@
 
 #include <cstdint>
 
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 
@@ -13,7 +14,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re
   // ci and co may be the same
   auto x = ci[i];
   auto laneId = threadIdx.x & 0x1f;
-#pragma unroll
+  CMS_UNROLL_LOOP
   for (int offset = 1; offset < 32; offset <<= 1) {
     auto y = __shfl_up_sync(mask, x, offset);
     if (laneId >= offset)
@@ -26,7 +27,7 @@ template <typename T>
 __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) {
   auto x = c[i];
   auto laneId = threadIdx.x & 0x1f;
-#pragma unroll
+  CMS_UNROLL_LOOP
   for (int offset = 1; offset < 32; offset <<= 1) {
     auto y = __shfl_up_sync(mask, x, offset);
     if (laneId >= offset)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
@@ -6,6 +6,7 @@
 #include <cstdint>
 #include <type_traits>
 
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 
 template <typename T>
@@ -124,7 +125,7 @@ __device__ __forceinline__ void radixSortImpl(
     if (threadIdx.x < sb) {
       auto x = c[threadIdx.x];
       auto laneId = threadIdx.x & 0x1f;
-#pragma unroll
+      CMS_UNROLL_LOOP
       for (int offset = 1; offset < 32; offset <<= 1) {
         auto y = __shfl_up_sync(0xffffffff, x, offset);
         if (laneId >= offset)

diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -1,12 +1,12 @@
 #include <Eigen/Dense>
 
 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
-
 // needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
+
 // TODO reuse some of the HCAL constats from
 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
-// ?
 
 #include "SimpleAlgoGPU.h"
 #include "KernelHelpers.h"
@@ -669,7 +669,7 @@ namespace hcal {
         Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
         Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
         Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
-#pragma unroll
+      CMS_UNROLL_LOOP
       for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
         auto const resultAmplitude = resultAmplitudesVector(ipulse);
         if (resultAmplitude == 0)
@@ -681,15 +681,15 @@ namespace hcal {
 
         // preload a column
         float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < NSAMPLES; counter++) {
           pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse));
           pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse));
           pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse));
         }
 
         auto const ampl2 = resultAmplitude * resultAmplitude;
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int col = 0; col < NSAMPLES; col++) {
           auto const valueP_col = pmpcol[col];
           auto const valueM_col = pmmcol[col];
@@ -701,8 +701,8 @@ namespace hcal {
           auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
           covarianceMatrix(col, col) += ampl2 * tmp_value;
 
-// FIXME: understand if this actually gets unrolled
-#pragma unroll
+          // FIXME: understand if this actually gets unrolled
+          CMS_UNROLL_LOOP
           for (int row = col + 1; row < NSAMPLES; row++) {
             float const valueP_row = pmpcol[row];  //pulseMatrixP(j, ipulseReal);
             float const value_row = pmcol[row];    //pulseMatrix(j, ipulseReal);
@@ -805,7 +805,7 @@ namespace hcal {
       int const soi = soiSamples[gch];
       */
       calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
-#pragma unroll
+      CMS_UNROLL_LOOP
       for (int i = 0; i < NPULSES; ++i)
         pulseOffsets(i) = i;
       //        pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
@@ -854,10 +854,10 @@ namespace hcal {
         // shared memory
         float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
         calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
           covarianceMatrixStorage[counter] = averagePedestalWidth2;
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++)
           covarianceMatrix(counter, counter) += __ldg(&noiseTermsView.coeffRef(counter));
 
@@ -907,36 +907,36 @@ namespace hcal {
         //float AtAStorage[MapSymM<float, NPULSES>::total];
         calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
         calo::multifit::ColumnVector<NPULSES> Atb;
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int icol = 0; icol < NPULSES; icol++) {
           float reg_ai[NSAMPLES];
 
-// load column icol
-#pragma unroll
+          // load column icol
+          CMS_UNROLL_LOOP
           for (int counter = 0; counter < NSAMPLES; counter++)
             reg_ai[counter] = A(counter, icol);
 
           // compute diagonal
           float sum = 0.f;
-#pragma unroll
+          CMS_UNROLL_LOOP
           for (int counter = 0; counter < NSAMPLES; counter++)
             sum += reg_ai[counter] * reg_ai[counter];
 
           // store
           AtA(icol, icol) = sum;
 
-// go thru the other columns
-#pragma unroll
+          // go thru the other columns
+          CMS_UNROLL_LOOP
           for (int j = icol + 1; j < NPULSES; j++) {
             // load column j
             float reg_aj[NSAMPLES];
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (int counter = 0; counter < NSAMPLES; counter++)
               reg_aj[counter] = A(counter, j);
 
             // accum
             float sum = 0.f;
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (int counter = 0; counter < NSAMPLES; counter++)
               sum += reg_aj[counter] * reg_ai[counter];
 
@@ -947,7 +947,7 @@ namespace hcal {
 
           // Atb accum
           float sum_atb = 0;
-#pragma unroll
+          CMS_UNROLL_LOOP
           for (int counter = 0; counter < NSAMPLES; counter++)
             sum_atb += reg_ai[counter] * reg_b[counter];
 
@@ -1010,7 +1010,7 @@ namespace hcal {
       auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
       outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
       /*
-      #pragma unroll
+      CMS_UNROLL_LOOP
       for (int i=0; i<NPULSES; i++)
           if (pulseOffsets[i] == soi)
               // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues