replace kernel shared memory by passing by value (huge simplification)

cms-sw · Sep 15, 2020 · 1fc601e · 1fc601e
1 parent 4838266
commit 1fc601e
Show file tree

Hide file tree

Showing 11 changed files with 189 additions and 433 deletions.
diff --git a/CUDADataFormats/HGCal/interface/HGCUncalibratedRecHitsToRecHitsConstants.h b/CUDADataFormats/HGCal/interface/HGCUncalibratedRecHitsToRecHitsConstants.h
@@ -3,6 +3,24 @@
 
 #include <vector>
 
+//maximum sizes for SoA's arrays holding configuration data ("constants")
+namespace maxsizes_constants {
+  //EE
+  constexpr size_t ee_fCPerMIP = 6; //number of elements pointed by hgcEE_fCPerMIP_
+  constexpr size_t ee_cce = 6; //number of elements posize_ted by hgcEE_cce_
+  constexpr size_t ee_noise_fC = 6; //number of elements posize_ted by hgcEE_noise_fC_
+  constexpr size_t ee_rcorr = 6; //number of elements posize_ted by rcorr_
+  constexpr size_t ee_weights = 53; //number of elements posize_ted by weights_
+  //HEF
+  constexpr size_t hef_fCPerMIP = 6; //number of elements pointed by hgcEE_fCPerMIP_
+  constexpr size_t hef_cce = 6; //number of elements posize_ted by hgcEE_cce_
+  constexpr size_t hef_noise_fC = 6; //number of elements posize_ted by hgcEE_noise_fC_
+  constexpr size_t hef_rcorr = 6; //number of elements posize_ted by rcorr_
+  constexpr size_t hef_weights = 53; //number of elements posize_ted by weights_
+  //HEB
+  constexpr size_t heb_weights = 53; //number of elements posize_ted by weights_
+}
+
 class HGCConstantVectorData {
  public:
   std::vector<double> fCPerMIP_;
@@ -14,65 +32,43 @@ class HGCConstantVectorData {
 
 class HGCeeUncalibratedRecHitConstantData {
  public:
-  double hgcEE_keV2DIGI_; //energy to femto coloumb conversion: 1000 eV/3.62 (eV per e) / 6.24150934e3 (e per fC)
+  double fCPerMIP_[maxsizes_constants::ee_fCPerMIP]; //femto coloumb to MIP conversion; one value per sensor thickness
+  double cce_[maxsizes_constants::ee_cce];           //charge collection efficiency, one value per sensor thickness
+  double noise_fC_[maxsizes_constants::ee_noise_fC]; //noise, one value per sensor thickness
+  double rcorr_[maxsizes_constants::ee_rcorr];              //thickness correction
+  double weights_[maxsizes_constants::ee_weights];          //energy weights to recover rechit energy deposited in the absorber
+
+  double hgcEE_keV2DIGI_;  //energy to femto coloumb conversion: 1000 eV/3.62 (eV per e) / 6.24150934e3 (e per fC)
   double hgceeUncalib2GeV_; //sets the ADC; obtained by dividing 1e-6 by hgcEE_keV2DIGI_
-  double *hgcEE_fCPerMIP_; //femto coloumb to MIP conversion; one value per sensor thickness
-  double *hgcEE_cce_; //charge collection efficiency, one value per sensor thickness
-  double *hgcEE_noise_fC_; //noise, one value per sensor thickness
-  double *rcorr_; //thickness correction
-  double *weights_; //energy weights to recover rechit energy deposited in the absorber
-  float xmin_; //used for computing the time resolution error
+  float xmin_;              //used for computing the time resolution error
   float xmax_; //used for computing the time resolution error
   float aterm_; //used for computing the time resolution error
   float cterm_; //used for computing the time resolution error
-  int nbytes_; //number of bytes allocated by this class
-  int ndelem_; //number of doubles pointed by this class
-  int nfelem_; //number of floats pointed by this class
-  int nielem_; //number of ints pointed by this class
-  int s_hgcEE_fCPerMIP_; //number of elements pointed by hgcEE_fCPerMIP_
-  int s_hgcEE_cce_; //number of elements pointed by hgcEE_cce_
-  int s_hgcEE_noise_fC_; //number of elements pointed by hgcEE_noise_fC_
-  int s_rcorr_; //number of elements pointed by rcorr_
-  int s_weights_; //number of elements pointed by weights_
 };
 
 class HGChefUncalibratedRecHitConstantData {
  public:
-  double hgcHEF_keV2DIGI_; //energy to femto coloumb conversion: 1000 eV/3.62 (eV per e) / 6.24150934e3 (e per fC)
-  double hgchefUncalib2GeV_; //sets the ADC; obtained by dividing 1e-6 by hgcHEF_keV2DIGI_
-  double *hgcHEF_fCPerMIP_; //femto coloumb to MIP conversion; one value per sensor thickness
-  double *hgcHEF_cce_; //charge collection efficiency, one value per sensor thickness
-  double *hgcHEF_noise_fC_; //noise, one value per sensor thickness
-  double *rcorr_;  //thickness correction
-  double *weights_; //energy weights to recover rechit energy deposited in the absorber
-  float xmin_; //used for computing the time resolution error
-  float xmax_; //used for computing the time resolution error
-  float aterm_; //used for computing the time resolution error
-  float cterm_; //used for computing the time resolution error
-  int nbytes_; //number of bytes allocated by this class
-  int ndelem_; //number of doubles allocated by this class
-  int nfelem_; //number of floats allocated by this class
-  int nuelem_; //number of unsigned ints allocated by this class
-  int nielem_; //number of ints allocated by this class
-  int s_hgcHEF_fCPerMIP_; //number of elements pointed by hgcEE_fCPerMIP_
-  int s_hgcHEF_cce_; //number of elements pointed by hgcEE_cce_
-  int s_hgcHEF_noise_fC_; //number of elements pointed by hgcEE_noise_fC_
-  int s_rcorr_; //number of elements pointed by rcorr_
-  int s_weights_; //number of elements pointed by weights_
+  double fCPerMIP_[maxsizes_constants::hef_fCPerMIP]; //femto coloumb to MIP conversion; one value per sensor thickness
+  double cce_[maxsizes_constants::hef_cce];           //charge collection efficiency, one value per sensor thickness
+  double noise_fC_[maxsizes_constants::hef_noise_fC]; //noise, one value per sensor thickness
+  double rcorr_[maxsizes_constants::hef_rcorr];              //thickness correction
+  double weights_[maxsizes_constants::hef_weights];          //energy weights to recover rechit energy deposited in the absorber
+
+  double keV2DIGI_;    //energy to femto coloumb conversion: 1000 eV/3.62 (eV per e) / 6.24150934e3 (e per fC)
+  double uncalib2GeV_; //sets the ADC; obtained by dividing 1e-6 by hgcHEF_keV2DIGI_
+  float xmin_;         //used for computing the time resolution error
+  float xmax_;         //used for computing the time resolution error
+  float aterm_;        //used for computing the time resolution error
+  float cterm_;        //used for computing the time resolution error
 };
 
 class HGChebUncalibratedRecHitConstantData {
  public:
-  double hgcHEB_keV2DIGI_; //energy to femto coloumb conversion: 1000 eV/3.62 (eV per e) / 6.24150934e3 (e per fC)
-  double hgchebUncalib2GeV_; //sets the ADC; obtained by dividing 1e-6 by hgcHEB_keV2DIGI_
-  double hgcHEB_noise_MIP_; //noise
-  double *weights_; //energy weights to recover rechit energy deposited in the absorber
-  int nbytes_; //number of bytes allocated by this class
-  int ndelem_; //number of doubles allocated by this class
-  int nfelem_; //number of floats allocated by this class
-  int nuelem_; //number of unsigned ints allocated by this class
-  int nielem_; //number of ints allocated by this class
-  int s_weights_; //number of elements pointed by weights_
+  double weights_[maxsizes_constants::heb_weights]; //energy weights to recover rechit energy deposited in the absorber
+
+  double keV2DIGI_;   //energy to femto coloumb conversion: 1000 eV/3.62 (eV per e) / 6.24150934e3 (e per fC)
+  double uncalib2GeV_; //sets the ADC; obtained by dividing 1e-6 by hgcHEB_keV2DIGI_
+  double noise_MIP_;  //noise
 };
 
 #endif
diff --git a/UserCode/CodeGPU/plugins/BuildFile.xml b/UserCode/CodeGPU/plugins/BuildFile.xml
@@ -1,4 +1,4 @@
-<library name="HeterogeneousUncalibRecHits" file="HeterogeneousHGCalEERecHitProducer.cc HeterogeneousHGCalHEFRecHitProducer.cc HeterogeneousHGCalHEBRecHitProducer.cc  HeterogeneousHGCalESProduct.cc HeterogeneousHGCalProducerMemoryWrapper.cc KernelManagerHGCalRecHit.cu HGCalRecHitKernelImpl.cu">
+<library name="HeterogeneousUncalibRecHits" file="HeterogeneousHGCalHEFRecHitProducer.cc HeterogeneousHGCalESProduct.cc HeterogeneousHGCalProducerMemoryWrapper.cc KernelManagerHGCalRecHit.cu HGCalRecHitKernelImpl.cu">
   <use name="cuda"/>
   <use name="HeterogeneousCore/CUDACore"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>

diff --git a/UserCode/CodeGPU/plugins/HGCalRecHitKernelImpl.cu b/UserCode/CodeGPU/plugins/HGCalRecHitKernelImpl.cu
@@ -6,14 +6,15 @@
 #include "HGCalRecHitKernelImpl.cuh"
 
 __device__ 
-double get_weight_from_layer(const int& padding, const int& layer, double*& sd)
+double get_weight_from_layer(const int& layer, const double (&weights)[maxsizes_constants::hef_weights])
 {
-  return sd[padding + layer];
+  return weights[layer];
 }
 
 __device__
 void make_rechit(unsigned int tid, HGCRecHitSoA& dst_soa, HGCUncalibratedRecHitSoA& src_soa, const bool &heb_flag, 
-		 const double &weight, const double &rcorr, const double &cce_correction, const double &sigmaNoiseGeV, float *& sf)
+		 const double& weight, const double& rcorr, const double& cce_correction, const double &sigmaNoiseGeV,
+		 const float& xmin, const float& xmax, const float& aterm, const float& cterm)
 {
   dst_soa.id_[tid] = src_soa.id_[tid];
   dst_soa.energy_[tid] = src_soa.amplitude_[tid] * weight * 0.001f;
@@ -29,101 +30,43 @@ void make_rechit(unsigned int tid, HGCRecHitSoA& dst_soa, HGCUncalibratedRecHitS
   if(heb_flag==0)
     {
       //get time resolution
-      float max = fmaxf(son, sf[0]); //this max trick avoids if...elseif...else condition
-      float aterm = sf[2];
-      float cterm = sf[3];
-      dst_soa.timeError_[tid] = sqrt( __fdividef(aterm,max)*__fdividef(aterm,max) + cterm*cterm );
+      //https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HGCalRecProducers/src/ComputeClusterTime.cc#L50
+      /*Maxmin trick to avoid conditions within the kernel (having xmin < xmax)
+      3 possibilities: 1) xval -> xmin -> xmax
+                       2) xmin -> xval -> xmax
+                       3) xmin -> xmax -> xval
+      The time error is calculated with the number in the middle.
+      */
+      float max = fminf( fmaxf(son, xmin), xmax);
+      float div_ = __fdividef(aterm, max);
+      dst_soa.timeError_[tid] = sqrt( div_*div_ + cterm*cterm );
     }
   else
     dst_soa.timeError_[tid] = -1;
 }
 
 __device__ 
-double get_thickness_correction(const int& padding, const int& type, double *& sd)
+double get_thickness_correction(const int& type, const double (&rcorr)[maxsizes_constants::hef_rcorr])
 {
-  return sd[padding + type];
+  return rcorr[type];
 }
 
 __device__
-double get_noise(const int& padding, const int& type, double *& sd)
+double get_noise(const int& type, const double (&noise_fC)[maxsizes_constants::hef_noise_fC])
 {
-  return sd[padding + type - 1];
+  return noise_fC[type - 1];
 }
 
 __device__
-double get_cce_correction(const int& padding, const int& type, double *& sd)
+double get_cce_correction(const int& type, const double (&cce)[maxsizes_constants::hef_cce])
 {
-  return sd[padding + type - 1];
+  return cce[type - 1];
 }
 
 __device__ 
-double get_fCPerMIP(const int& padding, const int& type, double *& sd)
+double get_fCPerMIP(const int& type, const double (&fCPerMIP)[maxsizes_constants::hef_fCPerMIP])
 {
-  return sd[padding + type - 1];
-}
-
-__device__ 
-void set_shared_memory(const int& tid, double*& sd, float*& sf, int*& si, const HGCeeUncalibratedRecHitConstantData& cdata, const int& size1, const int& size2, const int& size3, const int& size4, const int& size5)
-{
-  const int initial_pad = 2;
-  if(tid == 0)
-    {
-      sd[0] = cdata.hgcEE_keV2DIGI_;
-      sd[1] = cdata.hgceeUncalib2GeV_;
-      for(unsigned int i=initial_pad; i<size1; ++i)
-	sd[i] = cdata.hgcEE_fCPerMIP_[i-initial_pad];
-      for(unsigned int i=size1; i<size2; ++i)
-	sd[i] = cdata.hgcEE_cce_[i-size1];
-      for(unsigned int i=size2; i<size3; ++i)
-	sd[i] = cdata.hgcEE_noise_fC_[i-size2];  
-      for(unsigned int i=size3; i<size4; ++i)
-	sd[i] = cdata.rcorr_[i-size3];
-      for(unsigned int i=size4; i<size5; ++i)
-	sd[i] = cdata.weights_[i-size4];
-      sf[0] = (cdata.xmin_ > 0) ? cdata.xmin_ : 0.1;
-      sf[1] = cdata.xmax_;
-      sf[2] = cdata.aterm_;
-      sf[3] = cdata.cterm_;
-    }
-}
-
-__device__ 
-void set_shared_memory(const int& tid, double*& sd, float*& sf, int*& si, const HGChefUncalibratedRecHitConstantData& cdata, const int& size1, const int& size2, const int& size3, const int& size4, const int& size5)
-{
-  const int initial_pad = 2;
-  if(tid == 0)
-    {
-      sd[0] = cdata.hgcHEF_keV2DIGI_;
-      sd[1] = cdata.hgchefUncalib2GeV_;
-      for(unsigned int i=initial_pad; i<size1; ++i)
-	sd[i] = cdata.hgcHEF_fCPerMIP_[i-initial_pad];
-      for(unsigned int i=size1; i<size2; ++i)
-	sd[i] = cdata.hgcHEF_cce_[i-size1];
-      for(unsigned int i=size2; i<size3; ++i)
-	sd[i] = cdata.hgcHEF_noise_fC_[i-size2];  
-      for(unsigned int i=size3; i<size4; ++i)
-	sd[i] = cdata.rcorr_[i-size3];
-      for(unsigned int i=size4; i<size5; ++i)
-	sd[i] = cdata.weights_[i-size4];
-      sf[0] = (cdata.xmin_ > 0) ? cdata.xmin_ : 0.1;
-      sf[1] = cdata.xmax_;
-      sf[2] = cdata.aterm_;
-      sf[3] = cdata.cterm_;
-    }
-}
-
-__device__ 
-void set_shared_memory(const int& tid, double*& sd, float*& sf, const HGChebUncalibratedRecHitConstantData& cdata, const int& size1)
-{
-  const int initial_pad = 3;
-  if(tid == 0)
-    {
-      sd[0] = cdata.hgcHEB_keV2DIGI_;
-      sd[1] = cdata.hgchebUncalib2GeV_;
-      sd[2] = cdata.hgcHEB_noise_MIP_;
-      for(unsigned int i=initial_pad; i<size1; ++i)
-	sd[i] = cdata.weights_[i-initial_pad];
-    }
+  return fCPerMIP[type - 1];
 }
 
 __global__
@@ -146,28 +89,17 @@ void ee_to_rechit(HGCRecHitSoA dst_soa, HGCUncalibratedRecHitSoA src_soa, const
 {
   unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
   HeterogeneousHGCSiliconDetId detid(src_soa.id_[tid]);
-  int size1 = cdata.s_hgcEE_fCPerMIP_ + 2;
-  int size2 = cdata.s_hgcEE_cce_      + size1;
-  int size3 = cdata.s_hgcEE_noise_fC_ + size2;
-  int size4 = cdata.s_rcorr_          + size3; 
-  int size5 = cdata.s_weights_        + size4; 
-
-  extern __shared__ double s[];
-  double   *sd = s;
-  float    *sf = (float*)   (sd + cdata.ndelem_);
-  int      *si = (int*)     (sf + cdata.nfelem_);
-  set_shared_memory(threadIdx.x, sd, sf, si, cdata, size1, size2, size3, size4, size5);
-  __syncthreads();
 
   for (unsigned int i = tid; i < length; i += blockDim.x * gridDim.x)
     {
-      double weight = get_weight_from_layer(size4, detid.layer(), sd);
-      double rcorr = get_thickness_correction(size3, detid.layer(), sd);
-      double noise = get_noise(size2, detid.type(), sd);
-      double cce_correction = get_cce_correction(size1, detid.type(), sd);
-      double fCPerMIP = get_fCPerMIP(2, detid.type(), sd);
-      double sigmaNoiseGeV = 1e-3 * weight * rcorr * __fdividef( noise, fCPerMIP );
-      make_rechit(i, dst_soa, src_soa, false, weight, rcorr, cce_correction, sigmaNoiseGeV, sf);
+      double weight         = get_weight_from_layer(detid.layer(), cdata.weights_);
+      double rcorr          = get_thickness_correction(detid.type(), cdata.rcorr_);
+      double noise          = get_noise(detid.type(), cdata.noise_fC_);
+      double cce_correction = get_cce_correction(detid.type(), cdata.cce_);
+      double fCPerMIP       = get_fCPerMIP(detid.type(), cdata.fCPerMIP_);
+      double sigmaNoiseGeV  = 1e-3 * weight * rcorr * __fdividef( noise,  fCPerMIP );
+      make_rechit(i, dst_soa, src_soa, false, weight, rcorr, cce_correction, sigmaNoiseGeV,
+		  cdata.xmin_, cdata.xmax_, cdata.aterm_, cdata.cterm_);
     }
 }
 
@@ -178,28 +110,16 @@ void hef_to_rechit(HGCRecHitSoA dst_soa, HGCUncalibratedRecHitSoA src_soa, const
   HeterogeneousHGCSiliconDetId detid(src_soa.id_[tid]);
   printf("waferTypeL: %d - cellCoarseY: %lf - cellX: %d\n", conds->params.waferTypeL_[0], conds->params.cellCoarseY_[12], detid.cellX());
 
-  int size1 = cdata.s_hgcHEF_fCPerMIP_ + 2;
-  int size2 = cdata.s_hgcHEF_cce_      + size1;
-  int size3 = cdata.s_hgcHEF_noise_fC_ + size2;
-  int size4 = cdata.s_rcorr_           + size3; 
-  int size5 = cdata.s_weights_         + size4; 
-
-  extern __shared__ double s[];
-  double   *sd = s;
-  float    *sf = (float*)   (sd + cdata.ndelem_);
-  int      *si = (int*)     (sf + cdata.nuelem_);
-  set_shared_memory(threadIdx.x, sd, sf, si, cdata, size1, size2, size3, size4, size5);
-  __syncthreads();
-
   for (unsigned int i = tid; i < length; i += blockDim.x * gridDim.x)
     {
-      double weight = get_weight_from_layer(size4, detid.layer(), sd);
-      double rcorr = get_thickness_correction(size3, detid.type(), sd);
-      double noise = get_noise(size2, detid.type(), sd);
-      double cce_correction = get_cce_correction(size1, detid.type(), sd);
-      double fCPerMIP = get_fCPerMIP(2, detid.type(), sd);
-      double sigmaNoiseGeV = 1e-3 * weight * rcorr * __fdividef( noise,  fCPerMIP );
-      make_rechit(i, dst_soa, src_soa, false, weight, rcorr, cce_correction, sigmaNoiseGeV, sf);
+      double weight         = get_weight_from_layer(detid.layer(), cdata.weights_);
+      double rcorr          = get_thickness_correction(detid.type(), cdata.rcorr_);
+      double noise          = get_noise(detid.type(), cdata.noise_fC_);
+      double cce_correction = get_cce_correction(detid.type(), cdata.cce_);
+      double fCPerMIP       = get_fCPerMIP(detid.type(), cdata.fCPerMIP_);
+      double sigmaNoiseGeV  = 1e-3 * weight * rcorr * __fdividef( noise,  fCPerMIP );
+      make_rechit(i, dst_soa, src_soa, false, weight, rcorr, cce_correction, sigmaNoiseGeV,
+		  cdata.xmin_, cdata.xmax_, cdata.aterm_, cdata.cterm_);
     }
 }
 
@@ -208,19 +128,13 @@ void heb_to_rechit(HGCRecHitSoA dst_soa, HGCUncalibratedRecHitSoA src_soa, const
 {
   unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
   HeterogeneousHGCScintillatorDetId detid(src_soa.id_[tid]);
-  int size1 = cdata.s_weights_ + 3; 
-
-  extern __shared__ double s[];
-  double   *sd = s;
-  float    *sf = (float*)   (sd + cdata.ndelem_);
-  set_shared_memory(threadIdx.x, sd, sf, cdata, size1);
-  __syncthreads();
 
   for (unsigned int i = tid; i < length; i += blockDim.x * gridDim.x)
     {
-      double weight = get_weight_from_layer(3, detid.layer(), sd);
-      double noise = sd[2];
+      double weight        = get_weight_from_layer(detid.layer(), cdata.weights_);
+      double noise         = cdata.noise_MIP_;
       double sigmaNoiseGeV = 1e-3 * noise * weight;
-      make_rechit(i, dst_soa, src_soa, true, weight, 0., 0., sigmaNoiseGeV, sf);
+      make_rechit(i, dst_soa, src_soa, true, weight, 0., 0., sigmaNoiseGeV,
+		  0, 0, 0, 0);
     }
 }