cms-sw · cmsbuild · Sep 9, 2022 · Aug 25, 2022 · Aug 25, 2022 · Aug 26, 2022
diff --git a/Configuration/PyReleaseValidation/README.md b/Configuration/PyReleaseValidation/README.md
@@ -61,7 +61,8 @@ The offsets currently in use are:
 * 0.21: Production-like sequence
 * 0.24: 0 Tesla (Run-2, Run-3)
 * 0.31: Photon energy corrections with DRN architecture
-* 0.61: `phase2_ecal_devel` era
+* 0.61: ECAL `phase2_ecal_devel` era, on CPU
+* 0.612: ECAL `phase2_ecal_devel` era, with automatic offload to GPU if available
 * 0.75: Phase-2 HLT
 * 0.91: Track DNN modifier
 * 0.97: Premixing stage1

diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
@@ -1388,36 +1388,58 @@ def condition(self, fragment, stepList, key, hasHarvest):
     offset = 0.6,
 )
 
+# ECAL Phase 2 development WF
 class UpgradeWorkflow_ecalDevel(UpgradeWorkflow):
+    def __init__(self, digi = {}, reco = {}, harvest = {}, **kwargs):
+        # adapt the parameters for the UpgradeWorkflow init method
+        super(UpgradeWorkflow_ecalDevel, self).__init__(
+            steps = [
+                'DigiTrigger',
+                'RecoGlobal',
+                'HARVESTGlobal',
+            ],
+            PU = [
+                'DigiTrigger',
+                'RecoGlobal',
+                'HARVESTGlobal',
+            ],
+            **kwargs)
+        self.__digi = digi
+        self.__reco = reco
+        self.__harvest = harvest
+
     def setup_(self, step, stepName, stepDict, k, properties):
         # temporarily remove trigger & downstream steps
         mods = {'--era': stepDict[step][k]['--era']+',phase2_ecal_devel'}
         if 'Digi' in step:
             mods['-s'] = 'DIGI:pdigi_valid,DIGI2RAW'
+            mods |= self.__digi
         elif 'Reco' in step:
             mods['-s'] = 'RAW2DIGI,RECO:reconstruction_ecalOnly,VALIDATION:@ecalOnlyValidation,DQM:@ecalOnly'
             mods['--datatier'] = 'GEN-SIM-RECO,DQMIO'
             mods['--eventcontent'] = 'FEVTDEBUGHLT,DQM'
+            mods |= self.__reco
         elif 'HARVEST' in step:
             mods['-s'] = 'HARVESTING:@ecalOnlyValidation+@ecal'
+            mods |= self.__harvest
         stepDict[stepName][k] = merge([mods, stepDict[step][k]])
+
     def condition(self, fragment, stepList, key, hasHarvest):
         return fragment=="TTbar_14TeV" and '2026' in key
+
+# ECAL Phase 2 workflow running on CPU
 upgradeWFs['ecalDevel'] = UpgradeWorkflow_ecalDevel(
-    steps = [
-        'DigiTrigger',
-        'RecoGlobal',
-        'HARVESTGlobal',
-    ],
-    PU = [
-        'DigiTrigger',
-        'RecoGlobal',
-        'HARVESTGlobal',
-    ],
     suffix = '_ecalDevel',
     offset = 0.61,
 )
 
+# ECAL Phase 2 workflow running on CPU or GPU (if available)
+upgradeWFs['ecalDevelGPU'] = UpgradeWorkflow_ecalDevel(
+    reco = {'--procModifiers': 'gpu'},
+    suffix = '_ecalDevelGPU',
+    offset = 0.612,
+)
+
 class UpgradeWorkflow_0T(UpgradeWorkflow):
     def setup_(self, step, stepName, stepDict, k, properties):
         myGT=stepDict[step][k]['--conditions']

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalPhase2DigiToGPUProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalPhase2DigiToGPUProducer.cc
@@ -12,24 +12,18 @@
 
 #include "DeclsForKernelsPhase2.h"
 
-class EcalPhase2DigiToGPUProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+class EcalPhase2DigiToGPUProducer : public edm::stream::EDProducer<> {
 public:
   explicit EcalPhase2DigiToGPUProducer(const edm::ParameterSet& ps);
   ~EcalPhase2DigiToGPUProducer() override = default;
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
-  void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder holder) override;
   void produce(edm::Event& evt, edm::EventSetup const& setup) override;
 
 private:
   const edm::EDGetTokenT<EBDigiCollectionPh2> digiCollectionToken_;
   const edm::EDPutTokenT<cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>>
       digisCollectionToken_;
-  uint32_t size_;
-
-  ecal::DigisCollection<::calo::common::DevStoragePolicy> digis_;
-
-  cms::cuda::ContextState cudaState_;
 };
 
 void EcalPhase2DigiToGPUProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -46,24 +40,24 @@ EcalPhase2DigiToGPUProducer::EcalPhase2DigiToGPUProducer(const edm::ParameterSet
       digisCollectionToken_(produces<cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>>(
           ps.getParameter<std::string>("digisLabelEB"))) {}
 
-void EcalPhase2DigiToGPUProducer::acquire(edm::Event const& event,
-                                          edm::EventSetup const& setup,
-                                          edm::WaitingTaskWithArenaHolder holder) {
-  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+void EcalPhase2DigiToGPUProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{event.streamID()};
 
   //input data from event
   const auto& pdigis = event.get(digiCollectionToken_);
 
-  size_ = pdigis.size();
+  const uint32_t size = pdigis.size();
+
+  ecal::DigisCollection<::calo::common::DevStoragePolicy> digis;
+  digis.size = size;
 
-  digis_.size = size_;
   //allocate device pointers for output
-  digis_.ids = cms::cuda::make_device_unique<uint32_t[]>(size_, ctx.stream());
-  digis_.data = cms::cuda::make_device_unique<uint16_t[]>(size_ * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
+  digis.ids = cms::cuda::make_device_unique<uint32_t[]>(size, ctx.stream());
+  digis.data = cms::cuda::make_device_unique<uint16_t[]>(size * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
 
   //allocate host pointers for holding product data and id vectors
-  auto idstmp = cms::cuda::make_host_unique<uint32_t[]>(size_, ctx.stream());
-  auto datatmp = cms::cuda::make_host_unique<uint16_t[]>(size_ * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
+  auto idstmp = cms::cuda::make_host_unique<uint32_t[]>(size, ctx.stream());
+  auto datatmp = cms::cuda::make_host_unique<uint16_t[]>(size * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
 
   //iterate over digis
   uint32_t i = 0;
@@ -82,22 +76,16 @@ void EcalPhase2DigiToGPUProducer::acquire(edm::Event const& event,
   }
 
   //copy output vectors into member variable device pointers for the output struct
-
   cudaCheck(
-      cudaMemcpyAsync(digis_.ids.get(), idstmp.get(), size_ * sizeof(uint32_t), cudaMemcpyHostToDevice, ctx.stream()));
-  cudaCheck(cudaMemcpyAsync(digis_.data.get(),
+      cudaMemcpyAsync(digis.ids.get(), idstmp.get(), size * sizeof(uint32_t), cudaMemcpyHostToDevice, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(digis.data.get(),
                             datatmp.get(),
-                            size_ * EcalDataFrame_Ph2::MAXSAMPLES * sizeof(uint16_t),
+                            size * EcalDataFrame_Ph2::MAXSAMPLES * sizeof(uint16_t),
                             cudaMemcpyHostToDevice,
                             ctx.stream()));
-}
-
-void EcalPhase2DigiToGPUProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
-  //get cuda context state for producer
-  cms::cuda::ScopedContextProduce ctx{cudaState_};
 
   //emplace output in the context
-  ctx.emplace(event, digisCollectionToken_, std::move(digis_));
+  ctx.emplace(event, digisCollectionToken_, std::move(digis));
 }
 
 DEFINE_FWK_MODULE(EcalPhase2DigiToGPUProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitPhase2WeightsKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitPhase2WeightsKernels.cu
@@ -19,36 +19,39 @@ namespace ecal {
                                         double const* __restrict__ weights,
                                         uint32_t* __restrict__ flags) {
       constexpr int nsamples = EcalDataFrame_Ph2::MAXSAMPLES;
-      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
       unsigned int nchannels_per_block = blockDim.x;
-      unsigned int const threadx = threadIdx.x;
 
-      if (tx < nchannels) {
-        extern __shared__ char shared_mem[];
-        double* shr_weights = (double*)&shared_mem[0];
-        float* shr_amp = (float*)&shared_mem[nsamples * sizeof(double)];
-        uint16_t* shr_digis = (uint16_t*)&shared_mem[nsamples * sizeof(double) + nchannels_per_block * sizeof(float)];
-        for (int i = 0; i < nsamples; ++i)
-          shr_weights[i] = weights[i];
+      // copy data from global to shared memory
+      extern __shared__ char shared_mem[];
+      double* shr_weights = reinterpret_cast<double*>(shared_mem);                       // nsamples elements
+      float* shr_amp = reinterpret_cast<float*>(shr_weights + nsamples);                 // nchannels_per_block elements
+      uint16_t* shr_digis = reinterpret_cast<uint16_t*>(shr_amp + nchannels_per_block);  // nchannels_per_block elements
+      for (int i = 0; i < nsamples; ++i)
+        shr_weights[i] = weights[i];
 
-        unsigned int const bx = blockIdx.x;  //block index
+      unsigned int const threadx = threadIdx.x;
+      unsigned int const blockx = blockIdx.x;
 
-        for (int sample = 0; sample < nsamples; ++sample) {
-          int const idx = threadx * nsamples + sample;
-          shr_digis[idx] = digis_in[bx * nchannels_per_block * nsamples + idx];
-        }
-        shr_amp[threadx] = 0.0;
-        __syncthreads();
+      for (int sample = 0; sample < nsamples; ++sample) {
+        int const idx = threadx * nsamples + sample;
+        shr_digis[idx] = digis_in[blockx * nchannels_per_block * nsamples + idx];
+      }
+      shr_amp[threadx] = 0.;
+
+      __syncthreads();
 
+      const auto first = threadIdx.x + blockIdx.x * blockDim.x;
+      const auto stride = blockDim.x * gridDim.x;
+      for (auto tx = first; tx < nchannels; tx += stride) {
         auto const did = DetId{dids[tx]};
         CMS_UNROLL_LOOP
         for (int sample = 0; sample < nsamples; ++sample) {
-          const unsigned int idx = threadIdx.x * nsamples + sample;
+          const unsigned int idx = threadx * nsamples + sample;
           const auto shr_digi = shr_digis[idx];
           shr_amp[threadx] += (static_cast<float>(ecalLiteDTU::adc(shr_digi)) *
                                ecalPh2::gains[ecalLiteDTU::gainId(shr_digi)] * shr_weights[sample]);
         }
-        const unsigned int tdx = threadIdx.x * nsamples;
+        const unsigned int tdx = threadx * nsamples;
         amplitude[tx] = shr_amp[threadx];
         amplitudeError[tx] = 1.0f;
         dids_out[tx] = did.rawId();

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitPhase2WeightsProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitPhase2WeightsProducerGPU.cc
@@ -11,14 +11,13 @@
 #include "EcalUncalibRecHitPhase2WeightsAlgoGPU.h"
 #include "DeclsForKernelsPhase2.h"
 
-class EcalUncalibRecHitPhase2WeightsProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+class EcalUncalibRecHitPhase2WeightsProducerGPU : public edm::stream::EDProducer<> {
 public:
   explicit EcalUncalibRecHitPhase2WeightsProducerGPU(edm::ParameterSet const &ps);
   ~EcalUncalibRecHitPhase2WeightsProducerGPU() override = default;
   static void fillDescriptions(edm::ConfigurationDescriptions &);
 
 private:
-  void acquire(edm::Event const &, edm::EventSetup const &, edm::WaitingTaskWithArenaHolder) override;
   void produce(edm::Event &, edm::EventSetup const &) override;
 
 private:
@@ -31,10 +30,6 @@ class EcalUncalibRecHitPhase2WeightsProducerGPU : public edm::stream::EDProducer
 
   // event data
   ecal::weights::EventOutputDataGPU eventOutputDataGPU_;
-
-  cms::cuda::ContextState cudaState_;
-
-  uint32_t size_;
 };
 
 // constructor with initialisation of elements
@@ -74,43 +69,35 @@ void EcalUncalibRecHitPhase2WeightsProducerGPU::fillDescriptions(edm::Configurat
   descriptions.addWithDefaultLabel(desc);
 }
 
-// aquire function which initislises objects on host and device to their actual objects and calls kernal
-void EcalUncalibRecHitPhase2WeightsProducerGPU::acquire(edm::Event const &event,
-                                                        edm::EventSetup const &setup,
-                                                        edm::WaitingTaskWithArenaHolder holder) {
+void EcalUncalibRecHitPhase2WeightsProducerGPU::produce(edm::Event &event, const edm::EventSetup &setup) {
   // cuda products
   auto const &digisProduct = event.get(digisToken_);
   // raii
-  cms::cuda::ScopedContextAcquire ctx{digisProduct, std::move(holder), cudaState_};
+  cms::cuda::ScopedContextProduce ctx{digisProduct};
 
   // get actual obj
   auto const &digis = ctx.get(digisProduct);
 
-  size_ = digis.size;
-
-  // if no digis stop here
-  if (size_ == 0)
-    return;
+  const uint32_t size = digis.size;
 
-  auto weights_d = cms::cuda::make_device_unique<double[]>(EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
+  // do not run the algo if there are no digis
+  if (size > 0) {
+    auto weights_d = cms::cuda::make_device_unique<double[]>(EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
 
-  cudaCheck(cudaMemcpyAsync(weights_d.get(),
-                            weights_.data(),
-                            EcalDataFrame_Ph2::MAXSAMPLES * sizeof(double),
-                            cudaMemcpyHostToDevice,
-                            ctx.stream()));
+    cudaCheck(cudaMemcpyAsync(weights_d.get(),
+                              weights_.data(),
+                              EcalDataFrame_Ph2::MAXSAMPLES * sizeof(double),
+                              cudaMemcpyHostToDevice,
+                              ctx.stream()));
 
-  // output on GPU
-  eventOutputDataGPU_.allocate(size_, ctx.stream());
+    // output on GPU
+    eventOutputDataGPU_.allocate(size, ctx.stream());
 
-  ecal::weights::phase2Weights(digis, eventOutputDataGPU_, weights_d, ctx.stream());
-}
-
-void EcalUncalibRecHitPhase2WeightsProducerGPU::produce(edm::Event &event, const edm::EventSetup &setup) {
-  cms::cuda::ScopedContextProduce ctx{cudaState_};
+    ecal::weights::phase2Weights(digis, eventOutputDataGPU_, weights_d, ctx.stream());
+  }
 
   // set the size of digis
-  eventOutputDataGPU_.recHits.size = size_;
+  eventOutputDataGPU_.recHits.size = size;
 
   // put into the event
   ctx.emplace(event, recHitsToken_, std::move(eventOutputDataGPU_.recHits));

diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalUncalibRecHitPhase2_cff.py b/RecoLocalCalo/EcalRecProducers/python/ecalUncalibRecHitPhase2_cff.py
@@ -36,8 +36,8 @@
 from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitConvertGPU2CPUFormat_cfi import ecalUncalibRecHitConvertGPU2CPUFormat as _ecalUncalibRecHitConvertGPU2CPUFormat
 gpu.toModify(ecalUncalibRecHitPhase2,
     cuda = _ecalUncalibRecHitConvertGPU2CPUFormat.clone(
-        isPhase2 = cms.bool(True),
-        recHitsLabelGPUEB = cms.InputTag('ecalUncalibRecHitSoA', 'EcalUncalibRecHitsEB'),
+        isPhase2 = True,
+        recHitsLabelGPUEB = ('ecalUncalibRecHitSoA', 'EcalUncalibRecHitsEB'),
         recHitsLabelGPUEE = None,  # remove unneeded Phase1 parameters
         recHitsLabelCPUEE = None
     )