Merge pull request #34618 from dan131riley/gpu-sistripclusterizer

CUDA implementation of RecoLocalTracker/SiStripCluster ClustersFromRawProducer
cms-sw · Mar 27, 2023 · 1805fa8 · 1805fa8
2 parents 68e9674 + 44b2092
commit 1805fa8
Show file tree

Hide file tree

Showing 40 changed files with 2,182 additions and 28 deletions.
diff --git a/CUDADataFormats/SiStripCluster/BuildFile.xml b/CUDADataFormats/SiStripCluster/BuildFile.xml
@@ -0,0 +1,10 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
+<use name="rootcore"/>
+
+<export>
+    <lib name="1"/>
+</export>
+
diff --git a/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
@@ -0,0 +1,59 @@
+#ifndef CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
+#define CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+#include <cuda_runtime.h>
+
+class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::device::unique_ptr> {
+public:
+  SiStripClustersCUDADevice() = default;
+  explicit SiStripClustersCUDADevice(uint32_t maxClusters, uint32_t maxStripsPerCluster, cudaStream_t stream);
+  ~SiStripClustersCUDADevice() override = default;
+
+  SiStripClustersCUDADevice(const SiStripClustersCUDADevice &) = delete;
+  SiStripClustersCUDADevice &operator=(const SiStripClustersCUDADevice &) = delete;
+  SiStripClustersCUDADevice(SiStripClustersCUDADevice &&) = default;
+  SiStripClustersCUDADevice &operator=(SiStripClustersCUDADevice &&) = default;
+
+  struct DeviceView {
+    uint32_t *clusterIndex_;
+    uint32_t *clusterSize_;
+    uint8_t *clusterADCs_;
+    stripgpu::detId_t *clusterDetId_;
+    stripgpu::stripId_t *firstStrip_;
+    bool *trueCluster_;
+    float *barycenter_;
+    float *charge_;
+    uint32_t nClusters_;
+    uint32_t maxClusterSize_;
+  };
+
+  DeviceView *view() const { return view_d.get(); }
+  uint32_t nClusters() const { return nClusters_; }
+  uint32_t *nClustersPtr() { return &nClusters_; }
+  uint32_t maxClusterSize() const { return maxClusterSize_; }
+  uint32_t *maxClusterSizePtr() { return &maxClusterSize_; }
+
+private:
+  cms::cuda::device::unique_ptr<DeviceView> view_d;  // "me" pointer
+  uint32_t nClusters_;
+  uint32_t maxClusterSize_;
+};
+
+class SiStripClustersCUDAHost : public SiStripClustersSOABase<cms::cuda::host::unique_ptr> {
+public:
+  SiStripClustersCUDAHost() = default;
+  explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d, cudaStream_t stream);
+  ~SiStripClustersCUDAHost() override = default;
+
+  SiStripClustersCUDAHost(const SiStripClustersCUDAHost &) = delete;
+  SiStripClustersCUDAHost &operator=(const SiStripClustersCUDAHost &) = delete;
+  SiStripClustersCUDAHost(SiStripClustersCUDAHost &&) = default;
+  SiStripClustersCUDAHost &operator=(SiStripClustersCUDAHost &&) = default;
+};
+
+#endif
diff --git a/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
@@ -0,0 +1,59 @@
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+SiStripClustersCUDADevice::SiStripClustersCUDADevice(uint32_t maxClusters,
+                                                     uint32_t maxStripsPerCluster,
+                                                     cudaStream_t stream) {
+  maxClusterSize_ = maxStripsPerCluster;
+
+  clusterIndex_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
+  clusterSize_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
+  clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * maxStripsPerCluster, stream);
+  clusterDetId_ = cms::cuda::make_device_unique<stripgpu::detId_t[]>(maxClusters, stream);
+  firstStrip_ = cms::cuda::make_device_unique<stripgpu::stripId_t[]>(maxClusters, stream);
+  trueCluster_ = cms::cuda::make_device_unique<bool[]>(maxClusters, stream);
+  barycenter_ = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
+  charge_ = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
+
+  auto view = cms::cuda::make_host_unique<DeviceView>(stream);
+  view->clusterIndex_ = clusterIndex_.get();
+  view->clusterSize_ = clusterSize_.get();
+  view->clusterADCs_ = clusterADCs_.get();
+  view->clusterDetId_ = clusterDetId_.get();
+  view->firstStrip_ = firstStrip_.get();
+  view->trueCluster_ = trueCluster_.get();
+  view->barycenter_ = barycenter_.get();
+  view->charge_ = charge_.get();
+  view->maxClusterSize_ = maxStripsPerCluster;
+
+  view_d = cms::cuda::make_device_unique<DeviceView>(stream);
+  cms::cuda::copyAsync(view_d, view, stream);
+#ifdef GPU_CHECK
+  cudaCheck(cudaStreamSynchronize(stream));
+#endif
+}
+
+SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
+  nClusters_ = clusters_d.nClusters();
+  maxClusterSize_ = clusters_d.maxClusterSize();
+  clusterIndex_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
+  clusterSize_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
+  clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(nClusters_ * maxClusterSize_, stream);
+  clusterDetId_ = cms::cuda::make_host_unique<stripgpu::detId_t[]>(nClusters_, stream);
+  firstStrip_ = cms::cuda::make_host_unique<stripgpu::stripId_t[]>(nClusters_, stream);
+  trueCluster_ = cms::cuda::make_host_unique<bool[]>(nClusters_, stream);
+  barycenter_ = cms::cuda::make_host_unique<float[]>(nClusters_, stream);
+  charge_ = cms::cuda::make_host_unique<float[]>(nClusters_, stream);
+
+  cms::cuda::copyAsync(clusterIndex_, clusters_d.clusterIndex(), nClusters_, stream);
+  cms::cuda::copyAsync(clusterSize_, clusters_d.clusterSize(), nClusters_, stream);
+  cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * maxClusterSize_, stream);
+  cms::cuda::copyAsync(clusterDetId_, clusters_d.clusterDetId(), nClusters_, stream);
+  cms::cuda::copyAsync(firstStrip_, clusters_d.firstStrip(), nClusters_, stream);
+  cms::cuda::copyAsync(trueCluster_, clusters_d.trueCluster(), nClusters_, stream);
+  cms::cuda::copyAsync(barycenter_, clusters_d.barycenter(), nClusters_, stream);
+  cms::cuda::copyAsync(charge_, clusters_d.charge(), nClusters_, stream);
+#ifdef GPU_CHECK
+  cudaCheck(cudaStreamSynchronize(stream));
+#endif
+}
diff --git a/CUDADataFormats/SiStripCluster/src/classes.h b/CUDADataFormats/SiStripCluster/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_SiStripCluster_classes_h
+#define CUDADataFormats_SiStripCluster_classes_h
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/SiStripCluster/src/classes_def.xml b/CUDADataFormats/SiStripCluster/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="cms::cuda::Product<SiStripClustersCUDADevice>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<SiStripClustersCUDADevice>>" persistent="false"/>
+  <class name="SiStripClustersCUDAHost" persistent="false"/>
+  <class name="edm::Wrapper<SiStripClustersCUDAHost>" persistent="false"/>
+</lcgdict>
diff --git a/CalibFormats/SiStripObjects/BuildFile.xml b/CalibFormats/SiStripObjects/BuildFile.xml
@@ -2,6 +2,9 @@
 <use name="DataFormats/SiStripCommon"/>
 <use name="CondFormats/SiStripObjects"/>
 <use name="DataFormats/TrackerCommon"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
@@ -0,0 +1,137 @@
+#ifndef CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
+#define CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+#include "DataFormats/SiStripCommon/interface/ConstantsForHardwareSystems.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+class SiStripQuality;
+class SiStripGain;
+class SiStripNoises;
+
+namespace stripgpu {
+  __host__ __device__ inline fedId_t fedIndex(fedId_t fed) { return fed - sistrip::FED_ID_MIN; }
+  __host__ __device__ inline std::uint32_t stripIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    return fedIndex(fed) * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH + channel * sistrip::STRIPS_PER_FEDCH +
+           (strip % sistrip::STRIPS_PER_FEDCH);
+  }
+  __host__ __device__ inline std::uint32_t apvIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    return fedIndex(fed) * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED + sistrip::APVS_PER_CHAN * channel +
+           (strip % sistrip::STRIPS_PER_FEDCH) / sistrip::STRIPS_PER_APV;
+  }
+  __host__ __device__ inline std::uint32_t channelIndex(fedId_t fed, fedCh_t channel) {
+    return fedIndex(fed) * sistrip::FEDCH_PER_FED + channel;
+  }
+
+  class SiStripClusterizerConditionsGPU {
+  public:
+    class DetToFed {
+    public:
+      DetToFed(detId_t detid, apvPair_t ipair, fedId_t fedid, fedCh_t fedch)
+          : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
+      detId_t detID() const { return detid_; }
+      apvPair_t pair() const { return ipair_; }
+      fedId_t fedID() const { return fedid_; }
+      fedCh_t fedCh() const { return fedch_; }
+
+    private:
+      detId_t detid_;
+      apvPair_t ipair_;
+      fedId_t fedid_;
+      fedCh_t fedch_;
+    };
+    using DetToFeds = std::vector<DetToFed>;
+
+    static constexpr std::uint16_t badBit = 1 << 15;
+
+    class Data {
+    public:
+      struct DeviceView {
+        __device__ inline detId_t detID(fedId_t fed, fedCh_t channel) const {
+          return detID_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline apvPair_t iPair(fedId_t fed, fedCh_t channel) const {
+          return iPair_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float invthick(fedId_t fed, fedCh_t channel) const {
+          return invthick_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float noise(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          // noise is stored as 9 bits with a fixed point scale factor of 0.1
+          return 0.1f * (noise_[stripIndex(fed, channel, strip)] & ~badBit);
+        }
+
+        __device__ inline float gain(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return gain_[apvIndex(fed, channel, strip)];
+        }
+
+        __device__ inline bool bad(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return badBit == (noise_[stripIndex(fed, channel, strip)] & badBit);
+        }
+        const std::uint16_t* noise_;  //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH];
+        const float* invthick_;       //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+        const detId_t* detID_;        //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+        const apvPair_t* iPair_;      //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+        const float* gain_;           //[sistrip::NUMBER_OF_FEDS*sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED];
+      };
+
+      const DeviceView* deviceView() const { return deviceView_.get(); }
+
+      cms::cuda::device::unique_ptr<DeviceView> deviceView_;
+      cms::cuda::host::unique_ptr<DeviceView> hostView_;
+
+      cms::cuda::device::unique_ptr<std::uint16_t[]>
+          noise_;  //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH];
+      cms::cuda::device::unique_ptr<float[]> invthick_;   //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+      cms::cuda::device::unique_ptr<detId_t[]> detID_;    //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+      cms::cuda::device::unique_ptr<apvPair_t[]> iPair_;  //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+      cms::cuda::device::unique_ptr<float[]>
+          gain_;  //[sistrip::NUMBER_OF_FEDS*sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED];
+    };
+
+    SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+                                    const SiStripGain* gains,
+                                    const SiStripNoises& noises);
+    ~SiStripClusterizerConditionsGPU() = default;
+
+    // Function to return the actual payload on the memory of the current device
+    Data const& getGPUProductAsync(cudaStream_t stream) const;
+
+    const DetToFeds& detToFeds() const { return detToFeds_; }
+
+  private:
+    void setStrip(fedId_t fed, fedCh_t channel, stripId_t strip, std::uint16_t noise, float gain, bool bad) {
+      gain_[apvIndex(fed, channel, strip)] = gain;
+      noise_[stripIndex(fed, channel, strip)] = noise;
+      if (bad) {
+        noise_[stripIndex(fed, channel, strip)] |= badBit;
+      }
+    }
+
+    void setInvThickness(fedId_t fed, fedCh_t channel, float invthick) {
+      invthick_[channelIndex(fed, channel)] = invthick;
+    }
+
+    // Holds the data in pinned CPU memory
+    std::vector<std::uint16_t, cms::cuda::HostAllocator<std::uint16_t>> noise_;
+    std::vector<float, cms::cuda::HostAllocator<float>> invthick_;
+    std::vector<detId_t, cms::cuda::HostAllocator<detId_t>> detID_;
+    std::vector<apvPair_t, cms::cuda::HostAllocator<apvPair_t>> iPair_;
+    std::vector<float, cms::cuda::HostAllocator<float>> gain_;
+
+    // Helper that takes care of complexity of transferring the data to
+    // multiple devices
+    cms::cuda::ESProduct<Data> gpuData_;
+    DetToFeds detToFeds_;
+  };
+}  // namespace stripgpu
+
+#endif
diff --git a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
@@ -23,3 +23,6 @@ TYPELOOKUP_DATA_REG(SiStripQuality);
 
 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditions.h"
 TYPELOOKUP_DATA_REG(SiStripClusterizerConditions);
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+TYPELOOKUP_DATA_REG(stripgpu::SiStripClusterizerConditionsGPU);