Skip to content

Commit

Permalink
Merge pull request #34618 from dan131riley/gpu-sistripclusterizer
Browse files Browse the repository at this point in the history
CUDA implementation of RecoLocalTracker/SiStripCluster  ClustersFromRawProducer
  • Loading branch information
cmsbuild authored Mar 27, 2023
2 parents 68e9674 + 44b2092 commit 1805fa8
Show file tree
Hide file tree
Showing 40 changed files with 2,182 additions and 28 deletions.
10 changes: 10 additions & 0 deletions CUDADataFormats/SiStripCluster/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<use name="DataFormats/Common"/>
<use name="CUDADataFormats/Common"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
</export>

59 changes: 59 additions & 0 deletions CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#ifndef CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
#define CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h

#include "DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>

class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::device::unique_ptr> {
public:
SiStripClustersCUDADevice() = default;
explicit SiStripClustersCUDADevice(uint32_t maxClusters, uint32_t maxStripsPerCluster, cudaStream_t stream);
~SiStripClustersCUDADevice() override = default;

SiStripClustersCUDADevice(const SiStripClustersCUDADevice &) = delete;
SiStripClustersCUDADevice &operator=(const SiStripClustersCUDADevice &) = delete;
SiStripClustersCUDADevice(SiStripClustersCUDADevice &&) = default;
SiStripClustersCUDADevice &operator=(SiStripClustersCUDADevice &&) = default;

struct DeviceView {
uint32_t *clusterIndex_;
uint32_t *clusterSize_;
uint8_t *clusterADCs_;
stripgpu::detId_t *clusterDetId_;
stripgpu::stripId_t *firstStrip_;
bool *trueCluster_;
float *barycenter_;
float *charge_;
uint32_t nClusters_;
uint32_t maxClusterSize_;
};

DeviceView *view() const { return view_d.get(); }
uint32_t nClusters() const { return nClusters_; }
uint32_t *nClustersPtr() { return &nClusters_; }
uint32_t maxClusterSize() const { return maxClusterSize_; }
uint32_t *maxClusterSizePtr() { return &maxClusterSize_; }

private:
cms::cuda::device::unique_ptr<DeviceView> view_d; // "me" pointer
uint32_t nClusters_;
uint32_t maxClusterSize_;
};

class SiStripClustersCUDAHost : public SiStripClustersSOABase<cms::cuda::host::unique_ptr> {
public:
SiStripClustersCUDAHost() = default;
explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d, cudaStream_t stream);
~SiStripClustersCUDAHost() override = default;

SiStripClustersCUDAHost(const SiStripClustersCUDAHost &) = delete;
SiStripClustersCUDAHost &operator=(const SiStripClustersCUDAHost &) = delete;
SiStripClustersCUDAHost(SiStripClustersCUDAHost &&) = default;
SiStripClustersCUDAHost &operator=(SiStripClustersCUDAHost &&) = default;
};

#endif
59 changes: 59 additions & 0 deletions CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiStripClustersCUDADevice::SiStripClustersCUDADevice(uint32_t maxClusters,
uint32_t maxStripsPerCluster,
cudaStream_t stream) {
maxClusterSize_ = maxStripsPerCluster;

clusterIndex_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
clusterSize_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * maxStripsPerCluster, stream);
clusterDetId_ = cms::cuda::make_device_unique<stripgpu::detId_t[]>(maxClusters, stream);
firstStrip_ = cms::cuda::make_device_unique<stripgpu::stripId_t[]>(maxClusters, stream);
trueCluster_ = cms::cuda::make_device_unique<bool[]>(maxClusters, stream);
barycenter_ = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
charge_ = cms::cuda::make_device_unique<float[]>(maxClusters, stream);

auto view = cms::cuda::make_host_unique<DeviceView>(stream);
view->clusterIndex_ = clusterIndex_.get();
view->clusterSize_ = clusterSize_.get();
view->clusterADCs_ = clusterADCs_.get();
view->clusterDetId_ = clusterDetId_.get();
view->firstStrip_ = firstStrip_.get();
view->trueCluster_ = trueCluster_.get();
view->barycenter_ = barycenter_.get();
view->charge_ = charge_.get();
view->maxClusterSize_ = maxStripsPerCluster;

view_d = cms::cuda::make_device_unique<DeviceView>(stream);
cms::cuda::copyAsync(view_d, view, stream);
#ifdef GPU_CHECK
cudaCheck(cudaStreamSynchronize(stream));
#endif
}

SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
nClusters_ = clusters_d.nClusters();
maxClusterSize_ = clusters_d.maxClusterSize();
clusterIndex_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
clusterSize_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(nClusters_ * maxClusterSize_, stream);
clusterDetId_ = cms::cuda::make_host_unique<stripgpu::detId_t[]>(nClusters_, stream);
firstStrip_ = cms::cuda::make_host_unique<stripgpu::stripId_t[]>(nClusters_, stream);
trueCluster_ = cms::cuda::make_host_unique<bool[]>(nClusters_, stream);
barycenter_ = cms::cuda::make_host_unique<float[]>(nClusters_, stream);
charge_ = cms::cuda::make_host_unique<float[]>(nClusters_, stream);

cms::cuda::copyAsync(clusterIndex_, clusters_d.clusterIndex(), nClusters_, stream);
cms::cuda::copyAsync(clusterSize_, clusters_d.clusterSize(), nClusters_, stream);
cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * maxClusterSize_, stream);
cms::cuda::copyAsync(clusterDetId_, clusters_d.clusterDetId(), nClusters_, stream);
cms::cuda::copyAsync(firstStrip_, clusters_d.firstStrip(), nClusters_, stream);
cms::cuda::copyAsync(trueCluster_, clusters_d.trueCluster(), nClusters_, stream);
cms::cuda::copyAsync(barycenter_, clusters_d.barycenter(), nClusters_, stream);
cms::cuda::copyAsync(charge_, clusters_d.charge(), nClusters_, stream);
#ifdef GPU_CHECK
cudaCheck(cudaStreamSynchronize(stream));
#endif
}
8 changes: 8 additions & 0 deletions CUDADataFormats/SiStripCluster/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef CUDADataFormats_SiStripCluster_classes_h
#define CUDADataFormats_SiStripCluster_classes_h

#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
6 changes: 6 additions & 0 deletions CUDADataFormats/SiStripCluster/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<lcgdict>
<class name="cms::cuda::Product<SiStripClustersCUDADevice>" persistent="false"/>
<class name="edm::Wrapper<cms::cuda::Product<SiStripClustersCUDADevice>>" persistent="false"/>
<class name="SiStripClustersCUDAHost" persistent="false"/>
<class name="edm::Wrapper<SiStripClustersCUDAHost>" persistent="false"/>
</lcgdict>
3 changes: 3 additions & 0 deletions CalibFormats/SiStripObjects/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
<use name="DataFormats/SiStripCommon"/>
<use name="CondFormats/SiStripObjects"/>
<use name="DataFormats/TrackerCommon"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda"/>
<export>
<lib name="1"/>
</export>
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#ifndef CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
#define CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h

#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
#include "DataFormats/SiStripCommon/interface/ConstantsForHardwareSystems.h"

#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"

class SiStripQuality;
class SiStripGain;
class SiStripNoises;

namespace stripgpu {
__host__ __device__ inline fedId_t fedIndex(fedId_t fed) { return fed - sistrip::FED_ID_MIN; }
__host__ __device__ inline std::uint32_t stripIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
return fedIndex(fed) * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH + channel * sistrip::STRIPS_PER_FEDCH +
(strip % sistrip::STRIPS_PER_FEDCH);
}
__host__ __device__ inline std::uint32_t apvIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
return fedIndex(fed) * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED + sistrip::APVS_PER_CHAN * channel +
(strip % sistrip::STRIPS_PER_FEDCH) / sistrip::STRIPS_PER_APV;
}
__host__ __device__ inline std::uint32_t channelIndex(fedId_t fed, fedCh_t channel) {
return fedIndex(fed) * sistrip::FEDCH_PER_FED + channel;
}

class SiStripClusterizerConditionsGPU {
public:
class DetToFed {
public:
DetToFed(detId_t detid, apvPair_t ipair, fedId_t fedid, fedCh_t fedch)
: detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
detId_t detID() const { return detid_; }
apvPair_t pair() const { return ipair_; }
fedId_t fedID() const { return fedid_; }
fedCh_t fedCh() const { return fedch_; }

private:
detId_t detid_;
apvPair_t ipair_;
fedId_t fedid_;
fedCh_t fedch_;
};
using DetToFeds = std::vector<DetToFed>;

static constexpr std::uint16_t badBit = 1 << 15;

class Data {
public:
struct DeviceView {
__device__ inline detId_t detID(fedId_t fed, fedCh_t channel) const {
return detID_[channelIndex(fed, channel)];
}

__device__ inline apvPair_t iPair(fedId_t fed, fedCh_t channel) const {
return iPair_[channelIndex(fed, channel)];
}

__device__ inline float invthick(fedId_t fed, fedCh_t channel) const {
return invthick_[channelIndex(fed, channel)];
}

__device__ inline float noise(fedId_t fed, fedCh_t channel, stripId_t strip) const {
// noise is stored as 9 bits with a fixed point scale factor of 0.1
return 0.1f * (noise_[stripIndex(fed, channel, strip)] & ~badBit);
}

__device__ inline float gain(fedId_t fed, fedCh_t channel, stripId_t strip) const {
return gain_[apvIndex(fed, channel, strip)];
}

__device__ inline bool bad(fedId_t fed, fedCh_t channel, stripId_t strip) const {
return badBit == (noise_[stripIndex(fed, channel, strip)] & badBit);
}
const std::uint16_t* noise_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH];
const float* invthick_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
const detId_t* detID_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
const apvPair_t* iPair_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
const float* gain_; //[sistrip::NUMBER_OF_FEDS*sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED];
};

const DeviceView* deviceView() const { return deviceView_.get(); }

cms::cuda::device::unique_ptr<DeviceView> deviceView_;
cms::cuda::host::unique_ptr<DeviceView> hostView_;

cms::cuda::device::unique_ptr<std::uint16_t[]>
noise_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH];
cms::cuda::device::unique_ptr<float[]> invthick_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
cms::cuda::device::unique_ptr<detId_t[]> detID_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
cms::cuda::device::unique_ptr<apvPair_t[]> iPair_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
cms::cuda::device::unique_ptr<float[]>
gain_; //[sistrip::NUMBER_OF_FEDS*sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED];
};

SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
const SiStripGain* gains,
const SiStripNoises& noises);
~SiStripClusterizerConditionsGPU() = default;

// Function to return the actual payload on the memory of the current device
Data const& getGPUProductAsync(cudaStream_t stream) const;

const DetToFeds& detToFeds() const { return detToFeds_; }

private:
void setStrip(fedId_t fed, fedCh_t channel, stripId_t strip, std::uint16_t noise, float gain, bool bad) {
gain_[apvIndex(fed, channel, strip)] = gain;
noise_[stripIndex(fed, channel, strip)] = noise;
if (bad) {
noise_[stripIndex(fed, channel, strip)] |= badBit;
}
}

void setInvThickness(fedId_t fed, fedCh_t channel, float invthick) {
invthick_[channelIndex(fed, channel)] = invthick;
}

// Holds the data in pinned CPU memory
std::vector<std::uint16_t, cms::cuda::HostAllocator<std::uint16_t>> noise_;
std::vector<float, cms::cuda::HostAllocator<float>> invthick_;
std::vector<detId_t, cms::cuda::HostAllocator<detId_t>> detID_;
std::vector<apvPair_t, cms::cuda::HostAllocator<apvPair_t>> iPair_;
std::vector<float, cms::cuda::HostAllocator<float>> gain_;

// Helper that takes care of complexity of transferring the data to
// multiple devices
cms::cuda::ESProduct<Data> gpuData_;
DetToFeds detToFeds_;
};
} // namespace stripgpu

#endif
3 changes: 3 additions & 0 deletions CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ TYPELOOKUP_DATA_REG(SiStripQuality);

#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditions.h"
TYPELOOKUP_DATA_REG(SiStripClusterizerConditions);

#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
TYPELOOKUP_DATA_REG(stripgpu::SiStripClusterizerConditionsGPU);
Loading

0 comments on commit 1805fa8

Please sign in to comment.