Skip to content

Commit

Permalink
rebase to 12_0_X
Browse files Browse the repository at this point in the history
  • Loading branch information
Dan Riley authored and Dan Riley committed Jun 8, 2021
1 parent 2203a47 commit 39d686e
Show file tree
Hide file tree
Showing 28 changed files with 2,125 additions and 2 deletions.
10 changes: 10 additions & 0 deletions CUDADataFormats/SiStripCluster/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<use name="DataFormats/Common"/>
<use name="CUDADataFormats/Common"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="cuda"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
</export>

19 changes: 19 additions & 0 deletions CUDADataFormats/SiStripCluster/interface/GPUtypes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#ifndef CUDADataFormats_SiStripCluster_interface_GPUtypes_h
#define CUDADataFormats_SiStripCluster_interface_GPUtypes_h

#include <cstdint>
#include <limits>

namespace stripgpu {
using detId_t = std::uint32_t;
using fedId_t = std::uint16_t;
using fedCh_t = std::uint8_t;
using APVPair_t = std::uint16_t;
using stripId_t = std::uint16_t;

static constexpr detId_t invDet = std::numeric_limits<detId_t>::max();
static constexpr fedId_t invFed = std::numeric_limits<fedId_t>::max();
static constexpr stripId_t invStrip = std::numeric_limits<stripId_t>::max();
} // namespace stripgpu

#endif
144 changes: 144 additions & 0 deletions CUDADataFormats/SiStripCluster/interface/MkFitSiStripClustersCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#ifndef CUDADataFormats_SiStripCluster_interface_MkFitSiStripClustersCUDA_h
#define CUDADataFormats_SiStripCluster_interface_MkFitSiStripClustersCUDA_h

#include "CUDADataFormats/SiStripCluster/interface/GPUtypes.h"

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>

class MkFitSiStripClustersCUDA {
public:
MkFitSiStripClustersCUDA() = default;
explicit MkFitSiStripClustersCUDA(size_t maxClusters, int clustersPerStrip, cudaStream_t stream);
~MkFitSiStripClustersCUDA() = default;

MkFitSiStripClustersCUDA(const MkFitSiStripClustersCUDA &) = delete;
MkFitSiStripClustersCUDA &operator=(const MkFitSiStripClustersCUDA &) = delete;
MkFitSiStripClustersCUDA(MkFitSiStripClustersCUDA &&) = default;
MkFitSiStripClustersCUDA &operator=(MkFitSiStripClustersCUDA &&) = default;

void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; }

uint32_t nClusters() const { return nClusters_h; }

class GlobalDeviceView {
public:
// __device__ __forceinline__ float local_xx(int i) const { return __ldg(local_xx_ + i); }
// __device__ __forceinline__ float local_xy(int i) const { return __ldg(local_xy_ + i); }
// __device__ __forceinline__ float local_yy(int i) const { return __ldg(local_yy_ + i); }
// __device__ __forceinline__ float local(int i) const { return __ldg(local_ + i); }
__device__ __forceinline__ float global_x(int i) const { return __ldg(global_x_ + i); }
__device__ __forceinline__ float global_y(int i) const { return __ldg(global_y_ + i); }
__device__ __forceinline__ float global_z(int i) const { return __ldg(global_z_ + i); }

__device__ __forceinline__ float global_xx(int i) const { return __ldg(global_xx_ + i); }
__device__ __forceinline__ float global_xy(int i) const { return __ldg(global_xy_ + i); }
__device__ __forceinline__ float global_xz(int i) const { return __ldg(global_xz_ + i); }
__device__ __forceinline__ float global_yy(int i) const { return __ldg(global_yy_ + i); }
__device__ __forceinline__ float global_yz(int i) const { return __ldg(global_yz_ + i); }
__device__ __forceinline__ float global_zz(int i) const { return __ldg(global_zz_ + i); }

__device__ __forceinline__ short layer(int i) const { return __ldg(layer_ + i); }
__device__ __forceinline__ float charge(int i) const { return __ldg(charge_ + i); }
// __device__ __forceinline__ stripgpu::detId_t clusterDetId(int i) const { return __ldg(clusterDetId_ + i); }
// __device__ __forceinline__ uint32_t clusterIndex(int i) const { return __ldg(clusterIndex_ + i); }
// __device__ __forceinline__ uint8_t clusterADCs(int i) const { return __ldg(clusterADCs_ + i); }
__device__ __forceinline__ stripgpu::stripId_t firstStrip(int i) const { return __ldg(firstStrip_ + i); }
__device__ __forceinline__ uint32_t clusterSize(int i) const { return __ldg(clusterSize_ + i); }

friend MkFitSiStripClustersCUDA;

// private:
int nClusters_;

// float *local_xx_;
// float *local_xy_;
// float *local_yy_;
// float *local_;
float *global_x_;
float *global_y_;
float *global_z_;

float *global_xx_;
float *global_xy_;
float *global_xz_;
float *global_yy_;
float *global_yz_;
float *global_zz_;

short *layer_;
float *charge_;
stripgpu::detId_t *clusterDetId_;
// uint32_t *clusterIndex_;
//uint8_t *clusterADCs_;
stripgpu::stripId_t *firstStrip_;
uint32_t *clusterSize_;
};

GlobalDeviceView *gview() const { return gview_d.get(); }

class HostView {
public:
explicit HostView(size_t maxClusters, int clustersPerStrip, cudaStream_t stream);

cms::cuda::host::unique_ptr<stripgpu::detId_t[]> clusterDetId_h;
// cms::cuda::host::unique_ptr<uint32_t[]> clusterIndex_h;
cms::cuda::host::unique_ptr<float[]> charge_h;
//cms::cuda::host::unique_ptr<uint8_t[]> clusterADCs_h;
cms::cuda::host::unique_ptr<uint32_t[]> clusterSize_h;
cms::cuda::host::unique_ptr<stripgpu::stripId_t[]> firstStrip_h;

// cms::cuda::host::unique_ptr<float[]> local_xx_h;
// cms::cuda::host::unique_ptr<float[]> local_xy_h;
// cms::cuda::host::unique_ptr<float[]> local_yy_h;
// cms::cuda::host::unique_ptr<float[]> local_h;
cms::cuda::host::unique_ptr<float[]> global_x_h;
cms::cuda::host::unique_ptr<float[]> global_y_h;
cms::cuda::host::unique_ptr<float[]> global_z_h;
cms::cuda::host::unique_ptr<float[]> global_xx_h;
cms::cuda::host::unique_ptr<float[]> global_xy_h;
cms::cuda::host::unique_ptr<float[]> global_xz_h;
cms::cuda::host::unique_ptr<float[]> global_yy_h;
cms::cuda::host::unique_ptr<float[]> global_yz_h;
cms::cuda::host::unique_ptr<float[]> global_zz_h;

cms::cuda::host::unique_ptr<short[]> layer_h;
int nClusters_h;
};

std::unique_ptr<HostView> hostView(int clustersPerStrip, cudaStream_t stream) const;

private:
cms::cuda::device::unique_ptr<stripgpu::detId_t[]> clusterDetId_d;
// cms::cuda::device::unique_ptr<uint32_t[]> clusterIndex_d;
cms::cuda::device::unique_ptr<float[]> charge_d;
//cms::cuda::device::unique_ptr<uint8_t[]> clusterADCs_d;
cms::cuda::device::unique_ptr<uint32_t[]> clusterSize_d;
cms::cuda::device::unique_ptr<stripgpu::stripId_t[]> firstStrip_d;

// cms::cuda::device::unique_ptr<float[]> local_xx_d;
// cms::cuda::device::unique_ptr<float[]> local_xy_d;
// cms::cuda::device::unique_ptr<float[]> local_yy_d;
// cms::cuda::device::unique_ptr<float[]> local_d;
cms::cuda::device::unique_ptr<float[]> global_x_d;
cms::cuda::device::unique_ptr<float[]> global_y_d;
cms::cuda::device::unique_ptr<float[]> global_z_d;
cms::cuda::device::unique_ptr<float[]> global_xx_d;
cms::cuda::device::unique_ptr<float[]> global_xy_d;
cms::cuda::device::unique_ptr<float[]> global_xz_d;
cms::cuda::device::unique_ptr<float[]> global_yy_d;
cms::cuda::device::unique_ptr<float[]> global_yz_d;
cms::cuda::device::unique_ptr<float[]> global_zz_d;

cms::cuda::device::unique_ptr<short[]> layer_d;

cms::cuda::device::unique_ptr<GlobalDeviceView> gview_d; // "me" pointer

public:
int nClusters_h;
};

#endif
89 changes: 89 additions & 0 deletions CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#ifndef CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
#define CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h

#include "CUDADataFormats/SiStripCluster/interface/GPUtypes.h"

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cuda_runtime.h>

class SiStripClustersCUDA {
public:
static constexpr uint32_t kClusterMaxStrips = 16;

SiStripClustersCUDA() = default;
explicit SiStripClustersCUDA(size_t maxClusters, int clustersPerStrip, cudaStream_t stream);
~SiStripClustersCUDA() = default;

SiStripClustersCUDA(const SiStripClustersCUDA &) = delete;
SiStripClustersCUDA &operator=(const SiStripClustersCUDA &) = delete;
SiStripClustersCUDA(SiStripClustersCUDA &&) = default;
SiStripClustersCUDA &operator=(SiStripClustersCUDA &&) = default;

void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; }

uint32_t nClusters() const { return nClusters_h; }

class DeviceView {
public:
__device__ __forceinline__ uint32_t clusterIndex(int i) const { return __ldg(clusterIndex_ + i); }
__device__ __forceinline__ uint32_t clusterSize(int i) const { return __ldg(clusterSize_ + i); }
__device__ __forceinline__ uint8_t clusterADCs(int i) const { return __ldg(clusterADCs_ + i); }
__device__ __forceinline__ stripgpu::detId_t clusterDetId(int i) const { return __ldg(clusterDetId_ + i); }
__device__ __forceinline__ stripgpu::stripId_t firstStrip(int i) const { return __ldg(firstStrip_ + i); }
__device__ __forceinline__ bool trueCluster(int i) const { return trueCluster_[i]; }
__device__ __forceinline__ float barycenter(int i) const { return __ldg(barycenter_ + i); }
__device__ __forceinline__ float charge(int i) const { return __ldg(charge_ + i); }

friend SiStripClustersCUDA;

// private:
uint32_t *clusterIndex_;
uint32_t *clusterSize_;
uint8_t *clusterADCs_;
stripgpu::detId_t *clusterDetId_;
stripgpu::stripId_t *firstStrip_;
bool *trueCluster_;
float *barycenter_;
float *charge_;
int nClusters_;
};

DeviceView *view() const { return view_d.get(); }

class HostView {
public:
explicit HostView(size_t maxClusters, int clustersPerStrip, cudaStream_t stream);

cms::cuda::host::unique_ptr<uint32_t[]> clusterIndex_h;
cms::cuda::host::unique_ptr<uint32_t[]> clusterSize_h;
cms::cuda::host::unique_ptr<uint8_t[]> clusterADCs_h;
cms::cuda::host::unique_ptr<stripgpu::detId_t[]> clusterDetId_h;
cms::cuda::host::unique_ptr<stripgpu::stripId_t[]> firstStrip_h;
cms::cuda::host::unique_ptr<bool[]> trueCluster_h;
cms::cuda::host::unique_ptr<float[]> barycenter_h;
cms::cuda::host::unique_ptr<float[]> charge_h;
int nClusters_h;
};

std::unique_ptr<HostView> hostView(int clustersPerStrip, cudaStream_t stream) const;

private:
cms::cuda::device::unique_ptr<uint32_t[]> clusterIndex_d;
cms::cuda::device::unique_ptr<uint32_t[]> clusterSize_d;
cms::cuda::device::unique_ptr<uint8_t[]> clusterADCs_d;
cms::cuda::device::unique_ptr<stripgpu::detId_t[]> clusterDetId_d;
cms::cuda::device::unique_ptr<stripgpu::stripId_t[]> firstStrip_d;
cms::cuda::device::unique_ptr<bool[]> trueCluster_d;
cms::cuda::device::unique_ptr<float[]> barycenter_d;
cms::cuda::device::unique_ptr<float[]> charge_d;

cms::cuda::device::unique_ptr<DeviceView> view_d; // "me" pointer

public:
int nClusters_h;
};

#endif
110 changes: 110 additions & 0 deletions CUDADataFormats/SiStripCluster/src/MkFitSiStripClustersCUDA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include "CUDADataFormats/SiStripCluster/interface/MkFitSiStripClustersCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

MkFitSiStripClustersCUDA::MkFitSiStripClustersCUDA(size_t maxClusters, int clustersPerStrip, cudaStream_t stream) {
clusterDetId_d = cms::cuda::make_device_unique<stripgpu::detId_t[]>(maxClusters, stream);
// clusterIndex_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
clusterSize_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
charge_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
// clusterADCs_d = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * clustersPerStrip, stream);
firstStrip_d = cms::cuda::make_device_unique<stripgpu::stripId_t[]>(maxClusters, stream);

// local_xx_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
// local_xy_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
// local_yy_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
// local_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_x_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_y_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_z_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_xx_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_xy_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_xz_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_yy_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_yz_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
global_zz_d = cms::cuda::make_device_unique<float[]>(maxClusters, stream);

layer_d = cms::cuda::make_device_unique<short[]>(maxClusters, stream);

auto gview = cms::cuda::make_host_unique<GlobalDeviceView>(stream);
// gview->local_xx_ = local_xx_d.get();
// gview->local_xy_ = local_xy_d.get();
// gview->local_yy_ = local_yy_d.get();
// gview->local_ = local_d.get();
gview->global_x_ = global_x_d.get();
gview->global_y_ = global_y_d.get();
gview->global_z_ = global_z_d.get();
gview->global_xx_ = global_xx_d.get();
gview->global_xy_ = global_xy_d.get();
gview->global_xz_ = global_xz_d.get();
gview->global_yy_ = global_yy_d.get();
gview->global_yz_ = global_yz_d.get();
gview->global_zz_ = global_zz_d.get();
gview->charge_ = charge_d.get();
gview->clusterDetId_ = clusterDetId_d.get();
// gview->clusterIndex_ = clusterIndex_d.get();
//gview->clusterADCs_ = clusterADCs_d.get();
gview->firstStrip_ = firstStrip_d.get();
gview->clusterSize_ = clusterSize_d.get();

gview->layer_ = layer_d.get();

gview_d = cms::cuda::make_device_unique<GlobalDeviceView>(stream);
cms::cuda::copyAsync(gview_d, gview, stream);
}

MkFitSiStripClustersCUDA::HostView::HostView(size_t maxClusters, int clustersPerStrip, cudaStream_t stream) {
clusterDetId_h = cms::cuda::make_host_unique<stripgpu::detId_t[]>(maxClusters, stream);
// clusterIndex_h = cms::cuda::make_host_unique<uint32_t[]>(maxClusters, stream);
charge_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
// clusterADCs_h = cms::cuda::make_host_unique<uint8_t[]>(maxClusters * clustersPerStrip, stream);
firstStrip_h = cms::cuda::make_host_unique<stripgpu::stripId_t[]>(maxClusters, stream);
clusterSize_h = cms::cuda::make_host_unique<uint32_t[]>(maxClusters, stream);

// local_xx_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
// local_xy_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
// local_yy_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
// local_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_x_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_y_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_z_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_xx_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_xy_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_xz_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_yy_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_yz_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);
global_zz_h = cms::cuda::make_host_unique<float[]>(maxClusters, stream);

layer_h = cms::cuda::make_host_unique<short[]>(maxClusters, stream);

nClusters_h = maxClusters;
}

std::unique_ptr<MkFitSiStripClustersCUDA::HostView> MkFitSiStripClustersCUDA::hostView(int clustersPerStrip,
cudaStream_t stream) const {
auto view_h = std::make_unique<HostView>(nClusters_h, clustersPerStrip, stream);

cms::cuda::copyAsync(view_h->clusterDetId_h, clusterDetId_d, nClusters_h, stream);
// cms::cuda::copyAsync(view_h->clusterIndex_h, clusterIndex_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->charge_h, charge_d, nClusters_h, stream);
// cms::cuda::copyAsync(view_h->clusterADCs_h, clusterADCs_d, nClusters_h * clustersPerStrip, stream);
cms::cuda::copyAsync(view_h->firstStrip_h, firstStrip_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->clusterSize_h, clusterSize_d, nClusters_h, stream);

// cms::cuda::copyAsync(view_h->local_xx_h, local_xx_d, nClusters_h, stream);
// cms::cuda::copyAsync(view_h->local_xy_h, local_xy_d, nClusters_h, stream);
// cms::cuda::copyAsync(view_h->local_yy_h, local_yy_d, nClusters_h, stream);
// cms::cuda::copyAsync(view_h->local_h, local_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_x_h, global_x_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_y_h, global_y_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_z_h, global_z_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_xx_h, global_xx_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_xy_h, global_xy_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_xz_h, global_xz_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_yy_h, global_yy_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_yz_h, global_yz_d, nClusters_h, stream);
cms::cuda::copyAsync(view_h->global_zz_h, global_zz_d, nClusters_h, stream);

cms::cuda::copyAsync(view_h->layer_h, layer_d, nClusters_h, stream);

return view_h;
}
Loading

0 comments on commit 39d686e

Please sign in to comment.