Skip to content

Commit

Permalink
Move BeamSpot transfer to GPU to its own producer (cms-sw#318)
Browse files Browse the repository at this point in the history
Implement a non-caching host allocator, useful for host-to-device copy buffers:
  - not bound to any CUDA stream to allow use in EDM beginStream();
  - with the possibility to pass flags to cudaHostAlloc(), e.g. cudaHostAllocWriteCombined.

Add perfect forwarding overload for CUDAProduct constructor, enabling the use of CUDAScopedContext::emplace() in BeamSpotToCUDA::produce().

Move the BeamSpot host-to-device transfer to its own EDProducer, making use of beginStream()-allocated write-combined memory for the transfer.
  • Loading branch information
makortel authored and fwyzard committed Apr 23, 2019
1 parent 9407e0e commit fcbb820
Show file tree
Hide file tree
Showing 23 changed files with 300 additions and 35 deletions.
8 changes: 8 additions & 0 deletions CUDADataFormats/BeamSpot/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
</export>
32 changes: 32 additions & 0 deletions CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

#include <cuda/api_wrappers.h>

class BeamSpotCUDA {
public:
// alignas(128) doesn't really make sense as there is only one
// beamspot per event?
struct Data {
float x,y,z; // position
// TODO: add covariance matrix

float sigmaZ;
float beamWidthX, beamWidthY;
float dxdz, dydz;
float emittanceX, emittanceY;
float betaStar;
};

BeamSpotCUDA() = default;
BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream);

Data const* data() const { return data_d_.get(); }

private:
cudautils::device::unique_ptr<Data> data_d_;
};

#endif
11 changes: 11 additions & 0 deletions CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

data_d_ = cs->make_device_unique<Data>(stream);
cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream.id());
}
8 changes: 8 additions & 0 deletions CUDADataFormats/BeamSpot/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef CUDADataFormats_BeamSpot_classes_h
#define CUDADataFormats_BeamSpot_classes_h

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
4 changes: 4 additions & 0 deletions CUDADataFormats/BeamSpot/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<lcgdict>
<class name="CUDAProduct<BeamSpotCUDA>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<BeamSpotCUDA>>" persistent="false"/>
</lcgdict>
6 changes: 6 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ class CUDAProduct: public CUDAProductBase {
data_(std::move(data))
{}

template <typename... Args>
explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, std::shared_ptr<cuda::event_t> event, Args&&... args):
CUDAProductBase(device, std::move(stream), std::move(event)),
data_(std::forward<Args>(args)...)
{}

T data_; //!
};

Expand Down
4 changes: 2 additions & 2 deletions Configuration/StandardSequences/python/Reconstruction_cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,9 @@
reconstruction_trackingOnly = cms.Sequence(localreco*globalreco_tracking)
reconstruction_pixelTrackingOnly = cms.Sequence(
pixeltrackerlocalreco*
offlineBeamSpot*
siPixelClusterShapeCachePreSplitting*
recopixelvertexing
recopixelvertexing,
offlineBeamSpotTask
)

#need a fully expanded sequence copy
Expand Down
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAUtilities/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<use name="cub"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>

<export>
<lib name="1"/>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h

#include <memory>

#include <cuda/api_wrappers.h>
#include <cuda_runtime.h>

namespace cudautils {
namespace host {
namespace noncached {
namespace impl {
// Additional layer of types to distinguish from host::unique_ptr
class HostDeleter {
public:
void operator()(void *ptr) {
cuda::throw_if_error(cudaFreeHost(ptr));
}
};
}

template <typename T>
using unique_ptr = std::unique_ptr<T, impl::HostDeleter>;

namespace impl {
template <typename T>
struct make_host_unique_selector { using non_array = cudautils::host::noncached::unique_ptr<T>; };
template <typename T>
struct make_host_unique_selector<T[]> { using unbounded_array = cudautils::host::noncached::unique_ptr<T[]>; };
template <typename T, size_t N>
struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
}
}
}

/**
* The difference wrt. CUDAService::make_host_unique is that these
* do not cache, so they should not be called per-event.
*/
template <typename T>
typename host::noncached::impl::make_host_unique_selector<T>::non_array
make_host_noncached_unique(unsigned int flags = cudaHostAllocDefault) {
static_assert(std::is_trivially_constructible<T>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
void *mem;
cuda::throw_if_error(cudaHostAlloc(&mem, sizeof(T), flags));
return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem));
}

template <typename T>
typename host::noncached::impl::make_host_unique_selector<T>::unbounded_array
make_host_noncached_unique(size_t n, unsigned int flags = cudaHostAllocDefault) {
using element_type = typename std::remove_extent<T>::type;
static_assert(std::is_trivially_constructible<element_type>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
void *mem;
cuda::throw_if_error(cudaHostAlloc(&mem, n*sizeof(element_type), flags));
return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem));
}

template <typename T, typename ...Args>
typename cudautils::host::noncached::impl::make_host_unique_selector<T>::bounded_array
make_host_noncached_unique(Args&&...) = delete;
}

#endif

2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp,host_noncached_unique_ptr_t.cpp" name="cudaMemUtils_t">
<use name="HeterogeneousCore/CUDAServices"/>
<use name="catch2"/>
</bin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#include "catch.hpp"

#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"

TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") {
exitSansCUDADevices();

SECTION("Single element") {
auto ptr1 = cudautils::make_host_noncached_unique<int>();
REQUIRE(ptr1 != nullptr);
auto ptr2 = cudautils::make_host_noncached_unique<int>(cudaHostAllocPortable | cudaHostAllocWriteCombined);
REQUIRE(ptr2 != nullptr);
}

SECTION("Multiple elements") {
auto ptr1 = cudautils::make_host_noncached_unique<int[]>(10);
REQUIRE(ptr1 != nullptr);
auto ptr2 = cudautils::make_host_noncached_unique<int[]>(10, cudaHostAllocPortable | cudaHostAllocWriteCombined);
REQUIRE(ptr2 != nullptr);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
#include "RecoTracker/Record/interface/CkfComponentsRecord.h"

Expand Down Expand Up @@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
std::unique_ptr<PixelUnpackingRegions> regions_;

pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
PixelDataFormatter::Errors errors_;

const bool includeErrors_;
Expand All @@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
}

if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)";

edm::Service<CUDAService> cs;
if(cs->enabled()) {
wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
}
}

void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
Expand Down Expand Up @@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event

// In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
ErrorChecker errorcheck;
auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
for(int fedId: fedIds_) {
if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
Expand Down Expand Up @@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);

assert(0 == (ew-bw)%2);
wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
wordCounterGPU+=(ew-bw);

} // end of for loop

gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
wordFedAppender,
*wordFedAppender_,
std::move(errors_),
wordCounterGPU, fedCounter,
useQuality_, includeErrors_,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,9 @@ namespace pixelgpudetails {
// number of words for all the FEDs
constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;

SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
edm::Service<CUDAService> cs;
word_ = cs->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
fedId_ = cs->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() {
word_ = cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
fedId_ = cudautils::make_host_noncached_unique<unsigned char[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
}

void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "FWCore/Utilities/interface/typedefs.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"

struct SiPixelFedCablingMapGPU;
Expand Down Expand Up @@ -159,7 +160,7 @@ namespace pixelgpudetails {
public:
class WordFedAppender {
public:
WordFedAppender(cuda::stream_t<>& cudaStream);
WordFedAppender();
~WordFedAppender() = default;

void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
Expand All @@ -168,8 +169,8 @@ namespace pixelgpudetails {
const unsigned char *fedId() const { return fedId_.get(); }

private:
cudautils::host::unique_ptr<unsigned int[]> word_;
cudautils::host::unique_ptr<unsigned char[]> fedId_;
cudautils::host::noncached::unique_ptr<unsigned int[]> word_;
cudautils::host::noncached::unique_ptr<unsigned char[]> fedId_;
};

SiPixelRawToClusterGPUKernel() = default;
Expand Down
1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<use name="CUDADataFormats/BeamSpot"/>
<use name="DataFormats/TrackerCommon"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="HeterogeneousCore/Producer"/>
Expand Down
7 changes: 2 additions & 5 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ namespace pixelgpudetails {

constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();

cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));

// Coalesce all 32bit and 16bit arrays to two big blobs
Expand Down Expand Up @@ -111,7 +110,6 @@ namespace pixelgpudetails {
#endif
}
PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.bs_d));
cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
cudaCheck(cudaFree(gpu_.owner_32bit_));
cudaCheck(cudaFree(gpu_.owner_16bit_));
Expand All @@ -131,11 +129,10 @@ namespace pixelgpudetails {

void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
SiPixelClustersCUDA const& clusters_d,
float const * bs,
BeamSpotCUDA const& bs_d,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream) {
cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
gpu_.hitsModuleStart_d = clusters_d.clusModuleStart();
gpu_.cpeParams = cpeParams; // copy it for use in clients
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
Expand All @@ -148,7 +145,7 @@ namespace pixelgpudetails {
#endif
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
cpeParams,
gpu_.bs_d,
bs_d.data(),
digis_d.moduleInd(),
digis_d.xx(), digis_d.yy(), digis_d.adc(),
clusters_d.moduleStart(),
Expand Down
3 changes: 2 additions & 1 deletion RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
Expand Down Expand Up @@ -34,7 +35,7 @@ namespace pixelgpudetails {

void makeHitsAsync(SiPixelDigisCUDA const& digis_d,
SiPixelClustersCUDA const& clusters_d,
float const * bs,
BeamSpotCUDA const& bs_d,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream);
Expand Down
Loading

0 comments on commit fcbb820

Please sign in to comment.