Skip to content

Commit

Permalink
Various fixes and cleanup (cms-sw#87)
Browse files Browse the repository at this point in the history
  - replace `exclusive_scan` with `memset` + `inclusive_scan` to avoid an invalid read
  - fix memory sizes in allocations and copies
  - add a missing stream synchronize
  - set `recordWatcherUpdatedSinceLastTransfer_` to avoid spurious copies
  • Loading branch information
makortel authored and fwyzard committed Jun 29, 2018
1 parent eae58e3 commit bc0bc29
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
namespace pixelgpudetails {

SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD * sizeof(unsigned int);
int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
cudaMallocHost(&word, sizeof(unsigned int)*WSIZE);
cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE);

Expand Down Expand Up @@ -680,8 +680,8 @@ namespace pixelgpudetails {

// std::cout << "found " << nModulesActive << " Modules active" << std::endl;

// TODO: I suspect we need a cudaStreamSynchronize before using nModules below
// In order to avoid the cudaStreamSynchronize, create a new kernel which launches countModules and findClus.
cudaStreamSynchronize(stream.id());

threadsPerBlock = 256;
blocks = nModulesActive;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv
else if(recordWatcherUpdatedSinceLastTransfer_) {
// If regions_ are disabled, it is enough to fill and transfer only if cablingMap has changed
gpuModulesToUnpack_->fillAsync(*cablingMap_, std::set<unsigned int>(), cudaStream);
recordWatcherUpdatedSinceLastTransfer_ = false;
}

edm::ESHandle<SiPixelFedCablingMapGPUWrapper> hgpuMap;
Expand Down
19 changes: 11 additions & 8 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,13 @@ namespace pixelgpudetails {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
// Set first the first element to 0
cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
// Then use inclusive_scan to get the partial sum to the rest
thrust::inclusive_scan(thrust::cuda::par.on(stream.id()),
input.clusInModule_d,
input.clusInModule_d + gpuClustering::MaxNumModules + 1,
gpu_.hitsModuleStart_d);
input.clusInModule_d + gpuClustering::MaxNumModules,
&gpu_.hitsModuleStart_d[1]);

int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
Expand Down Expand Up @@ -126,11 +129,11 @@ namespace pixelgpudetails {
HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
Expand Down

0 comments on commit bc0bc29

Please sign in to comment.