Various fixes and cleanup (cms-sw#87)

- replace `exclusive_scan` with `memset` + `inclusive_scan` to avoid an invalid read - fix memory sizes in allocations and copies - add a missing stream synchronize - set `recordWatcherUpdatedSinceLastTransfer_` to avoid spurious copies
fwyzard · Jun 29, 2018 · bc0bc29 · bc0bc29
1 parent eae58e3
commit bc0bc29
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 10 deletions.
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -39,7 +39,7 @@
 namespace pixelgpudetails {
 
   SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
-    int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD * sizeof(unsigned int);
+    int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
     cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
     cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
 
@@ -680,8 +680,8 @@ namespace pixelgpudetails {
 
     // std::cout << "found " << nModulesActive << " Modules active" << std::endl;
 
-    // TODO: I suspect we need a cudaStreamSynchronize before using nModules below
     // In order to avoid the cudaStreamSynchronize, create a new kernel which launches countModules and findClus.
+    cudaStreamSynchronize(stream.id());
 
     threadsPerBlock = 256;
     blocks = nModulesActive;

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
@@ -439,6 +439,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv
   else if(recordWatcherUpdatedSinceLastTransfer_) {
     // If regions_ are disabled, it is enough to fill and transfer only if cablingMap has changed
     gpuModulesToUnpack_->fillAsync(*cablingMap_, std::set<unsigned int>(), cudaStream);
+    recordWatcherUpdatedSinceLastTransfer_ = false;
   }
 
   edm::ESHandle<SiPixelFedCablingMapGPUWrapper> hgpuMap;

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -71,10 +71,13 @@ namespace pixelgpudetails {
 
    cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));
 
-    thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
+    // Set first the first element to 0
+    cudaCheck(cudaMemsetAsync(gpu_.hitsModuleStart_d, 0, sizeof(uint32_t), stream.id()));
+    // Then use inclusive_scan to get the partial sum to the rest
+    thrust::inclusive_scan(thrust::cuda::par.on(stream.id()),
                            input.clusInModule_d,
-                           input.clusInModule_d + gpuClustering::MaxNumModules + 1,
-                           gpu_.hitsModuleStart_d);
+                           input.clusInModule_d + gpuClustering::MaxNumModules,
+                           &gpu_.hitsModuleStart_d[1]);
 
     int threadsPerBlock = 256;
     int blocks = input.nModules; // active modules (with digis)
@@ -126,11 +129,11 @@ namespace pixelgpudetails {
     HitsOnCPU hoc(nhits);
     hoc.gpu_d = gpu_d;
     memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
-    cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaStreamSynchronize(stream.id()));