From ccd7aa92a869470c6fba0d9ef3219d00643d9a87 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 1 Aug 2022 15:43:05 +0200 Subject: [PATCH 1/3] Improve synchronisation in debug mode In debug mode, synchronise the current stream instead of the whole device. --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index e66e322ddff4e..c4c0945b212a7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -597,8 +597,7 @@ namespace pixelgpudetails { debug); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif if (includeErrors) { @@ -637,8 +636,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif #ifdef GPU_DEBUG @@ -666,8 +664,7 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif // apply charge cut @@ -697,8 +694,7 @@ namespace pixelgpudetails { nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream)); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif } // end clusterizer scope @@ -744,8 +740,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif #ifdef GPU_DEBUG @@ -774,8 +769,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif // apply charge cut @@ -799,8 +793,7 @@ namespace pixelgpudetails { nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream)); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif } // } // namespace pixelgpudetails From 691c9534986df3d83ab4fc5e0f03f677951726d2 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 1 Aug 2022 15:44:07 +0200 Subject: [PATCH 2/3] Fix the number of modules --- .../SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index c4c0945b212a7..16438a0674c76 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -649,7 +649,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); threadsPerBlock = 256 + 128; /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel) - blocks = phase2PixelTopology::numberOfModules; + blocks = phase1PixelTopology::numberOfModules; #ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif From 0c718bdf42e412cbff54e5fbf4172dcf58e38135 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 1 Aug 2022 15:56:11 +0200 Subject: [PATCH 3/3] Remove obsolete debug statement --- .../PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 4df0e43cc3436..3672259c4ac45 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -220,10 +220,6 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, auto qj = tracks->quality(jt); if (qj <= reject) continue; -#ifdef GPU_DEBUG - if (foundNtuplets->size(it) != foundNtuplets->size(jt)) - printf(" a mess\n"); -#endif auto opj = tracks->stateAtBS.state(jt)(2); auto ctj = tracks->stateAtBS.state(jt)(3); auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);