diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index e66e322ddff4e..16438a0674c76 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -597,8 +597,7 @@ namespace pixelgpudetails { debug); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif if (includeErrors) { @@ -637,8 +636,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif #ifdef GPU_DEBUG @@ -651,7 +649,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); threadsPerBlock = 256 + 128; /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel) - blocks = phase2PixelTopology::numberOfModules; + blocks = phase1PixelTopology::numberOfModules; #ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif @@ -666,8 +664,7 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif // apply charge cut @@ -697,8 +694,7 @@ namespace pixelgpudetails { nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream)); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif } // end clusterizer scope @@ -744,8 +740,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif #ifdef GPU_DEBUG @@ -774,8 +769,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif // apply charge cut @@ -799,8 +793,7 @@ namespace pixelgpudetails { nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream)); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); #endif } // } // namespace pixelgpudetails diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 4df0e43cc3436..3672259c4ac45 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -220,10 +220,6 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, auto qj = tracks->quality(jt); if (qj <= reject) continue; -#ifdef GPU_DEBUG - if (foundNtuplets->size(it) != foundNtuplets->size(jt)) - printf(" a mess\n"); -#endif auto opj = tracks->stateAtBS.state(jt)(2); auto ctj = tracks->stateAtBS.state(jt)(3); auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);