diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index e66e322ddff4e..16438a0674c76 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -597,8 +597,7 @@ namespace pixelgpudetails {
           debug);
       cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
-      cudaDeviceSynchronize();
-      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaStreamSynchronize(stream));
 #endif
 
       if (includeErrors) {
@@ -637,8 +636,7 @@ namespace pixelgpudetails {
 
       cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
-      cudaDeviceSynchronize();
-      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaStreamSynchronize(stream));
 #endif
 
 #ifdef GPU_DEBUG
@@ -651,7 +649,7 @@ namespace pixelgpudetails {
       cudaCheck(cudaGetLastError());
 
       threadsPerBlock = 256 + 128;  /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel)
-      blocks = phase2PixelTopology::numberOfModules;
+      blocks = phase1PixelTopology::numberOfModules;
 #ifdef GPU_DEBUG
       std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
 #endif
@@ -666,8 +664,7 @@ namespace pixelgpudetails {
                                                               wordCounter);
       cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
-      cudaDeviceSynchronize();
-      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaStreamSynchronize(stream));
 #endif
 
       // apply charge cut
@@ -697,8 +694,7 @@ namespace pixelgpudetails {
           nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
 
 #ifdef GPU_DEBUG
-      cudaDeviceSynchronize();
-      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaStreamSynchronize(stream));
 #endif
 
     }  // end clusterizer scope
@@ -744,8 +740,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaGetLastError());
 
 #ifdef GPU_DEBUG
-    cudaDeviceSynchronize();
-    cudaCheck(cudaGetLastError());
+    cudaCheck(cudaStreamSynchronize(stream));
 #endif
 
 #ifdef GPU_DEBUG
@@ -774,8 +769,7 @@ namespace pixelgpudetails {
 
     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
-    cudaDeviceSynchronize();
-    cudaCheck(cudaGetLastError());
+    cudaCheck(cudaStreamSynchronize(stream));
 #endif
 
     // apply charge cut
@@ -799,8 +793,7 @@ namespace pixelgpudetails {
         nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
 
 #ifdef GPU_DEBUG
-    cudaDeviceSynchronize();
-    cudaCheck(cudaGetLastError());
+    cudaCheck(cudaStreamSynchronize(stream));
 #endif
   }  //
 }  // namespace pixelgpudetails
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 4df0e43cc3436..3672259c4ac45 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -220,10 +220,6 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
         auto qj = tracks->quality(jt);
         if (qj <= reject)
           continue;
-#ifdef GPU_DEBUG
-        if (foundNtuplets->size(it) != foundNtuplets->size(jt))
-          printf(" a mess\n");
-#endif
         auto opj = tracks->stateAtBS.state(jt)(2);
         auto ctj = tracks->stateAtBS.state(jt)(3);
         auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);