Merge pull request #6 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_ha…

…ckason1 Merge Felice+Marino's updates
jsamudio · Sep 29, 2022 · a71d4e8 · a71d4e8
2 parents 6830f14 + 178e6fe
commit a71d4e8
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 266 deletions.
diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -69,6 +69,46 @@ namespace PFClusterCudaHCAL {
   //int nTopoLoops = 100;
   int nTopoLoops = 35;
 
+  //
+  // --- kernel summary --
+  // initializeCudaConstants
+  // PFRechitToPFCluster_HCAL_entryPoint
+  //   seedingTopoThreshKernel_HCAL
+  //   prepareTopoInputs
+  //   topoClusterLinking
+  //   topoClusterContraction
+  //   fillRhfIndex
+  //   hcalFastCluster_selection
+  //     dev_hcalFastCluster_optimizedSimple
+  //     dev_hcalFastCluster_optimizedComplex
+  //     dev_hcalFastCluster_original
+  // [aux]
+  //     sortEight
+  //     sortSwap
+  // [not used]
+  //   (fillRhfIndex_serialize) serialized version
+  //   (prepareTopoInputsSerial) serialized version
+  //   [compareEdgeArrays] used only for debugging
+  //   seedingKernel_HCAL
+  //   seedingKernel_HCAL_serialize
+  //   compareEdgeArrays
+  //   topoKernel_HCAL_passTopoThresh
+  //   topoKernel_HCALV2
+  //   topoKernel_HCAL_serialize
+  //   hcalFastCluster_optimizedSimple
+  //   hcalFastCluster_optimizedComplex
+  //   hcalFastCluster_sharedRHList
+  //   hcalFastCluster_original
+  //   hcalFastCluster_serialize
+  //   hcalFastCluster_step1
+  //   hcalFastCluster_step2
+  //   hcalFastCluster_step2
+  //   hcalFastCluster_step1_serialize
+  //   hcalFastCluster_step2_serialize
+  //   passingTopoThreshold
+  //   passingTopoThreshold
+  //   printRhfIndex
+
   void initializeCudaConstants(const PFClustering::common::CudaHCALConstants& cudaConstants,
                                const cudaStream_t cudaStream) {
     cudaCheck(cudaMemcpyToSymbolAsync(
@@ -3757,12 +3797,10 @@ namespace PFClusterCudaHCAL {
       // Odd linking
       for (int idx = start; idx < nEdges; idx += gridStride) {
         int i = pfrh_edgeId[idx];  // Get edge topo id
-        //if (pfrh_edgeMask[idx] > 0 && pfrh_passTopoThresh[i] && isLeftEdge(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) {
         if (pfrh_edgeMask[idx] > 0 && isLeftEdge(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) {
           pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]);
         }
       }
-
       __syncthreads();
 
       // edgeParent
@@ -3790,6 +3828,8 @@ namespace PFClusterCudaHCAL {
       if (!notDone)
         break;
 
+      __syncthreads();//!!
+
       if (threadIdx.x == 0) {
         notDone = false;
       }
@@ -3824,17 +3864,20 @@ namespace PFClusterCudaHCAL {
           }
         }
       }
+
       if (threadIdx.x == 0)
         iter++;
 
       __syncthreads();
 
     } while (notDone);
+
     *topoIter = iter;
 #ifdef DEBUG_GPU_HCAL
 //    if (threadIdx.x == 0) {
 //        printf("*** Topo clustering converged in %d iterations ***\n", iter);
-//    } __syncthreads();
+//    }
+//    __syncthreads();
 #endif
   }
 
@@ -4391,23 +4434,19 @@ namespace PFClusterCudaHCAL {
       ::PFClustering::HCAL::OutputDataGPU& outputGPU,
       ::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
       float (&timer)[8]) {
-    int nRH = inputPFRecHits.size;
-    //int nRH = 10;
-    //    printf("Now in PFRechitToPFCluster_HCAL_entryPoint with nRH = %d\tnEdges = %d\n", nRH, nEdges);
-    if (nRH < 1)
-      return;
-    cudaProfilerStart();
 
 #ifdef DEBUG_GPU_HCAL
+    cudaProfilerStart();
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     cudaEventRecord(start, cudaStream);
 #endif
 
+    int nRH = inputPFRecHits.size;
+
     // Combined seeding & topo clustering thresholds, array initialization
 
-    //seedingTopoThreshKernel_HCAL<<<(nRH+63)/64, 128, 0, cudaStream>>>(
     seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
                                                                          inputPFRecHits.pfrh_energy.get(),
                                                                          inputPFRecHits.pfrh_x.get(),
@@ -4427,6 +4466,8 @@ namespace PFClusterCudaHCAL {
                                                                          outputGPU.topoSeedList.get(),
                                                                          outputGPU.pfc_iter.get());
 
+    cudaCheck(cudaStreamSynchronize(cudaStream));
+
 #ifdef DEBUG_GPU_HCAL
     cudaEventRecord(stop, cudaStream);
     cudaEventSynchronize(stop);
@@ -4451,6 +4492,7 @@ namespace PFClusterCudaHCAL {
                                                                            inputPFRecHits.pfrh_neighbours.get(),
                                                                            scratchGPU.pfrh_edgeId.get(),
                                                                            scratchGPU.pfrh_edgeList.get());
+    cudaCheck(cudaStreamSynchronize(cudaStream));
 
     //    prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
     //        nRH,
@@ -4489,6 +4531,7 @@ namespace PFClusterCudaHCAL {
                                                   //inputGPU.pfrh_edgeMask.get(),
                                                   outputGPU.pfrh_passTopoThresh.get(),
                                                   outputGPU.topoIter.get());
+    cudaCheck(cudaStreamSynchronize(cudaStream));
 
     topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
                                                       outputGPU.pfrh_topoId.get(),
@@ -4552,12 +4595,11 @@ namespace PFClusterCudaHCAL {
                                                            inputGPU.pfc_prevPos4.get(),
                                                            inputGPU.pfc_energy.get(),
                                                            outputGPU.pfc_iter.get());
-
 #ifdef DEBUG_GPU_HCAL
     cudaEventRecord(stop, cudaStream);
     cudaEventSynchronize(stop);
     cudaEventElapsedTime(&timer[3], start, stop);
-#endif
     cudaProfilerStop();
+#endif
   }
 }  // namespace PFClusterCudaHCAL