From 80e88a586216033865212ff1ed4842755e1a909e Mon Sep 17 00:00:00 2001
From: Kenichi Hatakeyama
Date: Tue, 4 Oct 2022 11:24:51 -0500
Subject: [PATCH] Add explicit initialization of pfrh_parent.

---
 .../plugins/PFClusterCudaHCAL.cu | 100 ++++++++++++--------
 1 file changed, 53 insertions(+), 47 deletions(-)

diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
index 16a1605e4247f..99413662041ea 100644
--- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
+++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -3851,7 +3851,7 @@ namespace PFClusterCudaHCAL {
     __syncthreads();//!!
 
     if (threadIdx.x == 0) {
-      notDone = 0; // KenH is this necessary?
+      notDone = 0;
     }
 
     __syncthreads();
@@ -3949,7 +3949,12 @@ namespace PFClusterCudaHCAL {
     // //  }
     // // }
 
-    // __syncthreads();
+    // Explicitly initialize pfrh_parent
+    for (int i = start; i < nRH; i += gridStride) {
+      pfrh_parent[i] = i;
+    }
+
+    __syncthreads();
 
     // for notDone
     if (threadIdx.x == 0) {
@@ -3976,25 +3981,25 @@ namespace PFClusterCudaHCAL {
 
       // for notDone
       if (threadIdx.x == 0) {
-      notDone2 = 0;
+        notDone2 = 0;
       }
 
       // Follow parents of parents .... to contract parent structure
       do {
-      volatile bool threadNotDone = false;
-      for (int i = threadIdx.x; i < nRH; i += blockDim.x) {
-        int parent = pfrh_parent[i];
-        if (parent >= 0 && parent != pfrh_parent[parent]) {
-          threadNotDone = true;
-          pfrh_parent[i] = pfrh_parent[parent];
-        }
-      }
-      if (threadIdx.x == 0)
-        notDone = 0;
-      __syncthreads();
-
-      atomicAdd(&notDone, (int)threadNotDone);
-      __syncthreads();
+        volatile bool threadNotDone = false;
+        for (int i = threadIdx.x; i < nRH; i += blockDim.x) {
+          int parent = pfrh_parent[i];
+          if (parent >= 0 && parent != pfrh_parent[parent]) {
+            threadNotDone = true;
+            pfrh_parent[i] = pfrh_parent[parent];
+          }
+        }
+        if (threadIdx.x == 0)
+          notDone = 0;
+        __syncthreads();
+
+        atomicAdd(&notDone, (int)threadNotDone);
+        __syncthreads();
       } while (notDone);
 
 
@@ -4014,25 +4019,25 @@ namespace PFClusterCudaHCAL {
 
       // __syncthreads();
 
      for (int idx = start; idx < nEdges; idx += gridStride) {
-      //for (int idx = 0; idx < nEdges; idx++) {
-      int i = pfrh_edgeId[idx]; // Get edge topo id
-      int j = pfrh_edgeList[idx]; // Get edge neighbor list
-      int parent_target = pfrh_parent[i];
-      int parent_neighbor = pfrh_parent[j];
-      if (parent_target!=parent_neighbor){
-      notDone2 = 1;
-      //printf("hmm. they should have the same parent, but they don't. why... %d %d %d\n",i,j,ii);
-      int min_parent = (int)min(parent_target,parent_neighbor);
-      int max_parent = (int)max(parent_target,parent_neighbor);
-      int idx_max = i;
-      if (parent_neighbor == max_parent) idx_max = j;
-      pfrh_parent[idx_max] = min_parent;
-      }
+        //for (int idx = 0; idx < nEdges; idx++) {
+        int i = pfrh_edgeId[idx]; // Get edge topo id
+        int j = pfrh_edgeList[idx]; // Get edge neighbor list
+        int parent_target = pfrh_parent[i];
+        int parent_neighbor = pfrh_parent[j];
+        if (parent_target!=parent_neighbor){
+          notDone2 = 1;
+          //printf("hmm. they should have the same parent, but they don't. why... %d %d %d\n",i,j,ii);
+          int min_parent = (int)min(parent_target,parent_neighbor);
+          int max_parent = (int)max(parent_target,parent_neighbor);
+          int idx_max = i;
+          if (parent_neighbor == max_parent) idx_max = j;
+          pfrh_parent[idx_max] = min_parent;
+        }
       }
       __syncthreads();
 
       if (notDone2==0) // if topocluster finding is converged, terminate the for-ii loop
-      break;
+        break;
     } // for-loop ii
 
@@ -4057,9 +4062,9 @@ namespace PFClusterCudaHCAL {
 
     } while (notDone);
 
-    //__syncthreads();
+    // __syncthreads();
 
-    // Print out debugging info
+    // // Print out debugging info
     // if (threadIdx.x == 0) {
     //   int nnode=0;
     //   for (int i = 0; i < nRH; i++) {
@@ -4665,23 +4670,23 @@ namespace PFClusterCudaHCAL {
     cudaEventRecord(start, cudaStream);
 #endif
 
-    prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
-        nRH,
-        outputGPU.nEdges.get(),
-        outputGPU.pfrh_passTopoThresh.get(),
-        inputPFRecHits.pfrh_neighbours.get(),
-        scratchGPU.pfrh_edgeId.get(),
-        scratchGPU.pfrh_edgeList.get());
+    // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
+    //     nRH,
+    //     outputGPU.nEdges.get(),
+    //     outputGPU.pfrh_passTopoThresh.get(),
+    //     inputPFRecHits.pfrh_neighbours.get(),
+    //     scratchGPU.pfrh_edgeId.get(),
+    //     scratchGPU.pfrh_edgeList.get());
 
     // Topo clustering
    // Fill edgeId, edgeList arrays with rechit neighbors
    // Has a bug when using more than 128 threads..
-    // prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH,
-    //                                                                        outputGPU.nEdges.get(),
-    //                                                                        outputGPU.pfrh_passTopoThresh.get(),
-    //                                                                        inputPFRecHits.pfrh_neighbours.get(),
-    //                                                                        scratchGPU.pfrh_edgeId.get(),
-    //                                                                        scratchGPU.pfrh_edgeList.get());
+    prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH,
+                                                                           outputGPU.nEdges.get(),
+                                                                           outputGPU.pfrh_passTopoThresh.get(),
+                                                                           inputPFRecHits.pfrh_neighbours.get(),
+                                                                           scratchGPU.pfrh_edgeId.get(),
+                                                                           scratchGPU.pfrh_edgeList.get());
     cudaCheck(cudaStreamSynchronize(cudaStream));
 
     // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
@@ -4712,6 +4717,7 @@ namespace PFClusterCudaHCAL {
 #endif
 
     // Topo clustering
+    //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH,
     topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH,
                                                     outputGPU.nEdges.get(),
                                                     //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging
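
A minimal, self-contained sketch of the linking pattern the hunks above rely on: every rechit starts as its own parent (the explicit pfrh_parent[i] = i initialization added here), each edge points the larger of the two parents at the smaller one, and a pointer-jumping pass contracts parent chains until nothing changes. Everything below is illustrative only, not code from PFClusterCudaHCAL.cu: the kernel name topoLinkSketch, the toy edge arrays, the single-block <<<1, 128>>> launch, and the extra __syncthreads() guarding the notDone reset are assumptions for a standalone test; the production kernel topoClusterLinkingKH arranges its loops and barriers differently.

// topo_link_sketch.cu -- standalone illustration, build with: nvcc topo_link_sketch.cu
#include <cstdio>
#include <cuda_runtime.h>

// Single-block kernel: initialize every parent to itself, link each edge's larger
// parent to the smaller one, then contract by pointer jumping until convergence.
// Assumes one thread block, matching the <<<1, N>>> launches in the patch.
__global__ void topoLinkSketch(int nRH, int nEdges, const int* edgeId, const int* edgeList, int* parent) {
  __shared__ int notDone;

  // Explicit initialization: each rechit starts as its own topo-cluster parent.
  for (int i = threadIdx.x; i < nRH; i += blockDim.x)
    parent[i] = i;
  __syncthreads();

  for (;;) {
    if (threadIdx.x == 0)
      notDone = 0;
    __syncthreads();

    // Link step: where an edge's endpoints disagree, point the larger parent at the
    // smaller one (a cluster's root is its smallest rechit index).
    for (int e = threadIdx.x; e < nEdges; e += blockDim.x) {
      int pi = parent[edgeId[e]];
      int pj = parent[edgeList[e]];
      if (pi != pj) {
        atomicMin(&parent[max(pi, pj)], min(pi, pj));
        atomicAdd(&notDone, 1);
      }
    }
    __syncthreads();

    // Contraction step: follow parents of parents so chains shrink toward the root.
    for (int i = threadIdx.x; i < nRH; i += blockDim.x) {
      int p = parent[i];
      if (parent[p] != p) {
        parent[i] = parent[p];
        atomicAdd(&notDone, 1);
      }
    }
    __syncthreads();

    if (!notDone)     // uniform decision: all writes to notDone happened before the barrier
      break;
    __syncthreads();  // keep next pass's reset of notDone from racing the read above
  }
}

int main() {
  // Toy input: 6 rechits, edges (0,1), (1,2), (3,4) -> clusters {0,1,2}, {3,4}, {5}.
  const int nRH = 6, nEdges = 3;
  int hEdgeId[nEdges] = {0, 1, 3};
  int hEdgeList[nEdges] = {1, 2, 4};
  int *dEdgeId, *dEdgeList, *dParent;
  cudaMalloc(&dEdgeId, nEdges * sizeof(int));
  cudaMalloc(&dEdgeList, nEdges * sizeof(int));
  cudaMalloc(&dParent, nRH * sizeof(int));
  cudaMemcpy(dEdgeId, hEdgeId, nEdges * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dEdgeList, hEdgeList, nEdges * sizeof(int), cudaMemcpyHostToDevice);

  topoLinkSketch<<<1, 128>>>(nRH, nEdges, dEdgeId, dEdgeList, dParent);

  int hParent[nRH];
  cudaMemcpy(hParent, dParent, nRH * sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < nRH; ++i)
    printf("rechit %d -> parent %d\n", i, hParent[i]);  // expect 0 0 0 3 3 5
  cudaFree(dEdgeId);
  cudaFree(dEdgeList);
  cudaFree(dParent);
  return 0;
}

Compiled with nvcc and run, this should print parents 0 0 0 3 3 5 for the toy graph. Without the explicit initialization, the first link/contract pass would read whatever the parent buffer happened to contain, which is the situation the new initialization loop in this patch avoids.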