Skip to content

Commit

Permalink
Merge pull request #6 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_ha…
Browse files Browse the repository at this point in the history
…ckason1

Merge Felice+Marino's updates
  • Loading branch information
hatakeyamak authored Sep 29, 2022
2 parents 6830f14 + 178e6fe commit a71d4e8
Show file tree
Hide file tree
Showing 5 changed files with 221 additions and 266 deletions.
66 changes: 54 additions & 12 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,46 @@ namespace PFClusterCudaHCAL {
//int nTopoLoops = 100;
int nTopoLoops = 35;

//
// --- kernel summary --
// initializeCudaConstants
// PFRechitToPFCluster_HCAL_entryPoint
// seedingTopoThreshKernel_HCAL
// prepareTopoInputs
// topoClusterLinking
// topoClusterContraction
// fillRhfIndex
// hcalFastCluster_selection
// dev_hcalFastCluster_optimizedSimple
// dev_hcalFastCluster_optimizedComplex
// dev_hcalFastCluster_original
// [aux]
// sortEight
// sortSwap
// [not used]
// (fillRhfIndex_serialize) serialized version
// (prepareTopoInputsSerial) serialized version
// [compareEdgeArrays] used only for debugging
// seedingKernel_HCAL
// seedingKernel_HCAL_serialize
// compareEdgeArrays
// topoKernel_HCAL_passTopoThresh
// topoKernel_HCALV2
// topoKernel_HCAL_serialize
// hcalFastCluster_optimizedSimple
// hcalFastCluster_optimizedComplex
// hcalFastCluster_sharedRHList
// hcalFastCluster_original
// hcalFastCluster_serialize
// hcalFastCluster_step1
// hcalFastCluster_step2
// hcalFastCluster_step2
// hcalFastCluster_step1_serialize
// hcalFastCluster_step2_serialize
// passingTopoThreshold
// passingTopoThreshold
// printRhfIndex

void initializeCudaConstants(const PFClustering::common::CudaHCALConstants& cudaConstants,
const cudaStream_t cudaStream) {
cudaCheck(cudaMemcpyToSymbolAsync(
Expand Down Expand Up @@ -3757,12 +3797,10 @@ namespace PFClusterCudaHCAL {
// Odd linking
for (int idx = start; idx < nEdges; idx += gridStride) {
int i = pfrh_edgeId[idx]; // Get edge topo id
//if (pfrh_edgeMask[idx] > 0 && pfrh_passTopoThresh[i] && isLeftEdge(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) {
if (pfrh_edgeMask[idx] > 0 && isLeftEdge(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) {
pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]);
}
}

__syncthreads();

// edgeParent
Expand Down Expand Up @@ -3790,6 +3828,8 @@ namespace PFClusterCudaHCAL {
if (!notDone)
break;

__syncthreads();//!!

if (threadIdx.x == 0) {
notDone = false;
}
Expand Down Expand Up @@ -3824,17 +3864,20 @@ namespace PFClusterCudaHCAL {
}
}
}

if (threadIdx.x == 0)
iter++;

__syncthreads();

} while (notDone);

*topoIter = iter;
#ifdef DEBUG_GPU_HCAL
// if (threadIdx.x == 0) {
// printf("*** Topo clustering converged in %d iterations ***\n", iter);
// } __syncthreads();
// }
// __syncthreads();
#endif
}

Expand Down Expand Up @@ -4391,23 +4434,19 @@ namespace PFClusterCudaHCAL {
::PFClustering::HCAL::OutputDataGPU& outputGPU,
::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
float (&timer)[8]) {
int nRH = inputPFRecHits.size;
//int nRH = 10;
// printf("Now in PFRechitToPFCluster_HCAL_entryPoint with nRH = %d\tnEdges = %d\n", nRH, nEdges);
if (nRH < 1)
return;
cudaProfilerStart();

#ifdef DEBUG_GPU_HCAL
cudaProfilerStart();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, cudaStream);
#endif

int nRH = inputPFRecHits.size;

// Combined seeding & topo clustering thresholds, array initialization

//seedingTopoThreshKernel_HCAL<<<(nRH+63)/64, 128, 0, cudaStream>>>(
seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_energy.get(),
inputPFRecHits.pfrh_x.get(),
Expand All @@ -4427,6 +4466,8 @@ namespace PFClusterCudaHCAL {
outputGPU.topoSeedList.get(),
outputGPU.pfc_iter.get());

cudaCheck(cudaStreamSynchronize(cudaStream));

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
Expand All @@ -4451,6 +4492,7 @@ namespace PFClusterCudaHCAL {
inputPFRecHits.pfrh_neighbours.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get());
cudaCheck(cudaStreamSynchronize(cudaStream));

// prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
Expand Down Expand Up @@ -4489,6 +4531,7 @@ namespace PFClusterCudaHCAL {
//inputGPU.pfrh_edgeMask.get(),
outputGPU.pfrh_passTopoThresh.get(),
outputGPU.topoIter.get());
cudaCheck(cudaStreamSynchronize(cudaStream));

topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
outputGPU.pfrh_topoId.get(),
Expand Down Expand Up @@ -4552,12 +4595,11 @@ namespace PFClusterCudaHCAL {
inputGPU.pfc_prevPos4.get(),
inputGPU.pfc_energy.get(),
outputGPU.pfc_iter.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[3], start, stop);
#endif
cudaProfilerStop();
#endif
}
} // namespace PFClusterCudaHCAL
Loading

0 comments on commit a71d4e8

Please sign in to comment.