From e7f5b13d2e6509470c8aeeba71c783ff87d8f2e9 Mon Sep 17 00:00:00 2001 From: AdrianoDee Date: Thu, 24 Nov 2022 18:34:20 +0100 Subject: [PATCH] Tracks and hits portable (#3) * STUFF that make things work * Use DeviceToDevice copy for cpe params Co-authored-by: Dimitris Papagiannis --- .../interface/SiPixelClustersCUDA.h | 46 +---- .../interface/TrackingRecHitSoADevice.h | 55 ++++-- .../interface/TrackingRecHitSoAHost.h | 30 ++-- .../TrackingRecHit/src/classes_def.xml | 18 +- .../plugins/SiPixelDigisSoAFromCUDA.cc | 36 ++-- .../Configuration/python/HLT_2022v15_cff.py | 2 +- .../Configuration/python/HLT_FULL_cff.py | 2 +- .../Configuration/python/HLT_GRun_cff.py | 2 +- .../Configuration/python/HLT_HIon_cff.py | 2 +- .../Configuration/python/HLT_PRef_cff.py | 2 +- .../Configuration/test/OnLine_HLT_2022v15.py | 2 +- .../Configuration/test/OnLine_HLT_FULL.py | 2 +- .../Configuration/test/OnLine_HLT_GRun.py | 2 +- .../Configuration/test/OnLine_HLT_HIon.py | 2 +- .../Configuration/test/OnLine_HLT_PRef.py | 2 +- .../plugins/PixelRecHitGPUKernel.cu | 30 ++-- .../plugins/SiPixelRecHitCUDA.cc | 12 +- .../plugins/SiPixelRecHitFromCUDA.cc | 15 +- .../plugins/SiPixelRecHitSoAFromLegacy.cc | 165 ++++++++++-------- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 9 +- .../plugins/PixelTrackProducerFromSoA.cc | 3 +- .../plugins/BrokenLineFitOnGPU.cc | 2 +- .../plugins/BrokenLineFitOnGPU.cu | 28 ++- .../plugins/BrokenLineFitOnGPU.h | 44 +++-- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 13 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 13 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 53 +++--- .../plugins/CAHitNtupletGeneratorKernels.h | 8 +- .../CAHitNtupletGeneratorKernelsImpl.h | 14 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 68 ++++++-- .../plugins/CAHitNtupletGeneratorOnGPU.h | 14 +- .../PixelTriplets/plugins/GPUCACell.h | 50 +++--- .../PixelTriplets/plugins/HelixFitOnGPU.h | 9 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 2 +- 
.../PixelTriplets/plugins/RiemannFitOnGPU.cu | 2 +- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 6 +- .../PixelTriplets/plugins/gpuFishbone.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 2 +- .../plugins/gpuPixelDoubletsAlgos.h | 2 +- 39 files changed, 417 insertions(+), 354 deletions(-) diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index f83953bbc3e15..0deabe8fb5034 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -21,19 +21,11 @@ using SiPixelClustersCUDASoA = SiPixelClustersCUDALayout<>; class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection> { public: SiPixelClustersCUDA() = default; - // explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); - explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : PortableDeviceCollection>(maxModules + 1, stream) {} ~SiPixelClustersCUDA() = default; - // // Restrict view - // using RestrictConstView = - // Layout::ConstViewTemplate; - // - // RestrictConstView restrictConstView() const { return RestrictConstView(layout()); } + explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : PortableDeviceCollection>(maxModules + 1, stream) {} - // SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; - // SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; @@ -45,41 +37,7 @@ class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection moduleStart_d; // index of the first pixel of each module - // cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - // cms::cuda::device::unique_ptr moduleId_d; // module id of each module - // - // // originally from rechits - // 
cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - // - // cms::cuda::device::unique_ptr view_d; // "me" pointer - uint32_t nClusters_h = 0; int32_t offsetBPIX2_h = 0; }; diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h index 56ed23b0582fd..08cf5fa39bd52 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -15,19 +15,39 @@ namespace trackingRecHit TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this // Constructor which specifies the SoA size - explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) - : PortableDeviceCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), hitsModuleStart_(hitsModuleStart), offsetBPIX2_(offsetBPIX2) - { - nModules_ = isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - phiBinner_ = &(view().phiBinner()); - // phiBinner_ = cms::cuda::make_device_unique(stream).get(); - cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), hitsModuleStart, sizeof(uint32_t) * int(nModules_ + 1),cudaMemcpyHostToDevice,stream)); - // cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)),cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t),cudaMemcpyHostToDevice,stream)); + explicit TrackingRecHitSoADevice(uint32_t nHits, + bool isPhase2, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : PortableDeviceCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + hitsModuleStart_(hitsModuleStart), + offsetBPIX2_(offsetBPIX2) { + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); - } + nModules_ = isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + phiBinner_ = &(view().phiBinner()); + // phiBinner_ = cms::cuda::make_device_unique(stream).get(); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), + hitsModuleStart, + sizeof(uint32_t) * int(nModules_ + 1), + cudaMemcpyHostToDevice, + stream)); + cudaCheck( + cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + + // cpeParams argument is a pointer to device memory, copy + // its contents into the Layout. + + cudaCheck(cudaMemcpyAsync( + &(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)), cudaMemcpyDeviceToDevice, stream)); + } uint32_t nHits() const { return nHits_; } //go to size of view uint32_t nModules() const { return nModules_; } @@ -35,14 +55,14 @@ namespace trackingRecHit cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); size_t rowSize = sizeof(float) * nHits(); - printf("%d \n",nModules()); - printf("%d \n",nHits()); - cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize * 4, cudaMemcpyDeviceToHost, stream)); + printf("nModules=%d \n", nModules()); + printf("nHits=%d \n", nHits()); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize * 4, cudaMemcpyDeviceToHost, stream)); // cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().yLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 2), view().xerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 3) , view().yerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); return ret; - } //move to 
utilities + } //move to utilities cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const { // printf("%d \n",nModules()); @@ -58,7 +78,8 @@ namespace trackingRecHit private: uint32_t nHits_; //Needed for the host SoA size - pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. + pixelCPEforGPU::ParamsOnGPU const* + cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. uint32_t const* hitsModuleStart_; uint32_t offsetBPIX2_; diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h index 020ddf4d64bf6..8f7381409837a 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -20,18 +20,24 @@ namespace trackingRecHit explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) : PortableHostCollection>(nHits, stream) {} - explicit TrackingRecHitSoAHost(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) - : PortableHostCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), offsetBPIX2_(offsetBPIX2) - { - nModules_ = isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - - view().nHits() = nHits; - view().nMaxModules() = nModules_; - std::copy(hitsModuleStart,hitsModuleStart+nModules_+1,view().hitsModuleStart().begin()); - - view().offsetBPIX2() = offsetBPIX2; - - } + explicit TrackingRecHitSoAHost(uint32_t nHits, + bool isPhase2, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : PortableHostCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + std::cout << "PORCA MADONNA!!!!!!!!!!!!!!!!!" << std::endl; + view().nHits() = nHits; + view().nMaxModules() = nModules_; + std::copy(hitsModuleStart, hitsModuleStart + nModules_ + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(pixelCPEforGPU::ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } uint32_t nHits() const { return nHits_; } uint32_t nModules() const { return nModules_; } diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index b0589a5a46d82..9bb7984550377 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,15 +1,17 @@ - - - - - - - - + + + + + + + + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index ab47d396884f3..cda526f513c9e 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -1,4 +1,5 @@ #include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include 
"DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -27,7 +28,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr store_; + cms::cuda::PortableHostCollection> digis_h; int nDigis_; }; @@ -48,31 +49,22 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, // Do the transfer in a CUDA stream parallel to the computation CUDA stream cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + const auto& digis_d = ctx.get(iEvent, digiGetToken_); - nDigis_ = gpuDigis.nDigis(); - store_ = gpuDigis.copyAllToHostAsync(ctx.stream()); + nDigis_ = digis_d.nDigis(); + digis_h = cms::cuda::PortableHostCollection>(digis_d.view().metadata().size(), ctx.stream()); + cudaCheck(cudaMemcpyAsync( + digis_h.buffer().get(), digis_d.const_buffer().get(), digis_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaGetLastError()); } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - // The following line copies the data from the pinned host memory to - // regular host memory. In principle that feels unnecessary (why not - // just use the pinned host memory?). There are a few arguments for - // doing it though - // - Now can release the pinned host memory back to the (caching) allocator - // * if we'd like to keep the pinned memory, we'd need to also - // keep the CUDA stream around as long as that, or allow pinned - // host memory to be allocated without a CUDA stream - // - What if a CPU algorithm would produce the same SoA? We can't - // use cudaMallocHost without a GPU... 
- - // auto tmp_view = SiPixelDigisCUDASOAView(store_, nDigis_, SiPixelDigisCUDASOAView::StorageLocationHost::kMAX); - SiPixelDigisCUDASOA tmp_layout(store_.get(), nDigis_); - SiPixelDigisCUDASOA::View tmp_view(tmp_layout); - - iEvent.emplace(digiPutToken_, nDigis_, tmp_view.pdigi(), tmp_view.rawIdArr(), tmp_view.adc(), tmp_view.clus()); - - store_.reset(); + iEvent.emplace(digiPutToken_, + nDigis_, + digis_h.view().pdigi(), + digis_h.view().rawIdArr(), + digis_h.view().adc(), + digis_h.view().clus()); } // define as framework plugin diff --git a/HLTrigger/Configuration/python/HLT_2022v15_cff.py b/HLTrigger/Configuration/python/HLT_2022v15_cff.py index c23f766e062ad..8b64caf12cd86 100644 --- a/HLTrigger/Configuration/python/HLT_2022v15_cff.py +++ b/HLTrigger/Configuration/python/HLT_2022v15_cff.py @@ -84802,7 +84802,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_FULL_cff.py b/HLTrigger/Configuration/python/HLT_FULL_cff.py index a1651875c2977..9ec4c92e62855 100644 --- a/HLTrigger/Configuration/python/HLT_FULL_cff.py +++ b/HLTrigger/Configuration/python/HLT_FULL_cff.py @@ -121310,7 +121310,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_GRun_cff.py b/HLTrigger/Configuration/python/HLT_GRun_cff.py index 05dc2f47e700b..522f518ca31ce 100644 --- a/HLTrigger/Configuration/python/HLT_GRun_cff.py +++ 
b/HLTrigger/Configuration/python/HLT_GRun_cff.py @@ -85517,7 +85517,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_HIon_cff.py b/HLTrigger/Configuration/python/HLT_HIon_cff.py index adec53755ea21..5cfb325352d36 100644 --- a/HLTrigger/Configuration/python/HLT_HIon_cff.py +++ b/HLTrigger/Configuration/python/HLT_HIon_cff.py @@ -25503,7 +25503,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/python/HLT_PRef_cff.py b/HLTrigger/Configuration/python/HLT_PRef_cff.py index 1f2eee46fbd4f..2bf1b5a4837d9 100644 --- a/HLTrigger/Configuration/python/HLT_PRef_cff.py +++ b/HLTrigger/Configuration/python/HLT_PRef_cff.py @@ -9248,7 +9248,7 @@ fragment.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py b/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py index 6ad31948f5bfc..06dea8df2ef0a 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_2022v15.py @@ -85070,7 +85070,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = 
cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_FULL.py b/HLTrigger/Configuration/test/OnLine_HLT_FULL.py index 6cb2600166b88..681f518b7daed 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_FULL.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_FULL.py @@ -121947,7 +121947,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_GRun.py b/HLTrigger/Configuration/test/OnLine_HLT_GRun.py index a74141d51598f..9c998d956dadc 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_GRun.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_GRun.py @@ -85785,7 +85785,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_HIon.py b/HLTrigger/Configuration/test/OnLine_HLT_HIon.py index 6ab093bb88ae7..46c253d18521c 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_HIon.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_HIon.py @@ -25771,7 +25771,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), 
+ cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/HLTrigger/Configuration/test/OnLine_HLT_PRef.py b/HLTrigger/Configuration/test/OnLine_HLT_PRef.py index f4bab18c1039b..d827ae488507c 100644 --- a/HLTrigger/Configuration/test/OnLine_HLT_PRef.py +++ b/HLTrigger/Configuration/test/OnLine_HLT_PRef.py @@ -9516,7 +9516,7 @@ process.hltSiPixelRecHitsSoA = SwitchProducerCUDA( cpu = cms.EDAlias( hltSiPixelRecHitsFromLegacy = cms.VPSet( - cms.PSet( type = cms.string( "cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous" ) ), + cms.PSet( type = cms.string( "trackingRecHitTrackingRecHitSoAHost" ) ), cms.PSet( type = cms.string( "uintAsHostProduct" ) ) ) ), diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index f32a36ab2e58b..fe65229867359 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -35,20 +35,27 @@ namespace { namespace pixelgpudetails { - trackingRecHit::TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - bool isPhase2, - cudaStream_t stream) const { + trackingRecHit::TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync( + SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + bool isPhase2, + cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - trackingRecHit::TrackingRecHitSoADevice hits_d(nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + trackingRecHit::TrackingRecHitSoADevice hits_d( + nHits, 
isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); // TrackingRecHit2DGPU hits_d( // nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); // assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules // : phase1PixelTopology::numberOfModules); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events if (activeModulesWithDigis) { @@ -67,7 +74,8 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); + setHitsLayerStart<<<1, 32, 0, stream>>>( + clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), @@ -85,10 +93,10 @@ namespace pixelgpudetails { #endif } } - #ifdef GPU_DEBUG +#ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); - std::cout << "DONE" << std::endl; - #endif + std::cout << "DONE" << std::endl; +#endif return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 5287718bdac05..342ebafe0e6d5 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -66,26 +66,26 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons if (not fcpe) { throw cms::Exception("Configuration") << "SiPixelRecHitSoAFromLegacy can only use a CPE of type PixelCPEFast"; } - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; edm::Handle> hclusters; iEvent.getByToken(token_, hclusters); - + 
std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; cms::cuda::ScopedContextProduce ctx{*hclusters}; auto const& clusters = ctx.get(*hclusters); - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; edm::Handle> hdigis; iEvent.getByToken(tokenDigi_, hdigis); auto const& digis = ctx.get(*hdigis); - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; edm::Handle> hbs; iEvent.getByToken(tBeamSpot, hbs); auto const& bs = ctx.get(*hbs); - + std::cout << "SiPixelRecHitCUDA::produce " << __LINE__ << std::endl; ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync( digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); - std::cout << __LINE__< const& inputDataWrapped = iEvent.get(hitsToken_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - std::cout << __LINE__<getCPUProduct(); - + std::cout << "GAMIESAI 0" << std::endl; const reco::BeamSpot& bs = iEvent.get(bsGetToken_); BeamSpotPOD bsHost; @@ -105,15 +106,14 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // allocate a buffer for the indices of the clusters auto hmsp = std::make_unique(nMaxModules + 1); // hitsModuleStart is a non-owning pointer to the buffer - auto hitsModuleStart = hmsp.get(); + // auto hitsModuleStart = hmsp.get(); // wrap the buffer in a HostProduct auto hms = std::make_unique(std::move(hmsp)); // move the HostProduct to the Event, without reallocating the buffer or affecting hitsModuleStart iEvent.put(tokenModuleStart_, std::move(hms)); - + std::cout << "GAMIESAI 1" << std::endl; // legacy output auto legacyOutput = std::make_unique(); - // storage std::vector xx; std::vector yy; @@ -125,19 +125,21 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - HitModuleStart moduleStart_; 
// index of the first pixel of each module - HitModuleStart clusInModule_; - memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? - memset(&moduleStart_, 0, sizeof(HitModuleStart)); - assert(gpuClustering::maxNumModules + 1 == clusInModule_.size()); - assert(0 == clusInModule_[gpuClustering::maxNumModules]); - uint32_t moduleId_; - moduleStart_[1] = 0; // we run sequentially.... - - // SiPixelClustersCUDA::SiPixelClustersCUDASOAView clusterView{ - // moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; - SiPixelClustersCUDASoA::ConstView clusterView{ - gpuClustering::maxNumModules + 1, moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; + // HitModuleStart moduleStart_; // index of the first pixel of each module + // HitModuleStart clusInModule_; + // memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? + // memset(&moduleStart_, 0, sizeof(HitModuleStart)); + + std::cout << "GAMIESAI 2" << std::endl; + cms::cuda::PortableHostCollection> clusters_h(gpuClustering::maxNumModules + 1, nullptr); + + memset(clusters_h.view().clusInModule(), 0, sizeof(HitModuleStart)); // needed?? + memset(clusters_h.view().moduleStart(), 0, sizeof(HitModuleStart)); + + assert(0 == clusters_h.view()[gpuClustering::maxNumModules].clusInModule()); + + clusters_h.view()[1].moduleStart() = 0; // we run sequentially.... 
+ // fill cluster arrays int numberOfClusters = 0; for (auto const& dsv : input) { @@ -147,25 +149,32 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv auto gind = genericDet->index(); assert(gind < nMaxModules); auto const nclus = dsv.size(); - clusInModule_[gind] = nclus; + clusters_h.view()[gind].clusInModule() = nclus; numberOfClusters += nclus; } - hitsModuleStart[0] = 0; + clusters_h.view()[0].clusModuleStart() = 0; - for (int i = 1, n = nMaxModules + 1; i < n; ++i) - hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; + for (int i = 1, n = nMaxModules + 1; i < n; ++i) { + clusters_h.view()[i].clusModuleStart() = + clusters_h.view()[i - 1].clusModuleStart() + clusters_h.view()[i - 1].clusInModule(); + } - assert(numberOfClusters == int(hitsModuleStart[nMaxModules])); + assert((uint32_t)numberOfClusters == clusters_h.view()[nMaxModules].clusModuleStart()); // output SoA // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1) - auto output = std::make_unique( - numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nModules() == uint32_t(nMaxModules)); + TrackingRecHitSoAHost output = TrackingRecHitSoAHost(numberOfClusters, + isPhase2_, + clusters_h.view()[startBPIX2].clusModuleStart(), + &cpeView, + clusters_h.view().clusModuleStart(), + nullptr); + std::cout << "GAMIESAI 3" << std::endl; + assert(output.nModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); return; @@ -173,11 +182,12 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (convert2Legacy_) legacyOutput->reserve(nMaxModules, numberOfClusters); - + std::cout << "GAMIESAI 4" << std::endl; int numberOfDetUnits = 0; int numberOfHits = 0; for (auto const& dsv : input) { numberOfDetUnits++; + 
std::cout << "GAMIESAI 4a_" << numberOfDetUnits << std::endl; unsigned int detid = dsv.detId(); DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); @@ -186,12 +196,14 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); - assert(clusInModule_[gind] == nclus); + assert(clusters_h.view()[gind].clusInModule() == nclus); if (0 == nclus) continue; // is this really possible? - - auto const fc = hitsModuleStart[gind]; - auto const lc = hitsModuleStart[gind + 1]; + std::cout << "GAMIESAI 4b_" << numberOfDetUnits << std::endl; + auto const fc = clusters_h.view()[gind].clusModuleStart(); + std::cout << "GAMIESAI 4c_" << numberOfDetUnits << std::endl; + auto const lc = clusters_h.view()[gind + 1].clusModuleStart(); + std::cout << "GAMIESAI 4d_" << numberOfDetUnits << std::endl; assert(lc > fc); LogDebug("SiPixelRecHitSoAFromLegacy") << "in det " << gind << ": conv " << nclus << " hits from " << dsv.size() << " legacy clusters" << ' ' << fc << ',' << lc; @@ -201,55 +213,60 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv "WARNING: too many clusters %d in Module %d. 
Only first %d Hits converted\n", nclus, gind, maxHitsInModule); // fill digis - xx.clear(); - yy.clear(); - adc.clear(); - moduleInd.clear(); - clus.clear(); + uint32_t ndigi = 0; + for (auto const& clust : dsv) { + assert(clust.size() > 0); + for (int i = 0, nd = clust.size(); i < nd; ++i) { + ndigi++; + } + } + std::cout << "ndigi=" << ndigi << std::endl; + cms::cuda::PortableHostCollection> digis_h(ndigi, nullptr); + // xx.clear(); + // yy.clear(); + // adc.clear(); + // moduleInd.clear(); + // clus.clear(); clusterRef.clear(); - moduleId_ = gind; + // moduleId_ = gind; + clusters_h.view()[0].moduleId() = gind; uint32_t ic = 0; - uint32_t ndigi = 0; + ndigi = 0; for (auto const& clust : dsv) { assert(clust.size() > 0); for (int i = 0, nd = clust.size(); i < nd; ++i) { auto px = clust.pixel(i); - xx.push_back(px.x); - yy.push_back(px.y); - adc.push_back(px.adc); - moduleInd.push_back(gind); - clus.push_back(ic); + digis_h.view()[ndigi].xx() = px.x; + digis_h.view()[ndigi].yy() = px.y; + digis_h.view()[ndigi].adc() = px.adc; + digis_h.view()[ndigi].moduleId() = gind; + digis_h.view()[ndigi].clus() = ic; ++ndigi; } + std::cout << "GAMIESAI 4g_" << numberOfDetUnits << std::endl; if (convert2Legacy_) clusterRef.emplace_back(edmNew::makeRefTo(hclusters, &clust)); ic++; } assert(nclus == ic); - assert(clus.size() == ndigi); + // assert(clus.size() == ndigi); numberOfHits += nclus; - // filled creates view - // SiPixelDigisCUDASOAView digiView; - // digiView.xx_ = xx.data(); - // digiView.yy_ = yy.data(); - // digiView.adc_ = adc.data(); - // digiView.moduleInd_ = moduleInd.data(); - // digiView.clus_ = clus.data(); - // digiView.pdigi_ = nullptr; - // digiView.rawIdArr_ = nullptr; - // assert(digiView.adc(0) != 0); - SiPixelDigisCUDASOAConstView digiView( - ndigi, clus.data(), nullptr, nullptr, adc.data(), xx.data(), yy.data(), moduleInd.data()); - assert(digiView[0].adc() != 0); + + // SiPixelDigisCUDASOAConstView digiView( + // ndigi, clus.data(), nullptr, nullptr, 
adc.data(), xx.data(), yy.data(), moduleInd.data()); + assert(digis_h.view()[0].adc() != 0); + // we run on blockId.x==0 - gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, clusterView, output->view()); + std::cout << "GAMIESAI 4h_" << numberOfDetUnits << std::endl; + gpuPixelRecHits::getHits(&cpeView, &bsHost, digis_h.view(), ndigi, clusters_h.view(), output.view()); + std::cout << "GAMIESAI 4i_" << numberOfDetUnits << std::endl; // gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()[h].detectorIndex()); + assert(gind == output.view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()[h].detectorIndex()); + assert(gpuClustering::invalidModuleId == output.view()[h].detectorIndex()); if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -258,38 +275,38 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()[h].xLocal(), output->view()[h].yLocal()); - LocalError le(output->view()[h].xerrLocal(), 0, output->view()[h].yerrLocal()); + LocalPoint lp(output.view()[h].xLocal(), output.view()[h].yLocal()); + LocalError le(output.view()[h].xerrLocal(), 0, output.view()[h].yerrLocal()); SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); } } } - + std::cout << "GAMIESAI 5" << std::endl; assert(numberOfHits == numberOfClusters); // fill data structure to support CA const auto nLayers = isPhase2_ ? 
phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->view().hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + std::cout << "GAMIESAI 6_" << i << std::endl; + output.view().hitsLayerStart()[i] = clusters_h.view()[cpeView.layerGeometry().layerStart[i]].clusModuleStart(); LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." << i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] - << " - starts ad cluster: " << output->view()[i].hitsLayerStart() << "\n"; + << " - starts ad cluster: " << output.view()[i].hitsLayerStart() << "\n"; } - - cms::cuda::fillManyFromVector(&(output->view().phiBinner()), + std::cout << "GAMIESAI 7" << std::endl; + cms::cuda::fillManyFromVector(&(output.view().phiBinner()), nLayers, - output->view().iphi(), - output->view().hitsLayerStart().data(), - output->nHits(), + output.view().iphi(), + output.view().hitsLayerStart().data(), + output.view().nHits(), 256, - output->phiBinnerStorage()); - - + output.view().phiBinnerStorage()); + std::cout << "GAMIESAI 8" << std::endl; LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 7c8145cf7ebf1..566fb2629a3af 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -20,10 +20,9 @@ namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, // SiPixelDigisCUDASOAView const digis - SiPixelDigisCUDASOAConstView const digis, + SiPixelDigisCUDASOAConstView digis, int 
numElements, - // SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - SiPixelClustersCUDASoA::ConstView const __restrict__ clusters, + SiPixelClustersCUDASoA::ConstView clusters, trackingRecHitSoA::HitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) @@ -82,8 +81,8 @@ namespace gpuPixelRecHits { #endif #ifdef GPU_DEBUG // if (me % 100 == 1) - if (threadIdx.x == 0) - printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); + if (threadIdx.x == 0) + printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); #endif for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { int nClusInIter = std::min(MaxHitsInIter, endClus - startClus); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 36d3dd8c3dcc7..9db4d09d16f05 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -134,7 +134,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto const &rcs = rechits.data(); auto nhits = rcs.size(); hitmap.resize(nhits, nullptr); - + std::cout << "nHits = " << nhits << std::endl; auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); auto fc = hitsModuleStart; @@ -144,6 +144,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto const &clus = thit.firstClusterRef(); assert(clus.isPixel()); auto i = fc[detI] + clus.pixelCluster().originalId(); + std::cout << "i = " << i << std::endl; if (i >= hitmap.size()) hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module assert(nullptr == hitmap[i]); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index eb4de1485addd..c4ebebc8c6039 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,6 +1,6 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitSoAConstView const &hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitSoAConstView hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); #ifdef BROKENLINE_DEBUG diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 82bccff64ff7e..e9a3be07f3f8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,7 +1,7 @@ #include "BrokenLineFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, +void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { @@ -21,6 +21,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets + std::cout << "BITCH 0_" << offset << std::endl; kernel_BLFastFit<3><<>>(tuples_, tupleMultiplicity_, hv, @@ -32,7 +33,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 3, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 1_" << offset << std::endl; kernel_BLFit<3><<>>(tupleMultiplicity_, bField_, outputSoa_, @@ -41,9 +43,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hits_geGPU.get(), fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); - + 
cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 2_" << offset << std::endl; if (fitNas4_) { // fit all as 4 + std::cout << "BITCH 2a_" << offset << std::endl; kernel_BLFastFit<4><<>>(tuples_, tupleMultiplicity_, hv, @@ -55,7 +59,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 8, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 3_" << offset << std::endl; kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, @@ -63,8 +68,10 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get()); + std::cout << "BITCH 4_" << offset << std::endl; } else { // fit quads + std::cout << "BITCH 2b_" << offset << std::endl; kernel_BLFastFit<4><<>>(tuples_, tupleMultiplicity_, hv, @@ -76,7 +83,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 4, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 5_" << offset << std::endl; kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, @@ -84,6 +92,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get()); + std::cout << "BITCH 6_" << offset << std::endl; // fit penta (all 5) kernel_BLFastFit<5><<>>(tuples_, tupleMultiplicity_, @@ -96,7 +105,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 5, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 7_" << offset << std::endl; kernel_BLFit<5><<<8, blockSize, 0, stream>>>(tupleMultiplicity_, bField_, outputSoa_, @@ -105,6 +115,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hits_geGPU.get(), fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); + std::cout << "BITCH 8_" << offset << std::endl; // fit sexta and above (as 6) kernel_BLFastFit<6><<<4, blockSize, 0, 
stream>>>(tuples_, tupleMultiplicity_, @@ -117,7 +128,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, 8, offset); cudaCheck(cudaGetLastError()); - + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 9_" << offset << std::endl; kernel_BLFit<6><<<4, blockSize, 0, stream>>>(tupleMultiplicity_, bField_, outputSoa_, @@ -126,6 +138,8 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView const &hv, hits_geGPU.get(), fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "BITCH 10_" << offset << std::endl; } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index a8d4be7ca969d..6030019778f91 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -27,7 +27,7 @@ constexpr auto invalidTkId = std::numeric_limits::max(); template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitSoAConstView const &__restrict__ hh, + HitSoAConstView hh, tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, @@ -35,6 +35,8 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, uint32_t nHitsL, uint32_t nHitsH, int32_t offset) { + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; constexpr uint32_t hitsInFit = N; assert(hitsInFit <= nHitsL); @@ -45,18 +47,17 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, assert(foundNtuplets); assert(tupleMultiplicity); - // look in bin for this hit multiplicity - auto local_start = blockIdx.x * blockDim.x + threadIdx.x; int totTK = tupleMultiplicity->end(nHitsH) - tupleMultiplicity->begin(nHitsL); + assert(totTK <= 
int(tupleMultiplicity->size())); assert(totTK >= 0); -#ifdef BROKENLINE_DEBUG + // #ifdef BROKENLINE_DEBUG if (0 == local_start) { printf("%d total Ntuple\n", tupleMultiplicity->size()); printf("%d Ntuple of size %d/%d for %d hits to fit\n", totTK, nHitsL, nHitsH, hitsInFit); } -#endif + // #endif for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { @@ -65,6 +66,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, ptkids[local_idx] = invalidTkId; break; } + // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); assert(tkid < foundNtuplets->nOnes()); @@ -92,10 +94,11 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, // #define YERR_FROM_DC #ifdef YERR_FROM_DC + // try to compute more precise error in y - auto dx = hh.xGlobal(hitId[hitsInFit - 1]) - hh.xGlobal(hitId[0]); - auto dy = hh.yGlobal(hitId[hitsInFit - 1]) - hh.yGlobal(hitId[0]); - auto dz = hh.zGlobal(hitId[hitsInFit - 1]) - hh.zGlobal(hitId[0]); + auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal(); + auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal(); + auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal(); float ux, uy, uz; #endif @@ -111,12 +114,14 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, float ge[6]; #ifdef YERR_FROM_DC + auto const &dp = hh.cpeParams().detParams(hh.detectorIndex(hit)); - auto status = hh.status(hit); + auto status = hh[hit].status(); int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin; assert(qbin >= 0 && qbin < 5); bool nok = (status.isBigY | status.isOneY); // compute cotanbeta and use it to recompute error + dp.frame.rotation().multiply(dx, dy, dz, ux, uy, uz); auto cb = std::abs(uy / uz); int bin = @@ -125,16 +130,17 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ 
foundNtuplets, int high_value = CPEFastParametrisation::kNumErrorBins - 1; // return estimated bin value truncated to [0, 15] bin = std::clamp(bin, low_value, high_value); + float yerr = dp.sigmay[bin] * 1.e-4f; // toCM yerr *= dp.yfact[qbin]; // inflate yerr *= yerr; yerr += dp.apeYY; - yerr = nok ? hh.yerrLocal(hit) : yerr; - dp.frame.toGlobal(hh.xerrLocal(hit), 0, yerr, ge); + yerr = nok ? hh[hit].yerrLocal() : yerr; + dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge); #else - hh.cpeParams() - .detParams(hh.detectorIndex(hit)) - .frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); + + hh.cpeParams().detParams(hh.detectorIndex(hit)).frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); + #endif #ifdef BL_DUMP_HITS @@ -144,16 +150,16 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, local_idx, tkid, hit, - hh.detectorIndex(hit), + hh[hit].detectorIndex(), i, - hh.xGlobal(hit), - hh.yGlobal(hit), - hh.zGlobal(hit)); + hh[hit].xGlobal(), + hh[hit].yGlobal(), + hh[hit].zGlobal()); printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif - hits.col(i) << hh.xGlobal(hit), hh.yGlobal(hit), hh.zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } brokenline::fastFit(hits, fast_fit); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 9d4851010097a..2dd5e92f7583d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -87,15 +87,16 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const auto bf = 1. 
/ es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { - auto hHits = iEvent.getHandle(tokenHitGPU_); + edm::Handle> hHits; + iEvent.getByToken(tokenHitGPU_, hHits); + // auto hHits = iEvent.getHandle(tokenHitGPU_); cms::cuda::ScopedContextProduce ctx{*hHits}; - auto const& hits = ctx.get(*hHits); - - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + auto& hits_d = ctx.get(*hHits); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits_d, bf, ctx.stream())); } else { - auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + auto& hits_h = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits_h, bf)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index cccdaeee50a00..85399ba317b69 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -14,7 +14,7 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { } template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView const&hh, int32_t offsetBPIX2, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream) { uint32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG @@ -82,9 +82,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView const&hh, int3 } template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { // zero tuples cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); @@ -142,9 +140,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView const&hh, } template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { int32_t nhits = hh.metadata().size(); auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); @@ -165,12 +161,10 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView const&hh, cms::cuda::launchFinalize(hitToTupleView_, cudaStream); kernel_fillHitInTracks(tracks_view, device_hitToTuple_.get()); } - // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { kernel_rejectDuplicate( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - kernel_sharedHitCleaner( hh, tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { @@ -197,7 +191,6 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView const&hh, params_.maxNumberOfDoublets_, counters_); } - if (params_.doStats_) { // counters (add flag???) std::lock_guard guard(lock_stat); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 8561f703e527e..8bd2d62bd13ef 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -2,9 +2,7 @@ #include template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { // these are pointer on GPU! 
auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); @@ -130,7 +128,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView const&hh, } template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView const&hh, int32_t offsetBPIX2, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream) { int32_t nhits = hh.metadata().size(); isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), offsetBPIX2}; @@ -145,8 +143,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView const&hh, int3 #endif std::cout << __LINE__ << std::endl; // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique( - std::max(1, nhits - offsetBPIX2), stream); + device_isOuterHitOfCell_ = + cms::cuda::make_device_unique(std::max(1, nhits - offsetBPIX2), stream); assert(device_isOuterHitOfCell_.get()); std::cout << __LINE__ << std::endl; isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), offsetBPIX2}; @@ -220,20 +218,18 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView const&hh, int3 } template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, - TkSoAView tracks_view, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { // these are pointer on GPU! 
auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); int32_t nhits = hh.metadata().size(); - + std::cout << "PSOFOS 1" << std::endl; auto blockSize = 64; // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_classifyTracks<<>>(tracks_view, quality_d, params_.cuts_); - + std::cout << "PSOFOS 2" << std::endl; cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { @@ -242,57 +238,70 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 2.1" << std::endl; } - + std::cout << "PSOFOS 2.2" << std::endl; // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); + std::cout << "PSOFOS 2.3" << std::endl; kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); + std::cout << "PSOFOS 2.4" << std::endl; cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3" << std::endl; + #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif - + std::cout << "PSOFOS 3 new" << std::endl; if (params_.doSharedHitCut_ || params_.doStats_) { + std::cout << "PSOFOS 3.1" << std::endl; // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); + std::cout << "PSOFOS 3.2" << std::endl; numberOfBlocks = nQuadrupletBlocks(blockSize); + std::cout << "PSOFOS 3.3" << std::endl; kernel_countHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3.4" << std::endl; assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3.5" << std::endl; kernel_fillHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); 
cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 3.6" << std::endl; #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif } - + std::cout << "PSOFOS 4" << std::endl; if (params_.doSharedHitCut_) { // mark duplicates (tracks that share at least one hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_rejectDuplicate<<>>( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - + std::cout << "PSOFOS 4.1" << std::endl; kernel_sharedHitCleaner<<>>( hh, tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - + std::cout << "PSOFOS 4.2" << std::endl; if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner<<>>( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + std::cout << "PSOFOS 4.3" << std::endl; } else { kernel_tripletCleaner<<>>( tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + std::cout << "PSOFOS 4.4" << std::endl; } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif } - + std::cout << "PSOFOS 5" << std::endl; if (params_.doStats_) { numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows<<>>(tracks_view, @@ -308,16 +317,19 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, params_.maxNumberOfDoublets_, counters_); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 5.1" << std::endl; } - + std::cout << "PSOFOS 6" << std::endl; if (params_.doStats_) { // counters (add flag???) 
numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 6.1" << std::endl; numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks<<>>(tracks_view, quality_d, counters_); cudaCheck(cudaGetLastError()); + std::cout << "PSOFOS 6.2" << std::endl; } #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -331,13 +343,14 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView const&hh, std::lock_guard guard(lock); ++iev; for (int k = 0; k < 20000; k += 500) { - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh, tracks_view, device_hitToTuple_.get(), k, k + 500, iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(hh, tracks_view, device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); + std::cout << "PSOFOS 7" << std::endl; } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( hh, tracks_view, device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); + std::cout << "PSOFOS 8" << std::endl; // cudaStreamSynchronize(cudaStream); } #endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index d2b2c4b1c77a0..2dce3909adfa5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -32,6 +32,7 @@ namespace cAHitNtupletGenerator { unsigned long long nZeroTrackCells; }; + using HitsView = trackingRecHitSoA::HitSoAView; using HitsConstView = trackingRecHitSoA::HitSoAConstView; // using HitsConstView = TrackingRecHit2DSOAView; @@ -173,6 +174,7 @@ class CAHitNtupletGeneratorKernels { template using unique_ptr = typename Traits::template unique_ptr; + using HitsView = trackingRecHitSoA::HitSoAView; using HitsConstView = 
trackingRecHitSoA::HitSoAConstView; // using HitsOnGPU = TrackingRecHit2DSOAView; // using HitsOnCPU = TrackingRecHit2DHeterogeneous; @@ -191,11 +193,11 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsConstView const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); + void launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void classifyTuples(HitsConstView const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); + void classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void buildDoublets(HitsConstView const& hh, int32_t offsetBPIX2, cudaStream_t stream); + void buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 49cac0dd0026d..8a00aba6b80bb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -26,7 +26,8 @@ // using HitsOnGPU = TrackingRecHit2DSOAView; // using HitsOnCPU = TrackingRecHit2DGPU; -using HitSoAView = trackingRecHitSoA::HitSoAConstView; +using HitSoAView = trackingRecHitSoA::HitSoAView; +using HitSoAConstView = trackingRecHitSoA::HitSoAConstView; using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -269,7 +270,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, cms::cuda::AtomicPairCounter *apc2, // just to zero them, - GPUCACell::HitsConstView const& __restrict__ hh, + GPUCACell::HitsConstView hh, GPUCACell *cells, 
uint32_t const *__restrict__ nCells, gpuPixelDoublets::CellNeighborsVector *cellNeighbors, @@ -334,7 +335,7 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, } // loop on outer cells } -__global__ void kernel_find_ntuplets(GPUCACell::HitsConstView const& __restrict__ hh, +__global__ void kernel_find_ntuplets(GPUCACell::HitsConstView hh, GPUCACell *__restrict__ cells, uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, @@ -547,14 +548,13 @@ __global__ void kernel_fillHitInTracks(TkSoAView tracks_view, // TODO: Make Con } } -__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, HitSoAView const& __restrict__ hh) { +__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, HitSoAConstView hh) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices - // auto const &hh = *hhp; auto nhits = hh.nHits(); for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { assert(tracks_view.hitIndices().content[idx] < nhits); @@ -707,7 +707,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, } } -__global__ void kernel_sharedHitCleaner(HitSoAView const& __restrict__ hh, +__global__ void kernel_sharedHitCleaner(HitSoAConstView hh, TkSoAView tracks_view, int nmin, bool dupPassThrough, @@ -855,7 +855,7 @@ __global__ void kernel_simpleTripletCleaner( } // loop over hits } -__global__ void kernel_print_found_ntuplets(HitSoAView& __restrict__ hh, +__global__ void kernel_print_found_ntuplets(HitSoAView hh, TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 0246d0ecb608e..7fa3f339260fc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -191,27 +191,58 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } } -pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(HitsOnGPU const& hits_d, - float bfield, - cudaStream_t stream) const { +pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync( + trackingRecHit::TrackingRecHitSoADevice const& hits_d, float bfield, cudaStream_t stream) const { pixelTrack::TrackSoADevice tracks(stream); - auto* soa = &tracks; + std::cout << "!!!!" << hits_d.offsetBPIX2() << ", " << hits_d.nHits() << std::endl; CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.setCounters(m_counters); + std::cout << "GAMW THN PANAGIA 3" << std::endl; + kernels.allocateOnGPU(hits_d.nHits(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 4" << std::endl; + + kernels.buildDoublets(hits_d.view(), hits_d.offsetBPIX2(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 5" << std::endl; - kernels.buildDoublets(hits_d.const_view(), hits_d.offsetBPIX2(), stream); - kernels.launchKernels(hits_d.const_view(), soa->view(), stream); + kernels.launchKernels(hits_d.view(), tracks.view(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 6" << std::endl; HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "GAMW THN PANAGIA 7" << std::endl; + + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); + cudaCheck(cudaGetLastError()); + 
cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 8" << std::endl; + if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + std::cout << "makeTuplesAsync checkpoint 8.1a" << std::endl; + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 8.1" << std::endl; } else { - fitter.launchBrokenLineKernels(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + std::cout << "makeTuplesAsync checkpoint 8.2a" << std::endl; + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 8.2" << std::endl; } - kernels.classifyTuples(hits_d.const_view(), soa->view(), stream); + + kernels.classifyTuples(hits_d.view(), tracks.view(), stream); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + std::cout << "makeTuplesAsync checkpoint 9" << std::endl; #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -222,17 +253,18 @@ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(HitsOnGPU return tracks; } -pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, float bfield) const { +pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(trackingRecHit::TrackingRecHitSoAHost const& hits_h, + float bfield) const { pixelTrack::TrackSoAHost tracks(nullptr); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), nullptr); + kernels.allocateOnGPU(hits_h.nHits(), nullptr); - kernels.buildDoublets(hits_d.const_view(), hits_d.offsetBPIX2(), nullptr); - kernels.launchKernels(hits_d.const_view(), tracks.view(), nullptr); + 
kernels.buildDoublets(hits_h.view(), hits_h.offsetBPIX2(), nullptr); + kernels.launchKernels(hits_h.view(), tracks.view(), nullptr); - if (0 == hits_d.nHits()) + if (0 == hits_h.nHits()) return tracks; // now fit @@ -240,12 +272,12 @@ pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_h.view(), hits_h.nHits(), caConstants::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.const_view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_h.view(), hits_h.nHits(), caConstants::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d.const_view(), tracks.view(), nullptr); + kernels.classifyTuples(hits_h.view(), tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 3447e56354ebb..2d46bd09d1e0a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -6,9 +6,9 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include 
"FWCore/ParameterSet/interface/ParameterSet.h" @@ -28,6 +28,8 @@ namespace edm { class CAHitNtupletGeneratorOnGPU { public: + using HitsView = trackingRecHitSoA::HitSoAView; + using HitsConstView = trackingRecHitSoA::HitSoAConstView; using HitsOnGPU = trackingRecHit::TrackingRecHitSoADevice; using HitsOnCPU = trackingRecHit::TrackingRecHitSoAHost; // using hindex_type = TrackingRecHit2DSOAView::hindex_type; @@ -54,19 +56,19 @@ class CAHitNtupletGeneratorOnGPU { void endJob(); // On GPU - pixelTrack::TrackSoADevice makeTuplesAsync(HitsOnGPU const& hits_d, + pixelTrack::TrackSoADevice makeTuplesAsync(trackingRecHit::TrackingRecHitSoADevice const& hits_d, float bfield, cudaStream_t stream) const; // On CPU - pixelTrack::TrackSoAHost makeTuples(HitsOnCPU const& hits_d, float bfield) const; + pixelTrack::TrackSoAHost makeTuples(trackingRecHit::TrackingRecHitSoAHost const& hits_h, float bfield) const; private: - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; + void buildDoublets(HitsConstView hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(HitsConstView hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(HitsConstView hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index a1f0c49b6fa8d..4267d674e1ca8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -47,7 +47,7 @@ class GPUCACell { __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, - HitsConstView const& hh, + HitsConstView hh, int layerPairId, hindex_type innerHitId, hindex_type 
outerHitId) { @@ -113,22 +113,22 @@ class GPUCACell { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float inner_x(HitsConstView const& hh) const { return hh[theInnerHitId].xGlobal(); } - __device__ __forceinline__ float outer_x(HitsConstView const& hh) const { return hh[theOuterHitId].xGlobal(); } - __device__ __forceinline__ float inner_y(HitsConstView const& hh) const { return hh[theInnerHitId].yGlobal(); } - __device__ __forceinline__ float outer_y(HitsConstView const& hh) const { return hh[theOuterHitId].yGlobal(); } - __device__ __forceinline__ float inner_z(HitsConstView const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(HitsConstView hh) const { return hh[theInnerHitId].xGlobal(); } + __device__ __forceinline__ float outer_x(HitsConstView hh) const { return hh[theOuterHitId].xGlobal(); } + __device__ __forceinline__ float inner_y(HitsConstView hh) const { return hh[theInnerHitId].yGlobal(); } + __device__ __forceinline__ float outer_y(HitsConstView hh) const { return hh[theOuterHitId].yGlobal(); } + __device__ __forceinline__ float inner_z(HitsConstView hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float outer_z(HitsConstView const& hh) const { return hh[theOuterHitId].zGlobal(); } - __device__ __forceinline__ float inner_r(HitsConstView const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(HitsConstView hh) const { return hh[theOuterHitId].zGlobal(); } + __device__ __forceinline__ float inner_r(HitsConstView hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float outer_r(HitsConstView const& 
hh) const { return hh[theOuterHitId].rGlobal(); } + __device__ __forceinline__ float outer_r(HitsConstView hh) const { return hh[theOuterHitId].rGlobal(); } - __device__ __forceinline__ auto inner_iphi(HitsConstView const& hh) const { return hh[theInnerHitId].iphi(); } - __device__ __forceinline__ auto outer_iphi(HitsConstView const& hh) const { return hh[theOuterHitId].iphi(); } + __device__ __forceinline__ auto inner_iphi(HitsConstView hh) const { return hh[theInnerHitId].iphi(); } + __device__ __forceinline__ auto outer_iphi(HitsConstView hh) const { return hh[theOuterHitId].iphi(); } - __device__ __forceinline__ float inner_detIndex(HitsConstView const& hh) const { return hh[theInnerHitId].detectorIndex(); } - __device__ __forceinline__ float outer_detIndex(HitsConstView const& hh) const { return hh[theOuterHitId].detectorIndex(); } + __device__ __forceinline__ float inner_detIndex(HitsConstView hh) const { return hh[theInnerHitId].detectorIndex(); } + __device__ __forceinline__ float outer_detIndex(HitsConstView hh) const { return hh[theOuterHitId].detectorIndex(); } constexpr unsigned int inner_hit_id() const { return theInnerHitId; } constexpr unsigned int outer_hit_id() const { return theOuterHitId; } @@ -140,7 +140,7 @@ class GPUCACell { theOuterHitId); } - __device__ bool check_alignment(HitsConstView const& hh, + __device__ bool check_alignment(HitsConstView hh, GPUCACell const& otherCell, const float ptmin, const float hardCurvCut, @@ -187,7 +187,7 @@ class GPUCACell { return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - __device__ inline bool dcaCut(HitsConstView const& hh, + __device__ inline bool dcaCut(HitsConstView hh, GPUCACell const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { @@ -224,7 +224,7 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool 
hole0(HitsConstView const& hh, GPUCACell const& innerCell) const { + __device__ inline bool hole0(HitsConstView hh, GPUCACell const& innerCell) const { using caConstants::first_ladder_bpx0; using caConstants::max_ladder_bpx0; using caConstants::module_length_bpx0; @@ -247,7 +247,7 @@ class GPUCACell { return gap; } - __device__ inline bool hole4(HitsConstView const& hh, GPUCACell const& innerCell) const { + __device__ inline bool hole4(HitsConstView hh, GPUCACell const& innerCell) const { using caConstants::first_ladder_bpx4; using caConstants::max_ladder_bpx4; using caConstants::module_length_bpx4; @@ -275,7 +275,7 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. template - __device__ inline void find_ntuplets(HitsConstView const& hh, + __device__ inline void find_ntuplets(HitsConstView hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, @@ -345,14 +345,14 @@ class GPUCACell { __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } - __device__ __forceinline__ void setFishbone(hindex_type id, float z, HitsConstView const& hh) { + __device__ __forceinline__ void setFishbone(hindex_type id, float z, HitsConstView hh) { // make it deterministic: use the farther apart (in z) auto old = theFishboneId; - while ( - old != - atomicCAS(&theFishboneId, - old, - (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? id : old)) + while (old != + atomicCAS( + &theFishboneId, + old, + (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? 
id : old)) old = theFishboneId; } __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } @@ -373,7 +373,7 @@ class GPUCACell { }; template <> -__device__ inline void GPUCACell::find_ntuplets<0>(HitsConstView const& hh, +__device__ inline void GPUCACell::find_ntuplets<0>(HitsConstView hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index fe618c035f38f..1fdef06b56db1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -33,6 +33,7 @@ namespace riemannFit { class HelixFitOnGPU { public: + using HitSoAView = trackingRecHitSoA::HitSoAView; using HitSoAConstView = trackingRecHitSoA::HitSoAConstView; using Tuples = pixelTrack::HitContainer; @@ -44,11 +45,11 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchRiemannKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples); void allocateOnGPU(TupleMultiplicity 
const *tupleMultiplicity, OutputSoAView outputSoA); void deallocateOnGPU(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index c5dcd9f32f0e9..1b2e08767efa2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,6 +1,6 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitSoAConstView const &hv, uint32_t nhits, uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples) { assert(tuples_); // Fit internals diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index a62cc64cb1fb5..87db9d8f0e71d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,7 +1,7 @@ #include "RiemannFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitSoAConstView const &hv, +void HelixFitOnGPU::launchRiemannKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 1f91172cb6105..f203acb8047ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -22,7 +22,7 @@ template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, - HitSoAConstView const &__restrict__ hh, + HitSoAConstView hh, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -65,9 +65,7 @@ __global__ 
void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hh.xg_d[hit],hh.yg_d[hit],hh.zg_d[hit]); float ge[6]; - hh.cpeParams() - .detParams(hh.detectorIndex(hit)) - .frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); + hh.cpeParams().detParams(hh.detectorIndex(hit)).frame.toGlobal(hh.xerrLocal(hit), 0, hh.yerrLocal(hit), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hh.detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); hits.col(i) << hh.xGlobal(hit), hh.yGlobal(hit), hh.zGlobal(hit); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 6432289fc6cc4..b2b6592e78a74 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -16,7 +16,7 @@ namespace gpuPixelDoublets { - __global__ void fishbone(GPUCACell::HitsConstView const& __restrict__ hh, + __global__ void fishbone(GPUCACell::HitsConstView hh, GPUCACell* cells, uint32_t const* __restrict__ nCells, GPUCACell::OuterHitOfCell const isOuterHitOfCellWrap, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index c51559b530711..e12476d9a4b0d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -99,7 +99,7 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - HitsConstView const& __restrict__ hh, + HitsConstView hh, GPUCACell::OuterHitOfCell isOuterHitOfCell, int nActualPairs, bool ideal_cond, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index b16e08ba5e22a..8bf5eec7a00ce 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -29,7 +29,7 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - HitsConstView const& __restrict__ hh, + HitsConstView hh, GPUCACell::OuterHitOfCell isOuterHitOfCell, int16_t const* __restrict__ phicuts, float const* __restrict__ minz,