From d3e5e681c597a3724331da4f8566c9e6b18503fa Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Wed, 28 Sep 2022 12:31:41 +0200 Subject: [PATCH 001/110] Adding PixelTrack CUDADataFormats header and dummy --- .../interface/TrackSoAHeterogeneousT_test.h | 114 ++++++++++++++++++ .../Track/src/TrackSoAHeterogeneous_t_test.cc | 1 + 2 files changed, 115 insertions(+) create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h create mode 100644 CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h new file mode 100644 index 0000000000000..b6eb21b0835dc --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -0,0 +1,114 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H +#define CUDADataFormats_Track_TrackHeterogeneousT_H + +#include +#include + +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +//#include "DataFormats/Portable/interface/PortableCUDADeviceCollection.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +namespace pixelTrack { + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string const &name) { + auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; + return static_cast(qp); + } +} // namespace pixelTrack + +GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, + 
SOA_COLUMN(uint8_t, quality), + SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt)) + // TODO: maybe add stateAtBS + +template +class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { + +public: + + static constexpr int32_t stride() { return S; } + + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; + using HitContainer = cms::cuda::OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... +private: + +public: + constexpr Quality quality(int32_t i) const { return static_cast(view()[i].quality()); } + constexpr Quality &quality(int32_t i) { return static_cast(view()[i].quality()); } + // TODO: static did not work; using reinterpret_cast + constexpr Quality const *qualityData() const { return reinterpret_cast (view().quality()); } + constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); } + + constexpr int nTracks() const { return nTracks_; } + constexpr void setNTracks(int n) { nTracks_ = n; } + + constexpr int nHits(int i) const { return detIndices.size(i); } + + constexpr bool isTriplet(int i) const { return view()[i].nLayers() == 3; } + + constexpr int computeNumberOfLayers(int32_t i) const { + // layers are in order and we assume tracks are either forward or backward + auto pdet = detIndices.begin(i); + int nl = 1; + auto ol = phase1PixelTopology::getLayer(*pdet); + for (; pdet < detIndices.end(i); ++pdet) { + auto il = phase1PixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoAT stateAtBS; + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { 
return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... + // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; + +private: + int nTracks_; +}; + +namespace pixelTrack { + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using TrackSoA = TrackSoAHeterogeneousT; + using TrajectoryState = TrajectoryStateSoAT; + using HitContainer = TrackSoA::HitContainer; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc new file mode 100644 index 0000000000000..b15debe3cb72b --- /dev/null +++ b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc @@ -0,0 +1 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" From cee33d5c2f89edf396136d13ccfc0b4101876d7c Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Wed, 28 Sep 2022 15:10:25 +0200 Subject: [PATCH 002/110] Adding methods for pt, eta and chi2 --- .../Track/interface/PixelTrackHeterogeneous.h | 5 +++-- .../Track/interface/TrackSoAHeterogeneousT_test.h | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index 3ee5af80353dd..c0e5c99b6fd28 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -2,8 +2,9 @@ #define CUDADataFormats_Track_PixelTrackHeterogeneous_h #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" 
-#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +//#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" using PixelTrackHeterogeneous = HeterogeneousSoA; -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h \ No newline at end of file +#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index b6eb21b0835dc..1cf34f14b30a1 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -37,6 +37,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; + TrackSoAHeterogeneousT() = default; + static constexpr int32_t stride() { return S; } using Quality = pixelTrack::Quality; @@ -54,6 +57,15 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection(view().quality()); } constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); } + constexpr float pt(int32_t i) const { return view()[i].pt(); } + constexpr float &pt(int32_t i) { return view()[i].pt(); } + + constexpr float eta(int32_t i) const { return view()[i].eta(); } + constexpr float &eta(int32_t i) { return view()[i].eta(); } + + constexpr float chi2(int32_t i) const { return view()[i].chi2(); } + constexpr float &chi2(int32_t i) { return view()[i].chi2(); } + constexpr int nTracks() const { return nTracks_; } constexpr void setNTracks(int n) { nTracks_ = n; } From f92a10f5398ac9e42c06355c942d5a55371bc820 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 28 Sep 2022 16:25:53 +0200 Subject: [PATCH 003/110] Test of TrajectoryStateSoAT with macro-generated SoA compiles --- .../interface/TrajectoryStateSoAT_test.h | 63 +++++++++++++++++++ 
.../Track/src/TrajectoryStateSoAT_test.cpp | 1 + 2 files changed, 64 insertions(+) create mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h create mode 100644 CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h new file mode 100644 index 0000000000000..1e561d0131d51 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h @@ -0,0 +1,63 @@ +#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H +#define CUDADataFormats_Track_TrajectoryStateSOAT_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +using Vector5f = Eigen::Matrix; +using Vector15f = Eigen::Matrix; + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; +GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test, + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance)) + +template +struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection> { + static constexpr int32_t stride() { return S; } + + // eigenSoA::MatrixSoA state; + // eigenSoA::MatrixSoA covariance; + + template + __host__ __device__ inline void copyFromCircle( + V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { + view()[i].state() << cp.template cast(), lp.template cast(); + view()[i].state()(2) *= b; // TODO?? 2d access?? 
+ auto cov = view()[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { + view()[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + view()[i].covariance()(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { + v = view()[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = view()[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = view()[i].covariance()(ind++); + } + } +}; + +#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp b/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp new file mode 100644 index 0000000000000..f6b9659331603 --- /dev/null +++ b/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp @@ -0,0 +1 @@ +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h" From b69a948905933d82976e44359f952b4b0cb9ded0 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 29 Sep 2022 16:29:29 +0200 Subject: [PATCH 004/110] Eric's SoA port for Trajectory; might be dumped --- .../Track/interface/TrajectoryStateSoAT.h | 47 ++++++++----- .../plugins/CAHitNtupletGeneratorKernels.cc | 13 ++-- .../plugins/CAHitNtupletGeneratorKernels.cu | 13 ++-- .../CAHitNtupletGeneratorKernelsImpl.h | 67 ++++++++++--------- 4 files changed, 86 insertions(+), 54 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h index 64fcd573a6991..7f710ca67c7b6 
100644 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h @@ -3,26 +3,41 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +using Vector5f = Eigen::Matrix; +using Vector15f = Eigen::Matrix; + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; +GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test, + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance)) template -struct TrajectoryStateSoAT { - using Vector5f = Eigen::Matrix; - using Vector15f = Eigen::Matrix; +struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection> { + static constexpr int32_t stride() { return S; } - using Vector5d = Eigen::Matrix; - using Matrix5d = Eigen::Matrix; + // eigenSoA::MatrixSoA state; + // eigenSoA::MatrixSoA covariance; - static constexpr int32_t stride() { return S; } + // Vector5f state(const int32_t i) const { return view()[i].state(); } + // float* state() const { return view().state(); } // TODO: Return Vector5f* ? + // Vector15f covariance(const int32_t i) const { return view()[i].covariance(); } + // float* covariance() const { return view().covariance(); } // TODO: Return Vector15f* ? 
+ + // Restrict view + // using RestrictConstView = + // Layout::ConstViewTemplate; - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; + // RestrictConstView restrictConstView() const { return RestrictConstView(layout()); } template __host__ __device__ inline void copyFromCircle( V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *= b; - auto cov = covariance(i); + view()[i].state() << cp.template cast(), lp.template cast(); + view()[i].state()(2) *= b; + auto cov = view()[i].covariance(); cov(0) = ccov(0, 0); cov(1) = ccov(0, 1); cov(2) = b * float(ccov(0, 2)); @@ -39,19 +54,19 @@ struct TrajectoryStateSoAT { template __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - state(i) = v.template cast(); + view()[i].state() = v.template cast(); for (int j = 0, ind = 0; j < 5; ++j) for (auto k = j; k < 5; ++k) - covariance(i)(ind++) = cov(j, k); + view()[i].covariance()(ind++) = cov(j, k); } template __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = state(i).template cast(); + v = view()[i].state().template cast(); for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = covariance(i)(ind++); + cov(j, j) = view()[i].covariance()(ind++); for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = covariance(i)(ind++); + cov(k, j) = cov(j, k) = view()[i].covariance()(ind++); } } }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..bc745817d4e4a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -151,7 +151,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto *quality_d = tracks_d->qualityData(); // classify tracks 
based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); + kernel_classifyTracks(tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -159,7 +159,8 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + kernel_fastDuplicateRemover( + device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { @@ -170,8 +171,12 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { - kernel_rejectDuplicate( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + kernel_rejectDuplicate(tracks_d, + tracks_d->stateAtBS.view(), // stateAtBS SoA view + quality_d, + params_.minHitsForSharingCut_, + params_.dupPassThrough_, + device_hitToTuple_.get()); kernel_sharedHitCleaner(hh.view(), tracks_d, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..712d995a6a6cf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -233,7 +233,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); + kernel_classifyTracks<<>>( + tuples_d, tracks_d, tracks_d->stateAtBS.view(), 
params_.cuts_, quality_d); cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { @@ -247,7 +248,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -275,8 +276,12 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share at least one hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_rejectDuplicate<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + kernel_rejectDuplicate<<>>(tracks_d, + tracks_d->stateAtBS.view(), + quality_d, + params_.minHitsForSharingCut_, + params_.dupPassThrough_, + device_hitToTuple_.get()); kernel_sharedHitCleaner<<>>(hh.view(), tracks_d, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index bbe5df891a735..5806f6e6844e2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -14,6 +14,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" #include "CAConstants.h" #include "CAHitNtupletGeneratorKernels.h" @@ -175,10 +176,12 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, } // assume the 
above (so, short tracks already removed) -__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, - uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, - bool dupPassThrough) { +__global__ void kernel_fastDuplicateRemover( + GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + TkSoA *__restrict__ tracks, + cms::cuda::PortableDeviceCollection>::ConstView stateAtBS_view, + bool dupPassThrough) { // quality to mark rejected auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; constexpr auto loose = pixelTrack::Quality::loose; @@ -211,21 +214,21 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, auto qi = tracks->quality(it); if (qi <= reject) continue; - auto opi = tracks->stateAtBS.state(it)(2); - auto e2opi = tracks->stateAtBS.covariance(it)(9); - auto cti = tracks->stateAtBS.state(it)(3); - auto e2cti = tracks->stateAtBS.covariance(it)(12); + auto opi = stateAtBS_view[it].state()(2); + auto e2opi = stateAtBS_view[it].covariance()(9); + auto cti = stateAtBS_view[it].state()(3); + auto e2cti = stateAtBS_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; auto qj = tracks->quality(jt); if (qj <= reject) continue; - auto opj = tracks->stateAtBS.state(jt)(2); - auto ctj = tracks->stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti); + auto opj = stateAtBS_view[jt].state()(2); + auto ctj = stateAtBS_view[jt].state()(3); + auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) < score(jt))) @@ -410,10 +413,12 @@ __global__ void kernel_fillMultiplicity(HitContainer const 
*__restrict__ foundNt } } -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, - Quality *__restrict__ quality) { +__global__ void kernel_classifyTracks( + HitContainer const *__restrict__ tuples, + TkSoA const *__restrict__ tracks, + cms::cuda::PortableDeviceCollection>::ConstView stateAtBS_view, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, + Quality *__restrict__ quality) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = tuples->size(it); @@ -433,7 +438,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(stateAtBS_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG @@ -642,11 +647,13 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, } // mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { +__global__ void kernel_rejectDuplicate( + TkSoA const *__restrict__ ptracks, // TODO: Change to Constview + cms::cuda::PortableDeviceCollection>::ConstView stateAtBS_view, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected auto const reject = dupPassThrough ? 
pixelTrack::Quality::loose : pixelTrack::Quality::dup; @@ -672,22 +679,22 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, auto qi = quality[it]; if (qi <= reject) continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); + auto opi = stateAtBS_view[it].state()(2); + auto e2opi = stateAtBS_view[it].covariance()(9); + auto cti = stateAtBS_view[it].state()(3); + auto e2cti = stateAtBS_view[it].covariance()(12); auto nli = tracks.nLayers(it); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; auto qj = quality[jt]; if (qj <= reject) continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + auto opj = stateAtBS_view[jt].state()(2); + auto ctj = stateAtBS_view[jt].state()(3); + auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; auto nlj = tracks.nLayers(jt); From e8d296eb1a5cf883ec05bc54391b1d416427e31c Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 10 Oct 2022 15:53:21 +0200 Subject: [PATCH 005/110] Merged TrajectorySoAT into TracksSoAHeterogeneousT --- .../interface/TrackSoAHeterogeneousT_test.h | 52 +++++++------ .../Track/interface/TrajectoryStateSoAT.h | 1 - .../Track/src/TrajectoryStateSoAT_test.cpp | 1 - CUDADataFormats/Track/test/BuildFile.xml | 12 --- .../Track/test/TrajectoryStateSOA_t.cpp | 1 - .../Track/test/TrajectoryStateSOA_t.cu | 1 - .../Track/test/TrajectoryStateSOA_t.h | 75 ------------------- .../PixelTriplets/test/BuildFile.xml | 1 + 8 files changed, 29 insertions(+), 115 deletions(-) 
delete mode 100644 CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp delete mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp delete mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu delete mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.h diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 1cf34f14b30a1..aa22332dd0cb1 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -4,7 +4,7 @@ #include #include -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" @@ -24,19 +24,37 @@ namespace pixelTrack { } } // namespace pixelTrack +using Vector5f = Eigen::Matrix; +using Vector15f = Eigen::Matrix; + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(uint8_t, quality), - SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit SOA_COLUMN(int8_t, nLayers), SOA_COLUMN(float, eta), - SOA_COLUMN(float, pt)) - // TODO: maybe add stateAtBS + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance)) -template -class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { +// Previous TrajectoryStateSoAT class methods +namespace pixelTrack { + namespace utilities { + using TrackSoAView = cms::cuda::PortableDeviceCollection>::ConstView; + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + float charge(TrackSoAView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } + float phi(TrackSoAView tracks, 
int32_t i) { return tracks[i].state()(0); } + float tip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(1); } + float zip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(4); } + } // namespace utilities +} // namespace pixelTrack +template +class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { public: - // using cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; @@ -49,13 +67,12 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection(view()[i].quality()); } constexpr Quality &quality(int32_t i) { return static_cast(view()[i].quality()); } // TODO: static did not work; using reinterpret_cast - constexpr Quality const *qualityData() const { return reinterpret_cast (view().quality()); } - constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); } + constexpr Quality const *qualityData() const { return reinterpret_cast(view().quality()); } + constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } constexpr float pt(int32_t i) const { return view()[i].pt(); } constexpr float &pt(int32_t i) { return view()[i].pt(); } @@ -87,19 +104,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection stateAtBS; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - HitContainer hitIndices; HitContainer detIndices; @@ -118,7 +122,7 @@ namespace pixelTrack { #endif using TrackSoA = TrackSoAHeterogeneousT; - using TrajectoryState = TrajectoryStateSoAT; + using HitContainer = TrackSoA::HitContainer; } // namespace pixelTrack diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h index 7f710ca67c7b6..23ff2ce2b1986 100644 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h @@ -1,7 +1,6 @@ #ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H #define CUDADataFormats_Track_TrajectoryStateSOAT_H -#include #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" #include "DataFormats/SoATemplate/interface/SoALayout.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" diff --git a/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp b/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp deleted file mode 100644 index f6b9659331603..0000000000000 --- a/CUDADataFormats/Track/src/TrajectoryStateSoAT_test.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h" diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..985445f1e1b2a 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -5,15 +5,3 @@ - - - - - - - - - - - - diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp deleted file mode 100644 index d6ff539a642b0..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu deleted file mode 100644 
index d6ff539a642b0..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu +++ /dev/null @@ -1 +0,0 @@ -#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h deleted file mode 100644 index 97b88873c2613..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ /dev/null @@ -1,75 +0,0 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" - -using Vector5d = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; - -__host__ __device__ Matrix5d loadCov(Vector5d const& e) { - Matrix5d cov; - for (int i = 0; i < 5; ++i) - cov(i, i) = e(i) * e(i); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined - cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v; - cov(j, i) = cov(i, j); - } - } - return cov; -} - -using TS = TrajectoryStateSoAT<128>; - -__global__ void testTSSoA(TS* pts, int n) { - assert(n <= 128); - - Vector5d par0; - par0 << 0.2, 0.1, 3.5, 0.8, 0.1; - Vector5d e0; - e0 << 0.01, 0.01, 0.035, -0.03, -0.01; - auto cov0 = loadCov(e0); - - TS& ts = *pts; - - int first = threadIdx.x + blockIdx.x * blockDim.x; - - for (int i = first; i < n; i += blockDim.x * gridDim.x) { - ts.copyFromDense(par0, cov0, i); - Vector5d par1; - Matrix5d cov1; - ts.copyToDense(par1, cov1, i); - Vector5d delV = par1 - par0; - Matrix5d delM = cov1 - cov0; - for (int j = 0; j < 5; ++j) { - assert(std::abs(delV(j)) < 1.e-5); - for (auto k = j; k < 5; ++k) { - assert(cov0(k, j) == cov0(j, k)); - assert(cov1(k, j) == cov1(j, k)); - assert(std::abs(delM(k, j)) < 1.e-5); - } - } - } -} - -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#endif - -int main() { -#ifdef __CUDACC__ - cms::cudatest::requireDevices(); -#endif - - TS ts; - -#ifdef __CUDACC__ - TS* ts_d; - 
cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); - testTSSoA<<<1, 64>>>(ts_d, 128); - cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); - cudaCheck(cudaDeviceSynchronize()); -#else - testTSSoA(&ts, 128); -#endif -} diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index d480d7408b9e2..522b186f3351b 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -26,4 +26,5 @@ + From 88e7966e52d018e5b3f67851c0ad53b89b07085c Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 10 Oct 2022 16:08:38 +0200 Subject: [PATCH 006/110] Moving stuff until they break --- .../interface/TrackSoAHeterogeneousT_test.h | 42 +++++++++++++++++++ .../plugins/BrokenLineFitOnGPU.h | 3 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index aa22332dd0cb1..cdbb3bcba7cc1 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -49,6 +49,45 @@ namespace pixelTrack { float phi(TrackSoAView tracks, int32_t i) { return tracks[i].state()(0); } float tip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(1); } float zip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(4); } + + template + __host__ __device__ inline void copyFromCircle( + TrackSoAView tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + 
cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void copyFromDense(TrackSoAView tracks, V5 const &v, M5 const &cov, int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(TrackSoAView tracks, V5 &v, M5 &cov, int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } } // namespace utilities } // namespace pixelTrack @@ -58,6 +97,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; + explicit TrackSoAHeterogeneousT(size_t maxModules, cudaStream_t stream) + : PortableDeviceCollection>(maxModules, stream) {} + static constexpr int32_t stride() { return S; } using Quality = pixelTrack::Quality; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 6ec6afb83cba1..b4d6da45e42f9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -203,7 +203,8 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ brokenline::lineFit(hits_ge, fast_fit, bField, data, line); brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + pixelTrack::utilities::copyFromCircle( + results, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); results->pt(tkid) = float(bField) / 
float(std::abs(circle.par(2))); results->eta(tkid) = asinhf(line.par(0)); results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); From 5604d0b0024ee4442ec016fcd69a040c2b69a1fb Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 10 Oct 2022 16:10:56 +0200 Subject: [PATCH 007/110] Cleanup --- .../Track/interface/TrackSoAHeterogeneousT_test.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index cdbb3bcba7cc1..d671c6c3f22c5 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -7,11 +7,8 @@ #include #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "DataFormats/SoATemplate/interface/SoALayout.h" - -//#include "DataFormats/Portable/interface/PortableCUDADeviceCollection.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" namespace pixelTrack { From a68c296619c86150a486af8e90c8d1c82070e1a4 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 10 Oct 2022 16:20:42 +0200 Subject: [PATCH 008/110] Fixing some calls to TkSoA method calls --- .../PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc | 6 +++--- .../PixelTriplets/plugins/BrokenLineFitOnGPU.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 59ba877e9e626..12899be2c4156 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ 
b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -155,7 +155,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, const auto &tsoa = *iEvent.get(tokenTrack_); auto const *quality = tsoa.qualityData(); - auto const &fit = tsoa.stateAtBS; + // auto const &fit = tsoa.stateAtBS; auto const &hitIndices = tsoa.hitIndices; auto nTracks = tsoa.nTracks(); @@ -190,11 +190,11 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index b4d6da45e42f9..c0046d2888256 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -204,7 +204,7 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); pixelTrack::utilities::copyFromCircle( - results, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results->view(), circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); results->eta(tkid) = asinhf(line.par(0)); results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); From 1fc040be1355d9b628dd04d6ef893266fb7c37b4 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 10 Oct 2022 17:23:50 +0200 Subject: [PATCH 009/110] Switched to 
View instead of ConstView --- CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index d671c6c3f22c5..fb1f248621ebe 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -39,7 +39,7 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, // Previous TrajectoryStateSoAT class methods namespace pixelTrack { namespace utilities { - using TrackSoAView = cms::cuda::PortableDeviceCollection>::ConstView; + using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip float charge(TrackSoAView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } From 47ec5428ffcf862484b1711650e89b5ffea9864e Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 12:03:08 +0200 Subject: [PATCH 010/110] Moving methods to static functions, using View --- .../interface/TrackSoAHeterogeneousT_test.h | 36 ++++++++------ .../plugins/CAHitNtupletGeneratorKernels.cc | 7 ++- .../plugins/CAHitNtupletGeneratorKernels.h | 4 ++ .../CAHitNtupletGeneratorKernelsImpl.h | 49 +++++++++++-------- 4 files changed, 56 insertions(+), 40 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index fb1f248621ebe..695931671c1a5 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -40,12 +40,27 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, namespace pixelTrack { namespace utilities { using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; + using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; // State at the 
Beam spot // phi,tip,1/pt,cotan(theta),zip - float charge(TrackSoAView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } - float phi(TrackSoAView tracks, int32_t i) { return tracks[i].state()(0); } - float tip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(1); } - float zip(TrackSoAView tracks, int32_t i) { return tracks[i].state()(4); } + float charge(TrackSoAConstView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } + + float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } + + float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } + + float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + + float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } + float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } + + float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } + float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } + + float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } + float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } + + bool isTriplet(TrackSoAConstView tracks, int i) { return view[i].nLayers() == 3; } template __host__ __device__ inline void copyFromCircle( @@ -113,22 +128,11 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection(view().quality()); } constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } - constexpr float pt(int32_t i) const { return view()[i].pt(); } - constexpr float &pt(int32_t i) { return view()[i].pt(); } - - constexpr float eta(int32_t i) const { return view()[i].eta(); } - constexpr float &eta(int32_t i) { return view()[i].eta(); } - - constexpr float chi2(int32_t i) const { return view()[i].chi2(); } - constexpr float &chi2(int32_t i) { return view()[i].chi2(); } - constexpr int nTracks() const { return nTracks_; } constexpr 
void setNTracks(int n) { nTracks_ = n; } constexpr int nHits(int i) const { return detIndices.size(i); } - constexpr bool isTriplet(int i) const { return view()[i].nLayers() == 3; } - constexpr int computeNumberOfLayers(int32_t i) const { // layers are in order and we assume tracks are either forward or backward auto pdet = detIndices.begin(i); @@ -161,6 +165,8 @@ namespace pixelTrack { #endif using TrackSoA = TrackSoAHeterogeneousT; + using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; + using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; using HitContainer = TrackSoA::HitContainer; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index bc745817d4e4a..85c9c539593ca 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -148,10 +148,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA int32_t nhits = hh.nHits(); auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d); + kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -160,7 +159,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share a doublet) kernel_fastDuplicateRemover( - device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d, tracks_d->view(), params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { @@ -172,7 +171,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { kernel_rejectDuplicate(tracks_d, - tracks_d->stateAtBS.view(), // stateAtBS SoA view + tracks_d->view(), // stateAtBS SoA view quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..fcab52e96d210 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -35,6 +35,8 @@ namespace cAHitNtupletGenerator { using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; struct QualityCuts { @@ -174,6 +176,8 @@ class CAHitNtupletGeneratorKernels { using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; CAHitNtupletGeneratorKernels(Params const& params) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 5806f6e6844e2..71a823ba23212 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -30,6 +30,8 @@ using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; +using TkSoAView = pixelTrack::TrackSoAView; +using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = 
pixelTrack::HitContainer; namespace { @@ -413,23 +415,26 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt } } -__global__ void kernel_classifyTracks( - HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - cms::cuda::PortableDeviceCollection>::ConstView stateAtBS_view, - CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, - Quality *__restrict__ quality) { +/* + Supply both the original TkSoA and the TkSoAView which contains +the SoA Data + */ +__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const *__restrict__ tracks, + TkSoAView tracks_view, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) { int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = tuples->size(it); if (nhits == 0) break; // guard // if duplicate: not even fit - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -438,16 +443,16 @@ __global__ void kernel_classifyTracks( // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(stateAtBS_view[it].state()(i)); + isNaN |= std::isnan(tracks_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), pixelTrack::utilities::chi2(tracks_view, it)); #endif continue; } - quality[it] = pixelTrack::Quality::strict; + tracks[it].quality() = pixelTrack::Quality::strict; // compute a pT-dependent chi2 cut @@ -473,21 +478,21 @@ __global__ void kernel_classifyTracks( }; // (see CAHitNtupletGeneratorGPU.cc) - float pt = 
std::min(tracks->pt(it), cuts.chi2MaxPt); + float pt = std::min(pixelTrack::utilities::pt(tracks_view, it), cuts.chi2MaxPt); float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { + if (pixelTrack::utilities::chi2(tracks_view, it) >= chi2Cut) { #ifdef NTUPLE_FIT_DEBUG printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", it, tuples->size(it), - tracks->pt(it), - tracks->eta(it), - tracks->chi2(it)); + pixelTrack::utilities::pt(tracks_view, it), + pixelTrack::utilities::eta(tracks_view, it), + pixelTrack::utilities::chi2(tracks_view, it)); #endif continue; } - quality[it] = pixelTrack::Quality::tight; + tracks[it].quality() = pixelTrack::Quality::tight; // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) // default cuts: @@ -495,11 +500,13 @@ __global__ void kernel_classifyTracks( // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm // (see CAHitNtupletGeneratorGPU.cc) auto const ®ion = (nhits > 3) ? 
cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); + bool isOk = (std::abs(pixelTrack::utilities::tip(tracks_view, it)) < region.maxTip) and + (pixelTrack::utilities::pt(tracks_view, it) > region.minPt) and + (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip); - if (isOk) - quality[it] = pixelTrack::Quality::highPurity; + if (isOk) { + tracks[it].quality() = pixelTrack::Quality::highPurity; + } } } From 61fbb0efed60a69f4795d0eb1846bdd1eca44ba5 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 12:30:03 +0200 Subject: [PATCH 011/110] working on Broken line --- .../Track/interface/TrackSoAHeterogeneousT_test.h | 14 +++++++------- .../PixelTriplets/plugins/BrokenLineFitOnGPU.h | 14 ++++++++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 695931671c1a5..d67eef16e9927 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -51,16 +51,16 @@ namespace pixelTrack { float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } - float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } - float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } + // float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } + // // float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } - float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } - float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } + // float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } + // // float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } - 
float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } - float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } + // float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } + // float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } - bool isTriplet(TrackSoAConstView tracks, int i) { return view[i].nLayers() == 3; } + bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } template __host__ __device__ inline void copyFromCircle( diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index c0046d2888256..eda536640af5a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -8,6 +8,7 @@ #include +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -19,6 +20,7 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; +using OutputSoAView = pixelTrack::TrackSoAView; using tindex_type = caConstants::tindex_type; constexpr auto invalidTkId = std::numeric_limits::max(); @@ -169,12 +171,12 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, template __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, + OutputSoAView results_view, tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - assert(results); + // assert(results_view); // Need to be replaced with something that works assert(pfast_fit); // same 
as above... @@ -204,10 +206,10 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); pixelTrack::utilities::copyFromCircle( - results->view(), circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); - results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = asinhf(line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) From 78a500cb4d57db7124bd3fa20a7d424b7d19cca6 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 12:41:38 +0200 Subject: [PATCH 012/110] Working on HelixFit --- .../PixelTriplets/plugins/BrokenLineFitOnGPU.h | 3 +-- RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc | 4 ++-- RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index eda536640af5a..aefde7ac602b1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -19,7 +19,6 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; using OutputSoAView = pixelTrack::TrackSoAView; using tindex_type = caConstants::tindex_type; constexpr auto invalidTkId = std::numeric_limits::max(); @@ -176,7 +175,7 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ double 
*__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - // assert(results_view); // Need to be replaced with something that works + // assert(results_view); // TODO Find equivalent assertion for View assert(pfast_fit); // same as above... diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 880bdb47dfb5c..624934645338b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -3,14 +3,14 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, - OutputSoA *helix_fit_results) { + OutputSoAView helix_fit_results) { tuples_ = tuples; tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; assert(tuples_); assert(tupleMultiplicity_); - assert(outputSoa_); + // assert(outputSoa_); // TODO find equivalent assertion for View } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 9a9c85970af33..031325e2e13d9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -36,7 +36,7 @@ class HelixFitOnGPU { using HitsView = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; - using OutputSoA = pixelTrack::TrackSoA; + using OutputSoAView = pixelTrack::TrackSoAView; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -47,7 +47,7 @@ class HelixFitOnGPU { void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void 
launchiRemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); @@ -59,7 +59,7 @@ class HelixFitOnGPU { // fowarded Tuples const *tuples_ = nullptr; TupleMultiplicity const *tupleMultiplicity_ = nullptr; - OutputSoA *outputSoa_; + OutputSoAView outputSoa_; float bField_; const bool fitNas4_; From bc3f4ae825c825f370afa86cb69197908b35dd65 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 12:56:26 +0200 Subject: [PATCH 013/110] Replacing methods with utilities --- .../CAHitNtupletGeneratorKernelsImpl.h | 38 +++++++++---------- .../PixelTriplets/plugins/HelixFitOnGPU.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 71a823ba23212..58b371932b933 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -162,7 +162,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks.nLayers(it); + auto nl = tracks[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -171,7 +171,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks.nLayers(it) < maxNl) + if (tracks[it].nLayers() < maxNl) quality[it] = reject; //no race: simple assignment of the same constant } } @@ -201,13 +201,13 @@ __global__ void kernel_fastDuplicateRemover( /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return tracks->nLayers(it) < 4 ? 
- std::abs(tracks->tip(it)) : // tip for triplets - tracks->chi2(it); //chi2 for quads + return tracks[it].nLayers() < 4 ? + std::abs(pixelTrack::utilities::tip(tracks,it)) : // tip for triplets + tracks[it].chi2(it); //chi2 for quads }; */ - auto score = [&](auto it) { return std::abs(tracks->tip(it)); }; + auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks, it)); }; // full crazy combinatorics int ntr = thisCell.tracks().size(); @@ -577,7 +577,7 @@ __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::Atomi for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = tracks.nHits(idx); assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + tracks[idx].nLayers() = tracks.computeNumberOfLayers(idx); } } @@ -690,7 +690,7 @@ __global__ void kernel_rejectDuplicate( auto e2opi = stateAtBS_view[it].covariance()(9); auto cti = stateAtBS_view[it].state()(3); auto e2cti = stateAtBS_view[it].covariance()(12); - auto nli = tracks.nLayers(it); + auto nli = tracks[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; auto qj = quality[jt]; @@ -704,7 +704,7 @@ __global__ void kernel_rejectDuplicate( auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks.nLayers(jt); + auto nlj = tracks[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) quality[jt] = reject; else { @@ -745,7 +745,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric if (quality[*it] < longTqual) continue; // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); + auto nl = tracks[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -757,7 +757,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // kill all tracks shorter than maxHl (only triplets??? 
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + auto nl = tracks[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) @@ -893,15 +893,15 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res 10000 * iev + i, int(quality[i]), nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), + tracks[i].nLayers(), + pixelTrack::utilities::charge(tracks, i), + tracks[i].pt(), + tracks[i].eta(), + pixelTrack::utilities::phi(tracks, i), + pixelTrack::utilities::tip(tracks, i), + pixelTrack::utilities::zip(tracks, i), // asinhf(fit_results[i].par(3)), - tracks.chi2(i), + tracks[i].chi2(), hh.zGlobal(*foundNtuplets.begin(i)), hh.zGlobal(*(foundNtuplets.begin(i) + 1)), hh.zGlobal(*(foundNtuplets.begin(i) + 2)), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 031325e2e13d9..d47e4c5f8ece9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -50,7 +50,7 @@ class HelixFitOnGPU { void launchiRemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView *outputSoA); void deallocateOnGPU(); private: From 4e7d9fa4acddebeac6793edecde5df50e8d76ad8 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 13:06:59 +0200 Subject: [PATCH 014/110] rejectDuplicate kernel --- .../plugins/CAHitNtupletGeneratorKernels.cc | 11 +-- .../CAHitNtupletGeneratorKernelsImpl.h 
| 92 +++++++++---------- 2 files changed, 46 insertions(+), 57 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 85c9c539593ca..1271f3f6dcd1a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -158,8 +158,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover( - device_theCells_.get(), device_nCells_, tracks_d, tracks_d->view(), params_.dupPassThrough_); + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { @@ -170,12 +169,8 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { - kernel_rejectDuplicate(tracks_d, - tracks_d->view(), // stateAtBS SoA view - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_rejectDuplicate( + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner(hh.view(), tracks_d, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 58b371932b933..343610993e674 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -178,12 +178,10 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, } // assume the above (so, short tracks already removed) -__global__ void 
kernel_fastDuplicateRemover( - GPUCACell const *__restrict__ cells, - uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, - cms::cuda::PortableDeviceCollection>::ConstView stateAtBS_view, - bool dupPassThrough) { +__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view, + bool dupPassThrough) { // quality to mark rejected auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; constexpr auto loose = pixelTrack::Quality::loose; @@ -201,42 +199,42 @@ __global__ void kernel_fastDuplicateRemover( /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return tracks[it].nLayers() < 4 ? - std::abs(pixelTrack::utilities::tip(tracks,it)) : // tip for triplets - tracks[it].chi2(it); //chi2 for quads + return tracks_view[it].nLayers() < 4 ? + std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets + tracks_view[it].chi2(it); //chi2 for quads }; */ - auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks, it)); }; + auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; // full crazy combinatorics int ntr = thisCell.tracks().size(); for (int i = 0; i < ntr - 1; ++i) { auto it = thisCell.tracks()[i]; - auto qi = tracks->quality(it); + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = stateAtBS_view[it].state()(2); - auto e2opi = stateAtBS_view[it].covariance()(9); - auto cti = stateAtBS_view[it].state()(3); - auto e2cti = stateAtBS_view[it].covariance()(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; - auto qj = tracks->quality(jt); + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto 
opj = stateAtBS_view[jt].state()(2); - auto ctj = stateAtBS_view[jt].state()(3); - auto dct = nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) < score(jt))) - tracks->quality(jt) = reject; + tracks_view[jt].quality() = reject; else { - tracks->quality(it) = reject; + tracks_view[it].quality() = reject; break; } } @@ -245,8 +243,8 @@ __global__ void kernel_fastDuplicateRemover( // find maxQual auto maxQual = reject; // no duplicate! for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > maxQual) - maxQual = tracks->quality(it); + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); } if (maxQual <= loose) @@ -254,7 +252,7 @@ __global__ void kernel_fastDuplicateRemover( // find min score for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { mc = score(it); im = it; } @@ -265,8 +263,8 @@ __global__ void kernel_fastDuplicateRemover( // mark all other duplicates (not yet, keep it loose) for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } } @@ -654,18 +652,14 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, } // mostly for very forward triplets..... 
-__global__ void kernel_rejectDuplicate( - TkSoA const *__restrict__ ptracks, // TODO: Change to Constview - cms::cuda::PortableDeviceCollection>::ConstView stateAtBS_view, - Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { +__global__ void kernel_rejectDuplicate(TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -678,37 +672,37 @@ __global__ void kernel_rejectDuplicate( tracks.chi2(it); //chi2 }; */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; // full combinatorics for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { auto const it = *ip; - auto qi = quality[it]; + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = stateAtBS_view[it].state()(2); - auto e2opi = stateAtBS_view[it].covariance()(9); - auto cti = stateAtBS_view[it].state()(3); - auto e2cti = stateAtBS_view[it].covariance()(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); auto nli = tracks[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; - auto qj = quality[jt]; + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = stateAtBS_view[jt].state()(2); - auto ctj = stateAtBS_view[jt].state()(3); - auto dct = 
nSigma2 * (stateAtBS_view[jt].covariance()(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (stateAtBS_view[jt].covariance()(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks[jt].nLayers(); + auto nlj = tracks_view[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; + tracks_view[jt].quality() = reject; else { - quality[it] = reject; + tracks_view[it].quality() = reject; break; } } From 6a489bfa37689e6a993469b0943ff2bb1424b01a Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 13:08:59 +0200 Subject: [PATCH 015/110] sharedHitContainer kernel --- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 343610993e674..a2bedc3b46ebc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -711,8 +711,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, } __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + TkSoAView tracks_view, int nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { @@ -722,7 +721,6 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric auto const longTqual = pixelTrack::Quality::highPurity; auto &hitToTuple = *phitToTuple; - auto 
const &tracks = *ptracks; auto const &hh = *hhp; int l1end = hh.hitsLayerStart()[1]; @@ -736,10 +734,10 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // find maxNl for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) + if (tracks_view[*it].quality() < longTqual) continue; // if (tracks.nHits(*it)==3) continue; - auto nl = tracks[*it].nLayers(); + auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -751,14 +749,14 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // kill all tracks shorter than maxHl (only triplets??? for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks[*it].nLayers(); + auto nl = tracks_view[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } } From bc0392de78e657d5ba86e3c32d439258bb9ffea1 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:02:24 +0200 Subject: [PATCH 016/110] Triplet cleaner --- .../plugins/CAHitNtupletGeneratorKernels.cc | 4 ++-- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 1271f3f6dcd1a..2b950caec5c6f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -183,7 +183,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, 
device_hitToTuple_.get()); } else { kernel_tripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } } @@ -216,7 +216,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index a2bedc3b46ebc..c6961dd11d571 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -761,8 +761,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric } } -__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, +__global__ void kernel_tripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { @@ -772,7 +771,6 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, auto const good = pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -785,9 +783,9 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + if 
(track_view[*it].quality() <= good) continue; - onlyTriplets &= tracks.isTriplet(*it); + onlyTriplets &= pixelTrack::utilities::isTriplet(tracks_view, *it); if (!onlyTriplets) break; } @@ -799,8 +797,8 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // for triplets choose best tip! (should we first find best quality???) for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) { + mc = std::abs(pixelTrack::utilities::tip(tracks_view, it)); im = it; } } @@ -811,8 +809,8 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits From 13e04d709759ad2ed8512e3e372d646034cb2202 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:04:27 +0200 Subject: [PATCH 017/110] simpleTripletCleaner --- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index c6961dd11d571..938f29101c781 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -817,8 +817,7 @@ __global__ void kernel_tripletCleaner(TkSoAView tracks_view, } __global__ void 
kernel_simpleTripletCleaner( - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { @@ -828,7 +827,6 @@ __global__ void kernel_simpleTripletCleaner( auto const good = pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -841,8 +839,8 @@ __global__ void kernel_simpleTripletCleaner( // choose best tip! (should we first find best quality???) for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) { + mc = std::abs(pixelTrack::utilities::tip(tracks_view, it)); im = it; } } @@ -853,8 +851,8 @@ __global__ void kernel_simpleTripletCleaner( // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && pixelTracks::utilities::isTriplet(tracks_view, it) && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits From 9833fb637fa61ee10c39fba319538da97df2dcd3 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:10:45 +0200 Subject: [PATCH 018/110] fillNLayers --- .../Track/interface/TrackSoAHeterogeneousT_test.h | 3 ++- .../plugins/CAHitNtupletGeneratorKernels.cc | 12 ++++-------- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 6 ++++-- 3 files changed, 10 insertions(+), 11 
deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index d67eef16e9927..f4075b0b385d7 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -34,7 +34,8 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(float, eta), SOA_COLUMN(float, pt), SOA_EIGEN_COLUMN(Vector5f, state), - SOA_EIGEN_COLUMN(Vector15f, covariance)) + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks)) // Previous TrajectoryStateSoAT class methods namespace pixelTrack { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 2b950caec5c6f..4143701fb9cc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -129,7 +129,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, device_hitTuple_apc_); + kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); @@ -172,15 +172,11 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_rejectDuplicate( tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_sharedHitCleaner( + hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, 
params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner( tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 938f29101c781..503f4847f78a3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -565,13 +565,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples } } -__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { +__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, + TkSoAView tracks_view, + cms::cuda::AtomicPairCounter *apc) { auto &tracks = *ptracks; auto first = blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA auto ntracks = std::min(apc->get().m, tracks.stride() - 1); if (0 == first) - tracks.setNTracks(ntracks); + tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = tracks.nHits(idx); assert(nHits >= 3); From a683d86cee7b0e1bd8c7945881c89daefc9c5c9b Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:14:14 +0200 Subject: [PATCH 019/110] earlyDuplicateRemover --- .../PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc | 3 ++- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 8 +++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 4143701fb9cc2..2bbf9edab4e99 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -132,7 +132,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + kernel_earlyDuplicateRemover( + device_theCells_.get(), device_nCells_, tracks_d->view(), quality_d, params_.dupPassThrough_); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 503f4847f78a3..2f83a18c6127d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -142,14 +142,12 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_ // It does not seem to affect efficiency in any way! 
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, uint32_t const *__restrict__ nCells, - TkSoA const *__restrict__ ptracks, + TkSoAConstView tracks_view, Quality *quality, bool dupPassThrough) { // quality to mark rejected constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose - auto const &tracks = *ptracks; - assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -162,7 +160,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks[it].nLayers(); + auto nl = tracks_view[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -171,7 +169,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks[it].nLayers() < maxNl) + if (tracks_view[it].nLayers() < maxNl) quality[it] = reject; //no race: simple assignment of the same constant } } From 293151b3db77330a3e0d9e10817573aa5a206ab3 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:15:08 +0200 Subject: [PATCH 020/110] earlyDuplicateRemover --- .../plugins/#CAHitNtupletGeneratorKernels.cc# | 219 ++++++++++++++++++ .../CAHitNtupletGeneratorKernelsImpl.h | 5 +- 2 files changed, 221 insertions(+), 3 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# diff --git a/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# b/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# new file mode 100644 index 0000000000000..9a8cc40d24985 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# @@ -0,0 +1,219 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" + +#include + +namespace { + // cuda atomics are NOT atomics on CPU so protect 
stat update with a mutex + // waiting for a more general solution (incuding multiple devices) to be proposed and implemented + std::mutex lock_stat; +} // namespace + +template <> +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { + kernel_printCounters(counters); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl; +#endif + + // use "nhits" to heuristically dimension the workspace + + // no need to use the Traits allocations, since we know this is being compiled for the CPU + //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); + assert(device_isOuterHitOfCell_.get()); + isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + + auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); + // no need to use the Traits allocations, since we know this is being compiled for the CPU + //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); + cellStorage_ = std::make_unique(cellStorageSize); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); + + gpuPixelDoublets::initDoublets(isOuterHitOfCell_, + nhits, + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); + + // no need to use the Traits allocations, since we know this is being compiled for the CPU + //device_theCells_ = 
Traits::template make_unique(params_.maxNumberOfDoublets_, stream); + device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); + if (0 == nhits) + return; // protect against empty events + + // take all layer pairs into account + auto nActualPairs = gpuPixelDoublets::nPairs; + if (not params_.includeJumpingForwardDoublets_) { + // exclude forward "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForTriplets; + } + if (params_.minHitsPerNtuplet_ > 3) { + // for quadruplets, exclude all "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + hh.view(), + isOuterHitOfCell_, + nActualPairs, + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto *tuples_d = &tracks_d->hitIndices; + auto *detId_d = &tracks_d->detIndices; + auto *quality_d = tracks_d->qualityData(); + + assert(tuples_d && quality_d); + + // zero tuples + cms::cuda::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + kernel_connect(device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + isOuterHitOfCell_, + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); + + if (nhits > 1 
&& params_.earlyFishbone_) { + gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false); + } + + kernel_find_ntuplets(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_.get(), + tuples_d, + device_hitTuple_apc_, + quality_d, + params_.minHitsPerNtuplet_); + if (params_.doStats_) + kernel_mark_used(device_theCells_.get(), device_nCells_); + + cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + + kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); + kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_); + + // remove duplicates (tracks that share a doublet) + kernel_earlyDuplicateRemover( + device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); + + kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + + if (nhits > 1 && params_.lateFishbone_) { + gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); + } +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + int32_t nhits = hh.nHits(); + + auto const *tuples_d = &tracks_d->hitIndices; + + // classify tracks based on kinematics + kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_); + + if (params_.lateFishbone_) { + // apply fishbone cleaning to good tracks + kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); + + // fill hit->track "map" + if (params_.doSharedHitCut_ || params_.doStats_) { + kernel_countHitInTracks(tuples_d, quality_d, 
device_hitToTuple_.get()); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); + kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + } + + // remove duplicates (tracks that share at least one hit) + if (params_.doSharedHitCut_) { + kernel_rejectDuplicate( + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + + kernel_sharedHitCleaner( + hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + if (params_.useSimpleTripletCleaner_) { + kernel_simpleTripletCleaner( + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + } else { + kernel_tripletCleaner( + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + } + } + + if (params_.doStats_) { + std::lock_guard guard(lock_stat); + kernel_checkOverflows(tuples_d, + device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + isOuterHitOfCell_, + nhits, + params_.maxNumberOfDoublets_, + counters_); + } + + if (params_.doStats_) { + // counters (add flag???) 
+ std::lock_guard guard(lock_stat); + kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); + kernel_doStatsForTracks(tuples_d, quality_d, counters_); + } + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + static std::mutex lock; + { + std::lock_guard guard(lock); + ++iev; + kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev); + } +#endif +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 2f83a18c6127d..b3468cbce9dde 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -142,8 +142,7 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_ // It does not seem to affect efficiency in any way! __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, uint32_t const *__restrict__ nCells, - TkSoAConstView tracks_view, - Quality *quality, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose @@ -170,7 +169,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, for (auto it : thisCell.tracks()) { if (tracks_view[it].nLayers() < maxNl) - quality[it] = reject; //no race: simple assignment of the same constant + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } } From 4312c47b2bcf6eef4a4292928c6a02674fb8da1b Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:16:01 +0200 Subject: [PATCH 021/110] earlyDuplicateRemover --- .../plugins/#CAHitNtupletGeneratorKernels.cc# | 219 ------------------ .../plugins/CAHitNtupletGeneratorKernels.cc | 4 +- 2 files changed, 1 insertion(+), 222 deletions(-) delete mode 100644 
RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# diff --git a/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# b/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# deleted file mode 100644 index 9a8cc40d24985..0000000000000 --- a/RecoPixelVertexing/PixelTriplets/plugins/#CAHitNtupletGeneratorKernels.cc# +++ /dev/null @@ -1,219 +0,0 @@ -#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" - -#include - -namespace { - // cuda atomics are NOT atomics on CPU so protect stat update with a mutex - // waiting for a more general solution (incuding multiple devices) to be proposed and implemented - std::mutex lock_stat; -} // namespace - -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); - -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits. 
BPIX2 offset is " << hh.offsetBPIX2() << std::endl; -#endif - - // use "nhits" to heuristically dimension the workspace - - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); - device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); - assert(device_isOuterHitOfCell_.get()); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - - auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); - cellStorage_ = std::make_unique(cellStorageSize); - device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * - sizeof(GPUCACell::CellNeighbors)); - - gpuPixelDoublets::initDoublets(isOuterHitOfCell_, - nhits, - device_theCellNeighbors_.get(), - device_theCellNeighborsContainer_, - device_theCellTracks_.get(), - device_theCellTracksContainer_); - - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_theCells_ = Traits::template make_unique(params_.maxNumberOfDoublets_, stream); - device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); - if (0 == nhits) - return; // protect against empty events - - // take all layer pairs into account - auto nActualPairs = gpuPixelDoublets::nPairs; - if (not params_.includeJumpingForwardDoublets_) { - // exclude forward "jumping" layer pairs - nActualPairs = gpuPixelDoublets::nPairsForTriplets; - } - if (params_.minHitsPerNtuplet_ > 3) { - // for quadruplets, exclude all "jumping" layer pairs 
- nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - hh.view(), - isOuterHitOfCell_, - nActualPairs, - params_.idealConditions_, - params_.doClusterCut_, - params_.doZ0Cut_, - params_.doPtCut_, - params_.maxNumberOfDoublets_); -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); - - assert(tuples_d && quality_d); - - // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - isOuterHitOfCell_, - params_.hardCurvCut_, - params_.ptmin_, - params_.CAThetaCutBarrel_, - params_.CAThetaCutForward_, - params_.dcaCutInnerTriplet_, - params_.dcaCutOuterTriplet_); - - if (nhits > 1 && params_.earlyFishbone_) { - gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false); - } - - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_.get(), - tuples_d, - device_hitTuple_apc_, - quality_d, - params_.minHitsPerNtuplet_); - if (params_.doStats_) - kernel_mark_used(device_theCells_.get(), device_nCells_); - - cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); - - kernel_fillHitDetIndices(tuples_d, hh.view(), 
detId_d); - kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_); - - // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover( - device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && params_.lateFishbone_) { - gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); - } -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - int32_t nhits = hh.nHits(); - - auto const *tuples_d = &tracks_d->hitIndices; - - // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_); - - if (params_.lateFishbone_) { - // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); - - // fill hit->track "map" - if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - } - - // remove duplicates (tracks that share at least one hit) - if (params_.doSharedHitCut_) { - kernel_rejectDuplicate( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner( - hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - if 
(params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - } else { - kernel_tripletCleaner( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - } - } - - if (params_.doStats_) { - std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitToTuple_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - isOuterHitOfCell_, - nhits, - params_.maxNumberOfDoublets_, - counters_); - } - - if (params_.doStats_) { - // counters (add flag???) - std::lock_guard guard(lock_stat); - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); - } - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - static std::mutex lock; - { - std::lock_guard guard(lock); - ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev); - } -#endif -} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 2bbf9edab4e99..ef6cb4ecc0ea6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -132,9 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover( - device_theCells_.get(), device_nCells_, tracks_d->view(), quality_d, params_.dupPassThrough_); - + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), 
params_.dupPassThrough_); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); From 86bc12aa6eb478f9ee05039124732d4a560794e7 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:19:58 +0200 Subject: [PATCH 022/110] Multiplicity kernels --- .../plugins/CAHitNtupletGeneratorKernels.cc | 2 +- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index ef6cb4ecc0ea6..9ebe0dfe44bdb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -133,7 +133,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index b3468cbce9dde..45ef5009bc44a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -375,16 +375,16 @@ __global__ void kernel_mark_used(GPUCACell 
*__restrict__ cells, uint32_t const * } __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -393,16 +393,16 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN } __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); if (nhits > 7) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); From 9cf0298792504a1f2caa1287fc708ded7d81569b Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 14:22:23 +0200 Subject: [PATCH 023/110] classifyTracks --- .../PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc | 2 +- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 7 +++---- 2 files changed, 4 
insertions(+), 5 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 9ebe0dfe44bdb..e2ec93fbbea86 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -149,7 +149,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto const *tuples_d = &tracks_d->hitIndices; // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, tracks_d->view(), params_.cuts_); + kernel_classifyTracks(tuples_d, tracks_d->view(), params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 45ef5009bc44a..178ba71532b8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -415,7 +415,6 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt the SoA Data */ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) { int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -447,7 +446,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, continue; } - tracks[it].quality() = pixelTrack::Quality::strict; + tracks_view[it].quality() = pixelTrack::Quality::strict; // compute a pT-dependent chi2 cut @@ -487,7 +486,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, continue; } - tracks[it].quality() = pixelTrack::Quality::tight; + tracks_view[it].quality() = 
pixelTrack::Quality::tight; // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) // default cuts: @@ -500,7 +499,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip); if (isOk) { - tracks[it].quality() = pixelTrack::Quality::highPurity; + tracks_view[it].quality() = pixelTrack::Quality::highPurity; } } } From a988e01255f337bb380caf988b5ad8f5aea7dd68 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 16:14:01 +0200 Subject: [PATCH 024/110] Cleanup, print ntuplets --- .../plugins/CAHitNtupletGeneratorKernels.cu | 33 +++++++---------- .../CAHitNtupletGeneratorKernelsImpl.h | 35 +++++++++---------- 2 files changed, 30 insertions(+), 38 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 712d995a6a6cf..d293ad00558fe 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -234,12 +234,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_classifyTracks<<>>( - tuples_d, tracks_d, tracks_d->stateAtBS.view(), params_.cuts_, quality_d); + tuples_d, tracks_d->view(), params_.cuts_, quality_d); cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { - // apply fishbone cleaning to good tracks - numberOfBlocks = nDoubletBlocks(blockSize); + x + // apply fishbone cleaning to good tracks + numberOfBlocks = nDoubletBlocks(blockSize); kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); @@ -248,7 +249,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates 
(tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, tracks_d->stateAtBS.view(), params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -276,26 +277,18 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share at least one hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_rejectDuplicate<<>>(tracks_d, - tracks_d->stateAtBS.view(), - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_rejectDuplicate<<>>( + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - kernel_sharedHitCleaner<<>>(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_sharedHitCleaner<<>>( + hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -342,11 +335,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA ++iev; for (int k = 0; k < 20000; k += 500) { kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - 
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev); + hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev); + hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 178ba71532b8f..7e513856cd9c6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -666,8 +666,8 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, /* chi2 is bad for large pt auto score = [&](auto it, auto nl) { - return nl < 4 ? std::abs(tracks.tip(it)) : // tip for triplets - tracks.chi2(it); //chi2 + return nl < 4 ? 
std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets + pixelTrack::utilities::chi2(tracks_view, it); //chi2 }; */ auto score = [&](auto it, auto nl) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; @@ -682,7 +682,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, auto e2opi = tracks_view[it].covariance()(9); auto cti = tracks_view[it].state()(3); auto e2cti = tracks_view[it].covariance()(12); - auto nli = tracks[it].nLayers(); + auto nli = tracks_view[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; auto qj = tracks_view[jt].quality(); @@ -734,7 +734,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { if (tracks_view[*it].quality() < longTqual) continue; - // if (tracks.nHits(*it)==3) continue; + // if (tracks_view[*it].nHits()==3) continue; auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -781,7 +781,7 @@ __global__ void kernel_tripletCleaner(TkSoAView tracks_view, // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (track_view[*it].quality() <= good) + if (tracks_view[*it].quality() <= good) continue; onlyTriplets &= pixelTrack::utilities::isTriplet(tracks_view, *it); if (!onlyTriplets) @@ -858,8 +858,7 @@ __global__ void kernel_simpleTripletCleaner( __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, + TkSoAConstView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, @@ -867,27 +866,27 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res constexpr auto loose = pixelTrack::Quality::loose; auto const &hh = *hhp; auto const 
&foundNtuplets = *ptuples; - auto const &tracks = *ptracks; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { auto nh = foundNtuplets.size(i); if (nh < 3) continue; - if (quality[i] < loose) + if (tracks_view[i].quality() < loose) continue; printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", 10000 * iev + i, - int(quality[i]), + int(tracks_view[i].quality()), nh, - tracks[i].nLayers(), - pixelTrack::utilities::charge(tracks, i), - tracks[i].pt(), - tracks[i].eta(), - pixelTrack::utilities::phi(tracks, i), - pixelTrack::utilities::tip(tracks, i), - pixelTrack::utilities::zip(tracks, i), + tracks_view[i].nLayers(), + pixelTrack::utilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + pixelTrack::utilities::phi(tracks_view, i), + pixelTrack::utilities::tip(tracks_view, i), + pixelTrack::utilities::zip(tracks_view, i), // asinhf(fit_results[i].par(3)), - tracks[i].chi2(), + tracks_view[i].chi2(), hh.zGlobal(*foundNtuplets.begin(i)), hh.zGlobal(*(foundNtuplets.begin(i) + 1)), hh.zGlobal(*(foundNtuplets.begin(i) + 2)), From f979906d481f873b43640bf1deaece3250a69f07 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 17:11:26 +0200 Subject: [PATCH 025/110] Adapted for quality, must revert to Quality types --- .../plugins/CAHitNtupletGeneratorKernels.cc | 12 ++-- .../plugins/CAHitNtupletGeneratorKernels.cu | 12 ++-- .../CAHitNtupletGeneratorKernelsImpl.h | 69 ++++++++++--------- .../PixelTriplets/plugins/HelixFitOnGPU.h | 2 +- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index e2ec93fbbea86..a34e0f280dd9d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -83,7 +83,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); - assert(tuples_d && quality_d); + // assert(tuples_d && quality_d); // TODO Find equivalent for View // zero tuples cms::cuda::launchZero(tuples_d, cudaStream); @@ -135,7 +135,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); kernel_countMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -147,9 +147,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA int32_t nhits = hh.nHits(); auto const *tuples_d = &tracks_d->hitIndices; - + auto *quality_d = tracks_d->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d->view(), params_.cuts_); + kernel_classifyTracks(tuples_d, tracks_d->view(), quality_d, params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -161,9 +161,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks(tuples_d, device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, 
quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks(tuples_d, device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index d293ad00558fe..c0b953f3b5d10 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -91,13 +91,13 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d, device_hitTuple_apc_); + kernel_fillNLayers<<>>(tracks_d, tracks_d->view(), device_hitTuple_apc_); cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); cudaCheck(cudaGetLastError()); blockSize = 128; @@ -234,13 +234,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_classifyTracks<<>>( - tuples_d, tracks_d->view(), params_.cuts_, quality_d); + tuples_d, tracks_d->view(), quality_d, params_.cuts_); + cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { - x - // apply fishbone cleaning to good tracks - numberOfBlocks = nDoubletBlocks(blockSize); + // apply fishbone cleaning to good tracks + numberOfBlocks = nDoubletBlocks(blockSize); kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 7e513856cd9c6..f38c042ed15c2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -9,6 +9,7 @@ #include #include +#include #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -145,7 +146,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose + constexpr auto reject = (uint8_t)pixelTrack::Quality::edup; /// cannot be loose assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -180,8 +181,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; + auto const reject = dupPassThrough ? 
(uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; + constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; assert(nCells); @@ -382,9 +383,9 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (tracks_view[it].quality() == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) continue; - assert(tracks_view[it].quality() == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad); if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -400,9 +401,9 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (tracks_view[it].quality() == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) continue; - assert(tracks_view[it].quality() == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad); if (nhits > 7) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -416,6 +417,7 @@ the SoA Data */ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, TkSoAView tracks_view, + Quality *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) { int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -425,10 +427,10 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, break; // guard // if duplicate: not even fit - if (tracks_view[it].quality() == pixelTrack::Quality::edup) + if (quality[it] == pixelTrack::Quality::edup) continue; - assert(tracks_view[it].quality() == pixelTrack::Quality::bad); + assert(quality[it] == pixelTrack::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -441,12 +443,12 @@ __global__ void 
kernel_classifyTracks(HitContainer const *__restrict__ tuples, } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), pixelTrack::utilities::chi2(tracks_view, it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks_view[it].chi2()); #endif continue; } - tracks_view[it].quality() = pixelTrack::Quality::strict; + quality[it] = pixelTrack::Quality::strict; // compute a pT-dependent chi2 cut @@ -472,21 +474,21 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, }; // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(pixelTrack::utilities::pt(tracks_view, it), cuts.chi2MaxPt); + float pt = std::min(tracks_view[it].pt(), cuts.chi2MaxPt); float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]); - if (pixelTrack::utilities::chi2(tracks_view, it) >= chi2Cut) { + if (tracks_view[it].chi2() >= chi2Cut) { #ifdef NTUPLE_FIT_DEBUG printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", it, tuples->size(it), - pixelTrack::utilities::pt(tracks_view, it), - pixelTrack::utilities::eta(tracks_view, it), - pixelTrack::utilities::chi2(tracks_view, it)); + tracks_view[it].pt(), + tracks_view[it].eta(), + tracks_view[it].chi2()); #endif continue; } - tracks_view[it].quality() = pixelTrack::Quality::tight; + quality[it] = pixelTrack::Quality::tight; // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) // default cuts: @@ -495,11 +497,11 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // (see CAHitNtupletGeneratorGPU.cc) auto const ®ion = (nhits > 3) ? 
cuts.quadruplet : cuts.triplet; bool isOk = (std::abs(pixelTrack::utilities::tip(tracks_view, it)) < region.maxTip) and - (pixelTrack::utilities::pt(tracks_view, it) > region.minPt) and + (tracks_view[it].pt() > region.minPt) and (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip); if (isOk) { - tracks_view[it].quality() = pixelTrack::Quality::highPurity; + quality[it] = pixelTrack::Quality::highPurity; } } } @@ -521,7 +523,6 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -533,7 +534,6 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, } __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -561,6 +561,9 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples } } +/* + Needs both TkSoA and TkSoAView for accessing SoA, computeNumberOfLayers(), nHits(), stride() + */ __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { @@ -573,7 +576,7 @@ __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = tracks.nHits(idx); assert(nHits >= 3); - tracks[idx].nLayers() = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = tracks.computeNumberOfLayers(idx); } } @@ -630,10 +633,10 @@ __global__ void 
kernel_markSharedHit(int const *__restrict__ nshared, HitContainer const *__restrict__ tuples, Quality *__restrict__ quality, bool dupPassThrough) { - // constexpr auto bad = pixelTrack::Quality::bad; + // constexpr auto bad = (uint8_t)pixelTrack::Quality::bad; constexpr auto dup = pixelTrack::Quality::dup; constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = pixelTrack::Quality::strict; + // constexpr auto strict = (uint8_t)pixelTrack::Quality::strict; // quality to mark rejected auto const reject = dupPassThrough ? loose : dup; @@ -655,7 +658,7 @@ __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; auto &hitToTuple = *phitToTuple; @@ -714,9 +717,9 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? 
(uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; // quality of longest track - auto const longTqual = pixelTrack::Quality::highPurity; + auto const longTqual = (uint8_t)pixelTrack::Quality::highPurity; auto &hitToTuple = *phitToTuple; @@ -764,9 +767,9 @@ __global__ void kernel_tripletCleaner(TkSoAView tracks_view, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = (uint8_t)pixelTrack::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::strict; + auto const good = (uint8_t)pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; @@ -820,9 +823,9 @@ __global__ void kernel_simpleTripletCleaner( bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = (uint8_t)pixelTrack::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::loose; + auto const good = (uint8_t)pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; @@ -849,7 +852,7 @@ __global__ void kernel_simpleTripletCleaner( // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (tracks_view[it].quality() > reject && pixelTracks::utilities::isTriplet(tracks_view, it) && it != im) + if (tracks_view[it].quality() > reject && pixelTrack::utilities::isTriplet(tracks_view, it) && it != im) tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } @@ -863,7 +866,7 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res int32_t firstPrint, int32_t lastPrint, int iev) { - constexpr auto loose = pixelTrack::Quality::loose; + constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; auto const &hh = *hhp; auto const 
&foundNtuplets = *ptuples; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index d47e4c5f8ece9..78d0c2782f17e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -47,7 +47,7 @@ class HelixFitOnGPU { void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchiRemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView *outputSoA); From ffcbc08413fb907dbee60d2370bbf4fa916e46bd Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 17:21:22 +0200 Subject: [PATCH 026/110] Fix instantiation of TrackSoA --- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h | 2 +- RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f650ca8ab2a08..f68a315e1d3c3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -200,7 +200,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH kernels.launchKernels(hits_d, soa, stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - 
fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ae4576d883530..36212298aac2f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -28,7 +28,7 @@ class CAHitNtupletGeneratorOnGPU { using hindex_type = TrackingRecHit2DSOAView::hindex_type; using Quality = pixelTrack::Quality; - using OutputSoA = pixelTrack::TrackSoA; + using OutputSoAView = pixelTrack::TrackSoAView; using HitContainer = pixelTrack::HitContainer; using Tuple = HitContainer; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 78d0c2782f17e..67a180c53e887 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -50,7 +50,7 @@ class HelixFitOnGPU { void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView *outputSoA); + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA); void deallocateOnGPU(); private: From 4e36dbc4e58ee79b559dfa8e8625c3312d96f518 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 17:25:30 +0200 Subject: [PATCH 027/110] riemann fit kernel --- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 19 
++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 926002d674b83..e815a8943d520 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -128,13 +128,13 @@ template __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoAView results_view, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, riemannFit::CircleFit *__restrict__ circle_fit, uint32_t offset) { - assert(results); + // assert(results); // TODO find equivalent for View assert(circle_fit); assert(N <= nHits); @@ -159,11 +159,16 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ riemannFit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); - results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + pixelTrack::utilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", From c4695dad2191eaa1c6be6ff92a5819e757237a77 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 
17:36:45 +0200 Subject: [PATCH 028/110] Fixed allocation call --- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f68a315e1d3c3..a56dcc851c1d5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -235,7 +235,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); From 1f04fbfe056df38dfb79f35671205b322e065e76 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 18:45:53 +0200 Subject: [PATCH 029/110] Several fixes, still breaks --- .../interface/TrackSoAHeterogeneousT_test.h | 29 +++++-------------- .../plugins/PixelTrackProducerFromSoA.cc | 11 +++---- .../plugins/PixelTrackSoAFromCUDA.cc | 2 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 9 +++--- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../PixelTriplets/plugins/RiemannFitOnGPU.h | 4 +-- 6 files changed, 22 insertions(+), 35 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index f4075b0b385d7..289c6d2f8c211 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -44,24 +44,17 @@ namespace pixelTrack { using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; // State at 
the Beam spot // phi,tip,1/pt,cotan(theta),zip - float charge(TrackSoAConstView tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } - - float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } - - float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } - - float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } - // float pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } - // // float &pt(TrackSoAConstView tracks, int32_t i) { return tracks[i].pt(); } + __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } - // float eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } - // // float &eta(TrackSoAConstView tracks, int32_t i) { return tracks[i].eta(); } + __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } - // float chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } - // float &chi2(TrackSoAConstView tracks, int32_t i) { return tracks[i].chi2(); } + __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } - bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } + __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } template __host__ __device__ inline void copyFromCircle( @@ -93,7 +86,7 @@ namespace pixelTrack { } template - __host__ __device__ inline void copyToDense(TrackSoAView tracks, V5 &v, M5 &cov, int32_t i) { + __host__ __device__ inline void copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) { v = tracks[i].state().template cast(); for (int j = 0, ind = 0; j < 5; ++j) { cov(j, j) = tracks[i].covariance()(ind++); @@ -129,9 +122,6 @@ class TrackSoAHeterogeneousT : 
public cms::cuda::PortableDeviceCollection(view().quality()); } constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - constexpr int nHits(int i) const { return detIndices.size(i); } constexpr int computeNumberOfLayers(int32_t i) const { @@ -150,9 +140,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); - std::sort( - sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); }); + std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); @@ -189,12 +190,12 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! 
- float chi2 = tsoa.chi2(it); + float chi2 = tsoa.view()[it].chi2(); float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 5cf4aac491901..57df7ae63b7a0 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -63,7 +63,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i // check that the fixed-size SoA does not overflow auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto nTracks = tsoa.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index c0b953f3b5d10..168ba3b0c8144 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -103,10 +103,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); 
cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -259,14 +259,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks<<>>(tuples_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks<<>>(tuples_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index a56dcc851c1d5..4a5689f572e47 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -252,7 +252,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC // check that the fixed-size SoA does not overflow auto const& tsoa = *soa; auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto nTracks = tsoa.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index e815a8943d520..e511680bf76b7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -16,7 +16,7 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; +using OutputSoAView = pixelTrack::TrackSoAView; template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, @@ -149,7 +149,7 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ break; // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); From 345b22e5f5782c8290c3aa4c46fdb00f84c2fe2d Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 11 Oct 2022 18:59:01 +0200 Subject: [PATCH 030/110] Removed unused accessors --- CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 289c6d2f8c211..667317ff8174c 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -116,8 +116,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection(view()[i].quality()); } - constexpr Quality &quality(int32_t i) { return static_cast(view()[i].quality()); } // TODO: static did not work; using reinterpret_cast constexpr Quality const *qualityData() const { return reinterpret_cast(view().quality()); } constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } 
From 70ae74f038764a63ff0085b6895f508e0568c90b Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 12 Oct 2022 11:11:11 +0200 Subject: [PATCH 031/110] Removed unused file --- .../Track/interface/TrajectoryStateSoAT.h | 73 ------------------- .../interface/TrajectoryStateSoAT_test.h | 63 ---------------- 2 files changed, 136 deletions(-) delete mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT.h delete mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h deleted file mode 100644 index 23ff2ce2b1986..0000000000000 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" -#include "DataFormats/SoATemplate/interface/SoALayout.h" -#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" -using Vector5f = Eigen::Matrix; -using Vector15f = Eigen::Matrix; - -using Vector5d = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; -GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test, - SOA_EIGEN_COLUMN(Vector5f, state), - SOA_EIGEN_COLUMN(Vector15f, covariance)) - -template -struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection> { - static constexpr int32_t stride() { return S; } - - // eigenSoA::MatrixSoA state; - // eigenSoA::MatrixSoA covariance; - - // Vector5f state(const int32_t i) const { return view()[i].state(); } - // float* state() const { return view().state(); } // TODO: Return Vector5f* ? - // Vector15f covariance(const int32_t i) const { return view()[i].covariance(); } - // float* covariance() const { return view().covariance(); } // TODO: Return Vector15f* ? 
- - // Restrict view - // using RestrictConstView = - // Layout::ConstViewTemplate; - - // RestrictConstView restrictConstView() const { return RestrictConstView(layout()); } - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - view()[i].state() << cp.template cast(), lp.template cast(); - view()[i].state()(2) *= b; - auto cov = view()[i].covariance(); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - view()[i].state() = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - view()[i].covariance()(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = view()[i].state().template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = view()[i].covariance()(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = view()[i].covariance()(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h deleted file mode 100644 index 1e561d0131d51..0000000000000 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT_test.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" -#include "DataFormats/SoATemplate/interface/SoALayout.h" -#include 
"CUDADataFormats/Common/interface/PortableDeviceCollection.h" -using Vector5f = Eigen::Matrix; -using Vector15f = Eigen::Matrix; - -using Vector5d = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; -GENERATE_SOA_LAYOUT(TrajectoryStateSoAT_test, - SOA_EIGEN_COLUMN(Vector5f, state), - SOA_EIGEN_COLUMN(Vector15f, covariance)) - -template -struct TrajectoryStateSoAT : public cms::cuda::PortableDeviceCollection> { - static constexpr int32_t stride() { return S; } - - // eigenSoA::MatrixSoA state; - // eigenSoA::MatrixSoA covariance; - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - view()[i].state() << cp.template cast(), lp.template cast(); - view()[i].state()(2) *= b; // TODO?? 2d access?? - auto cov = view()[i].covariance(); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - view()[i].state() = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - view()[i].covariance()(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = view()[i].state().template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = view()[i].covariance()(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = view()[i].covariance()(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H From 5e04512520888c845e105451d6a8823c2683bc89 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 12 Oct 2022 11:19:33 +0200 Subject: [PATCH 032/110] Added a simple test for 
trivially_constructible classes --- CUDADataFormats/Track/test/BuildFile.xml | 3 ++- .../Track/test/trivially_constructible.cpp | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 CUDADataFormats/Track/test/trivially_constructible.cpp diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index 985445f1e1b2a..4da6a1a0bb38d 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,5 +1,6 @@ - + + diff --git a/CUDADataFormats/Track/test/trivially_constructible.cpp b/CUDADataFormats/Track/test/trivially_constructible.cpp new file mode 100644 index 0000000000000..f560f5ce58faa --- /dev/null +++ b/CUDADataFormats/Track/test/trivially_constructible.cpp @@ -0,0 +1,18 @@ +#include +#include +#include +#include +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +using namespace std; + +int main() { + std::cout << "pixelTrack::TrackSoA trivially constructible: " + << std::is_trivially_constructible::value << std::endl; + + std::cout << "cms::cuda::PortableDeviceCollection> trivially constructible: " + << std::is_trivially_constructible>>::value + << std::endl; + + return 0; +} From f72cf64866eab45e47b6657be4aa3d5b1c4fd946 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 12 Oct 2022 12:01:59 +0200 Subject: [PATCH 033/110] Added more tests --- CUDADataFormats/Track/test/BuildFile.xml | 4 +++- ...e.cpp => trivially_constructible_eric_soa.cpp} | 5 +++-- .../test/trivially_constructible_manual_soa.cpp | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) rename CUDADataFormats/Track/test/{trivially_constructible.cpp => trivially_constructible_eric_soa.cpp} (78%) create mode 100644 CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp diff --git a/CUDADataFormats/Track/test/BuildFile.xml 
b/CUDADataFormats/Track/test/BuildFile.xml index 4da6a1a0bb38d..3d216c02efa2c 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,5 +1,7 @@ - + + + diff --git a/CUDADataFormats/Track/test/trivially_constructible.cpp b/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp similarity index 78% rename from CUDADataFormats/Track/test/trivially_constructible.cpp rename to CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp index f560f5ce58faa..3c9e60df70024 100644 --- a/CUDADataFormats/Track/test/trivially_constructible.cpp +++ b/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp @@ -3,11 +3,12 @@ #include #include #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" + using namespace std; int main() { - std::cout << "pixelTrack::TrackSoA trivially constructible: " + std::cout << "pixelTrack::TrackSoA with Eric's SoA, trivially constructible: " << std::is_trivially_constructible::value << std::endl; std::cout << "cms::cuda::PortableDeviceCollection> trivially constructible: " diff --git a/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp b/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp new file mode 100644 index 0000000000000..c275ca6e414f9 --- /dev/null +++ b/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp @@ -0,0 +1,15 @@ +#include +#include +#include +#include +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" + +using namespace std; + +int main() { + std::cout << "pixelTrack::TrackSoA with manually defined SoA, trivially constructible: " + << std::is_trivially_constructible::value << std::endl; + + return 0; +} From 04aa3e3fefd5cd155c571a67429746fb987d55c2 Mon Sep 17 00:00:00 
2001 From: Dimitris Papagiannis Date: Thu, 13 Oct 2022 12:05:48 +0200 Subject: [PATCH 034/110] Added test for class instantiation --- .../interface/TrackSoAHeterogeneousT_test.h | 5 +- .../Track/test/TrackSoAHeterogeneous_test.cpp | 48 +++++++++++++++++++ .../Track/test/TrackSoAHeterogeneous_test.cu | 29 +++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 1cf34f14b30a1..c9d7e71bd556f 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -30,7 +30,7 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(int8_t, nLayers), SOA_COLUMN(float, eta), SOA_COLUMN(float, pt)) - // TODO: maybe add stateAtBS + // TODO: maybe add stateAtBS template class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { @@ -118,6 +118,9 @@ namespace pixelTrack { #endif using TrackSoA = TrackSoAHeterogeneousT; + using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; + using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; + using TrajectoryState = TrajectoryStateSoAT; using HitContainer = TrackSoA::HitContainer; diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..ac4e9978cc12f --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,48 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackSoAHeterogeneousT { + + void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize); + +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + auto soaSize = 200; + // inner scope to deallocate memory before destroying the stream + { + /*TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream); + testTrackingRecHit2D::runKernels(tkhit.view()); + + TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream); + testTrackingRecHit2D::runKernels(tkhitPhase2.view()); + + TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit); + cudaStreamSynchronize(stream); + assert(tkhitH.view()); + assert(tkhitH.view()->nHits() == unsigned(nHits)); + assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules); + + TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit); + cudaStreamSynchronize(stream); + assert(tkhitHPhase2.view()); + assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); + assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules);*/ + + pixelTrack::TrackSoA tracks; + testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); + std::cout << typeid(tracks.view()).name() << std::endl; + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..2e1d56a278eb4 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,29 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" + +namespace testTrackSoAHeterogeneousT { + + __global__ void fill(pixelTrack::TrackSoAView tracks, 
uint32_t soaSize) { + assert(tracks); + + int i = threadIdx.x; + if (i > soaSize) + return; + tracks[i].pt() = (float) i; + } + + __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) { + assert(tracks); + + int i = threadIdx.x; + if (i > soaSize) + return; + assert(tracks[i].pt() == (float) i) + } + + void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { + assert(tracks); + fill<<<1, 1024>>>(tracks, soaSize); + verify<<<1, 1024>>>(tracks, soaSize); + } + +} // namespace testTrackingRecHit2D From 31b4769b0f3d6f65309e7f4e6b7245deab4e74c3 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 13 Oct 2022 13:55:31 +0200 Subject: [PATCH 035/110] Tests run, illegal memory access --- .../interface/TrackSoAHeterogeneousT_test.h | 16 +++++----- CUDADataFormats/Track/test/BuildFile.xml | 6 ++++ .../Track/test/TrackSoAHeterogeneous_test.cpp | 26 ++-------------- .../Track/test/TrackSoAHeterogeneous_test.cu | 30 ++++++++++--------- 4 files changed, 31 insertions(+), 47 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index c9d7e71bd556f..8a2778e10aaec 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -26,17 +26,15 @@ namespace pixelTrack { GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(uint8_t, quality), - SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit SOA_COLUMN(int8_t, nLayers), SOA_COLUMN(float, eta), SOA_COLUMN(float, pt)) - // TODO: maybe add stateAtBS +// TODO: maybe add stateAtBS template -class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { - +class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { public: - // using 
cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; @@ -52,19 +50,19 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection(view()[i].quality()); } - constexpr Quality &quality(int32_t i) { return static_cast(view()[i].quality()); } + // constexpr Quality &quality(int32_t i) { return static_cast(view()[i].quality()); } // TODO: static did not work; using reinterpret_cast constexpr Quality const *qualityData() const { return reinterpret_cast (view().quality()); } constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); } constexpr float pt(int32_t i) const { return view()[i].pt(); } - constexpr float &pt(int32_t i) { return view()[i].pt(); } + // constexpr float &pt(int32_t i) { return view()[i].pt(); } constexpr float eta(int32_t i) const { return view()[i].eta(); } - constexpr float &eta(int32_t i) { return view()[i].eta(); } + // constexpr float &eta(int32_t i) { return view()[i].eta(); } constexpr float chi2(int32_t i) const { return view()[i].chi2(); } - constexpr float &chi2(int32_t i) { return view()[i].chi2(); } + // constexpr float &chi2(int32_t i) { return view()[i].chi2(); } constexpr int nTracks() const { return nTracks_; } constexpr void setNTracks(int n) { nTracks_ = n; } diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..dd2d980859f99 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -17,3 +17,9 @@ + + + + + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index ac4e9978cc12f..c34bda0806111 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -5,9 +5,7 @@ #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" namespace testTrackSoAHeterogeneousT { - - 
void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize); - + void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize); } int main() { @@ -16,32 +14,12 @@ int main() { cudaStream_t stream; cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - auto soaSize = 200; + const auto soaSize = 256; // inner scope to deallocate memory before destroying the stream { - /*TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhit.view()); - - TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhitPhase2.view()); - - TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit); - cudaStreamSynchronize(stream); - assert(tkhitH.view()); - assert(tkhitH.view()->nHits() == unsigned(nHits)); - assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules); - - TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit); - cudaStreamSynchronize(stream); - assert(tkhitHPhase2.view()); - assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); - assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules);*/ - pixelTrack::TrackSoA tracks; testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); - std::cout << typeid(tracks.view()).name() << std::endl; } - cudaCheck(cudaStreamDestroy(stream)); return 0; diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 2e1d56a278eb4..162fd2448ea6e 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -1,29 +1,31 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" - +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" namespace testTrackSoAHeterogeneousT { - __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t 
soaSize) { - assert(tracks); - + __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { int i = threadIdx.x; - if (i > soaSize) + if (i >= soaSize) return; - tracks[i].pt() = (float) i; + tracks[i].pt() = (float)i; } - __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) { - assert(tracks); - + __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) { int i = threadIdx.x; - if (i > soaSize) + if (i >= soaSize) return; - assert(tracks[i].pt() == (float) i) + assert(tracks[i].pt() == (float)i); } - void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { - assert(tracks); + void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { fill<<<1, 1024>>>(tracks, soaSize); + cudaError_t cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); + verify<<<1, 1024>>>(tracks, soaSize); + cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); } -} // namespace testTrackingRecHit2D +} // namespace testTrackSoAHeterogeneousT From 00380b8c4b8f3263265a924bbfae2ed7548bc363 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 13 Oct 2022 17:17:24 +0200 Subject: [PATCH 036/110] Fixed tests, verified SoA works --- CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 2 ++ CUDADataFormats/Track/test/BuildFile.xml | 3 ++- CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 8a2778e10aaec..7880bf728a91e 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -37,6 +37,8 @@ class 
TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; + explicit TrackSoAHeterogeneousT(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} static constexpr int32_t stride() { return S; } diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index dd2d980859f99..885760aad6a36 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -21,5 +21,6 @@ - + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index c34bda0806111..244d2a35f94d4 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -17,7 +17,8 @@ int main() { const auto soaSize = 256; // inner scope to deallocate memory before destroying the stream { - pixelTrack::TrackSoA tracks; + // pixelTrack::TrackSoA tracks; + TrackSoAHeterogeneousT tracks(stream); testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); } cudaCheck(cudaStreamDestroy(stream)); From f33f1357e1099f247296a28e49864e91707ea48b Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 13 Oct 2022 18:02:44 +0200 Subject: [PATCH 037/110] FIXED EVERYTHING --- .../interface/TrackSoAHeterogeneousT_test.h | 24 ++++++++------ CUDADataFormats/Track/test/BuildFile.xml | 12 ++++--- .../Track/test/TrackSoAHeterogeneous_test.cpp | 27 ++++++++++++++++ .../Track/test/TrackSoAHeterogeneous_test.cu | 31 +++++++++++++++++++ .../test/trivially_constructible_eric_soa.cpp | 19 ------------ .../trivially_constructible_manual_soa.cpp | 15 --------- 6 files changed, 81 insertions(+), 47 deletions(-) create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp create mode 100644 CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu delete mode 100644 
CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp delete mode 100644 CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 667317ff8174c..68028c0ed92dc 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -44,17 +44,23 @@ namespace pixelTrack { using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip - __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) { - return std::copysign(1.f, tracks[i].state()(2)); - } + // __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) { + // return std::copysign(1.f, tracks[i].state()(2)); + // } + + // __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } - __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } + // __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } - __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } + // __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } - __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + // __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } - __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } +#define phi(tracks, i) (tracks[i].state()(0)) +#define tip(tracks, i) (tracks[i].state()(1)) +#define charge(tracks, i) (tracks[i].state()(2)) +#define zip(tracks, i) (tracks[i].state()(4)) +#define isTriplet(tracks, i) (tracks[i].nLayers() == 3) 
template __host__ __device__ inline void copyFromCircle( @@ -103,8 +109,8 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; - explicit TrackSoAHeterogeneousT(size_t maxModules, cudaStream_t stream) - : PortableDeviceCollection>(maxModules, stream) {} + explicit TrackSoAHeterogeneousT(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} static constexpr int32_t stride() { return S; } diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index 3d216c02efa2c..8c16498888e4e 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,10 +1,14 @@ - - - - + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..244d2a35f94d4 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,27 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackSoAHeterogeneousT { + void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize); +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + const auto soaSize = 256; + // inner scope to deallocate memory before destroying the stream + { + // pixelTrack::TrackSoA tracks; + TrackSoAHeterogeneousT tracks(stream); + testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); + } + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git 
a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..162fd2448ea6e --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,31 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +namespace testTrackSoAHeterogeneousT { + + __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { + int i = threadIdx.x; + if (i >= soaSize) + return; + tracks[i].pt() = (float)i; + } + + __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) { + int i = threadIdx.x; + if (i >= soaSize) + return; + assert(tracks[i].pt() == (float)i); + } + + void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { + fill<<<1, 1024>>>(tracks, soaSize); + cudaError_t cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); + + verify<<<1, 1024>>>(tracks, soaSize); + cudaerr = cudaDeviceSynchronize(); + if (cudaerr != cudaSuccess) + printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); + } + +} // namespace testTrackSoAHeterogeneousT diff --git a/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp b/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp deleted file mode 100644 index 3c9e60df70024..0000000000000 --- a/CUDADataFormats/Track/test/trivially_constructible_eric_soa.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include -#include -#include -#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" - -using namespace std; - -int main() { - std::cout << "pixelTrack::TrackSoA with Eric's SoA, trivially constructible: " - << std::is_trivially_constructible::value << std::endl; - - 
std::cout << "cms::cuda::PortableDeviceCollection> trivially constructible: " - << std::is_trivially_constructible>>::value - << std::endl; - - return 0; -} diff --git a/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp b/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp deleted file mode 100644 index c275ca6e414f9..0000000000000 --- a/CUDADataFormats/Track/test/trivially_constructible_manual_soa.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include -#include -#include -#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" - -using namespace std; - -int main() { - std::cout << "pixelTrack::TrackSoA with manually defined SoA, trivially constructible: " - << std::is_trivially_constructible::value << std::endl; - - return 0; -} From 63991bc93179170113c0321c4810ef6a10013e8f Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 17 Oct 2022 12:27:42 +0200 Subject: [PATCH 038/110] Made static functions inline --- .../interface/TrackSoAHeterogeneousT_test.h | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 68028c0ed92dc..cfb895f0c40b6 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -44,23 +44,17 @@ namespace pixelTrack { using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip - // __host__ __device__ float charge(TrackSoAConstView tracks, int32_t i) { - // return std::copysign(1.f, tracks[i].state()(2)); - // } - - // __host__ __device__ float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } + __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) { + return 
std::copysign(1.f, tracks[i].state()(2)); + } - // __host__ __device__ float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } + __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } - // __host__ __device__ float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } - // __host__ __device__ bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } + __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } -#define phi(tracks, i) (tracks[i].state()(0)) -#define tip(tracks, i) (tracks[i].state()(1)) -#define charge(tracks, i) (tracks[i].state()(2)) -#define zip(tracks, i) (tracks[i].state()(4)) -#define isTriplet(tracks, i) (tracks[i].nLayers() == 3) + __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } template __host__ __device__ inline void copyFromCircle( From b3b82bf22c2c3f7b7b83afed27e0c9ed4a2990ab Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 17 Oct 2022 15:25:49 +0200 Subject: [PATCH 039/110] Added Breno's tests --- .../Track/test/TrackSoAHeterogeneous_test.cpp | 13 ++++++-- .../Track/test/TrackSoAHeterogeneous_test.cu | 30 +++++++++++-------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 244d2a35f94d4..dc525e2259b5c 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -5,7 +5,9 @@ #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" namespace testTrackSoAHeterogeneousT { - void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize); + + void 
runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize); + } int main() { @@ -17,9 +19,16 @@ int main() { const auto soaSize = 256; // inner scope to deallocate memory before destroying the stream { - // pixelTrack::TrackSoA tracks; TrackSoAHeterogeneousT tracks(stream); + auto ret = cms::cuda::make_host_unique(tracks.bufferSize(), stream); testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); + cudaCheck(cudaMemcpy(ret.get(), tracks.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(soaSize),cudaMemcpyDeviceToHost)); + TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(),soaSize); + TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout); + std::cout << "pt" << "\t" << "eta" << "\t" <<"chi2" << "\t" << "quality" << "\t" << "nLayers" << std::endl; + for(int i = 0; i < soaSize; ++i){ + std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << std::endl; + } } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 162fd2448ea6e..9c6085dd824a7 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -1,31 +1,37 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + namespace testTrackSoAHeterogeneousT { - __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { + __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { + //assert(tracks); + int i = threadIdx.x; if (i >= soaSize) return; tracks[i].pt() = (float)i; + tracks[i].eta() = (float)i; + tracks[i].chi2() = (float)i; + tracks[i].quality() = (uint8_t)i; + tracks[i].nLayers() = i % 128; } - __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int 
soaSize) { + __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) { + //assert(tracks); + int i = threadIdx.x; if (i >= soaSize) return; - assert(tracks[i].pt() == (float)i); + assert(abs(tracks[i].pt() - (float)i) < .0001); + assert(abs(tracks[i].eta() - (float)i) < .0001); + assert(abs(tracks[i].chi2() - (float)i) < .0001); + assert(tracks[i].quality() == i); + assert(tracks[i].nLayers() == i % 128); } - void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { + void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { + //assert(tracks); fill<<<1, 1024>>>(tracks, soaSize); - cudaError_t cudaerr = cudaDeviceSynchronize(); - if (cudaerr != cudaSuccess) - printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); - verify<<<1, 1024>>>(tracks, soaSize); - cudaerr = cudaDeviceSynchronize(); - if (cudaerr != cudaSuccess) - printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); } } // namespace testTrackSoAHeterogeneousT From 9a6ccb8c2646fa6e06cab6dabe65ae5413bbad96 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 17 Oct 2022 16:04:17 +0200 Subject: [PATCH 040/110] Fixed merge leftovers --- .../Track/interface/TrackSoAHeterogeneousT_test.h | 3 --- CUDADataFormats/Track/test/BuildFile.xml | 9 --------- 2 files changed, 12 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 5cb1836ee3325..9ce074c0a24d4 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -105,9 +105,6 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>(S, stream) {} - explicit TrackSoAHeterogeneousT(cudaStream_t stream) - : PortableDeviceCollection>(S, stream) {} - static constexpr int32_t stride() { return S; } using Quality = pixelTrack::Quality; diff --git 
a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index 1e3f6524e7232..e91df3fc785f7 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -4,15 +4,6 @@ - - - - - - - - - From 5f1068128aab8666c75cf80e6c4e323e9bc40f90 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Mon, 17 Oct 2022 16:04:21 +0200 Subject: [PATCH 041/110] Adding TrackSoAHeterogeneous_test merged with TrajectoryState --- .../interface/TrackSoAHeterogeneousT_test.h | 118 +++++++++++------- 1 file changed, 75 insertions(+), 43 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 7880bf728a91e..6d2623d715a3b 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -4,14 +4,11 @@ #include #include -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "DataFormats/SoATemplate/interface/SoALayout.h" - -//#include "DataFormats/Portable/interface/PortableCUDADeviceCollection.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" namespace pixelTrack { @@ -24,19 +21,88 @@ namespace pixelTrack { } } // namespace pixelTrack +using Vector5f = Eigen::Matrix; +using Vector15f = Eigen::Matrix; + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(uint8_t, quality), SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit SOA_COLUMN(int8_t, nLayers), SOA_COLUMN(float, eta), - SOA_COLUMN(float, pt)) -// TODO: maybe 
add stateAtBS + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks)) + +// Previous TrajectoryStateSoAT class methods +namespace pixelTrack { + namespace utilities { + using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; + using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } + + __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } + + __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } + + __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + + __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } + + template + __device__ inline void copyFromCircle( + TrackSoAView tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __device__ inline void copyFromDense(TrackSoAView tracks, V5 const &v, M5 const &cov, int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + __device__ inline void 
copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + } // namespace utilities +} // namespace pixelTrack template class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { public: // using cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; + explicit TrackSoAHeterogeneousT(cudaStream_t stream) : PortableDeviceCollection>(S, stream) {} @@ -49,30 +115,13 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection(view()[i].quality()); } - // constexpr Quality &quality(int32_t i) { return static_cast(view()[i].quality()); } // TODO: static did not work; using reinterpret_cast - constexpr Quality const *qualityData() const { return reinterpret_cast (view().quality()); } - constexpr Quality *qualityData() { return reinterpret_cast< Quality *>(view().quality()); } - - constexpr float pt(int32_t i) const { return view()[i].pt(); } - // constexpr float &pt(int32_t i) { return view()[i].pt(); } - - constexpr float eta(int32_t i) const { return view()[i].eta(); } - // constexpr float &eta(int32_t i) { return view()[i].eta(); } - - constexpr float chi2(int32_t i) const { return view()[i].chi2(); } - // constexpr float &chi2(int32_t i) { return view()[i].chi2(); } - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } + constexpr Quality const *qualityData() const { return reinterpret_cast(view().quality()); } + constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } constexpr int nHits(int i) const { return detIndices.size(i); } - constexpr bool isTriplet(int i) const { return view()[i].nLayers() == 3; } - constexpr int computeNumberOfLayers(int32_t i) const { // layers are in 
order and we assume tracks are either forward or backward auto pdet = detIndices.begin(i); @@ -87,24 +136,8 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection stateAtBS; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... - // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - HitContainer hitIndices; HitContainer detIndices; - -private: - int nTracks_; }; namespace pixelTrack { @@ -121,7 +154,6 @@ namespace pixelTrack { using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; - using TrajectoryState = TrajectoryStateSoAT; using HitContainer = TrackSoA::HitContainer; } // namespace pixelTrack From 1f07ee7a7e36d526ae4751fc156cf12028c061d8 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Mon, 17 Oct 2022 16:12:30 +0200 Subject: [PATCH 042/110] Adding successful kernel tests --- .../Track/test/TrackSoAHeterogeneous_test.cpp | 15 ++++++-- .../Track/test/TrackSoAHeterogeneous_test.cu | 34 +++++++++++-------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 244d2a35f94d4..8b1504a4a60e7 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -1,11 +1,13 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" namespace testTrackSoAHeterogeneousT { - void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize); + + void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize); + } int main() { @@ -17,9 +19,16 @@ int main() { const auto soaSize = 256; // inner scope to deallocate memory before destroying the stream { - // pixelTrack::TrackSoA tracks; TrackSoAHeterogeneousT tracks(stream); + auto ret = cms::cuda::make_host_unique(tracks.bufferSize(), stream); testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); + cudaCheck(cudaMemcpy(ret.get(), tracks.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(soaSize),cudaMemcpyDeviceToHost)); + TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(),soaSize); + TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout); + std::cout << "pt" << "\t" << "eta" << "\t" <<"chi2" << "\t" << "quality" << "\t" << "nLayers" << std::endl; + for(int i = 0; i < soaSize; ++i){ + std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << std::endl; + } } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 162fd2448ea6e..912ed44dba3ed 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -1,31 +1,37 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h" + namespace testTrackSoAHeterogeneousT { - __global__ void fill(pixelTrack::TrackSoAView tracks, unsigned 
int soaSize) { + __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { + //assert(tracks); + int i = threadIdx.x; if (i >= soaSize) return; tracks[i].pt() = (float)i; + tracks[i].eta() = (float)i; + tracks[i].chi2() = (float)i; + tracks[i].quality() = (uint8_t)i; + tracks[i].nLayers() = i%128; } - __global__ void verify(pixelTrack::TrackSoAConstView tracks, unsigned int soaSize) { + __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) { + //assert(tracks); + int i = threadIdx.x; if (i >= soaSize) return; - assert(tracks[i].pt() == (float)i); + assert(abs(tracks[i].pt() - (float)i) < .0001); + assert(abs(tracks[i].eta() - (float)i) < .0001); + assert(abs(tracks[i].chi2() - (float)i) < .0001); + assert(tracks[i].quality() == i); + assert(tracks[i].nLayers() == i%128); } - void runKernels(pixelTrack::TrackSoAView tracks, unsigned int soaSize) { + void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { + //assert(tracks); fill<<<1, 1024>>>(tracks, soaSize); - cudaError_t cudaerr = cudaDeviceSynchronize(); - if (cudaerr != cudaSuccess) - printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); - verify<<<1, 1024>>>(tracks, soaSize); - cudaerr = cudaDeviceSynchronize(); - if (cudaerr != cudaSuccess) - printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr)); } -} // namespace testTrackSoAHeterogeneousT +} // namespace testTrackingRecHit2D From 2899c8c8aef7f0dd885ca83160fa846102e736a8 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Mon, 17 Oct 2022 16:15:19 +0200 Subject: [PATCH 043/110] Fixing header name in kernel tests --- CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp | 2 +- CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 8b1504a4a60e7..dc525e2259b5c 
100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 912ed44dba3ed..9d26d2497e0b1 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test2.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" namespace testTrackSoAHeterogeneousT { From f8239610c93e287e2973877c1ee98e955f0b7009 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 18 Oct 2022 15:08:59 +0200 Subject: [PATCH 044/110] FINALLY TESTS WORK(?) 
--- .../Track/test/TrackSoAHeterogeneous_test.cpp | 58 ++++++++++++++----- .../Track/test/TrackSoAHeterogeneous_test.cu | 52 +++++++++-------- 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index dc525e2259b5c..431dbd4577297 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -1,13 +1,12 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" namespace testTrackSoAHeterogeneousT { - void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize); - + void runKernels(pixelTrack::TrackSoA *tracks, pixelTrack::TrackSoAView tracks_view); } int main() { @@ -16,19 +15,52 @@ int main() { cudaStream_t stream; cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - const auto soaSize = 256; // inner scope to deallocate memory before destroying the stream { - TrackSoAHeterogeneousT tracks(stream); - auto ret = cms::cuda::make_host_unique(tracks.bufferSize(), stream); - testTrackSoAHeterogeneousT::runKernels(tracks.view(), soaSize); - cudaCheck(cudaMemcpy(ret.get(), tracks.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(soaSize),cudaMemcpyDeviceToHost)); - TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(),soaSize); + // Instantiate tracks on host. Portabledevicecollection allocates + // SoA on device automatically. 
+ int dev = cms::cuda::currentDevice(); + pixelTrack::TrackSoA tracks_h(stream); + + // Make a copy of tracks_h to device, so that we can + // modify hitIndices. + void *mem = cms::cuda::allocate_device(dev, sizeof(pixelTrack::TrackSoA), stream); + cudaCheck(cudaMemcpy(mem, &tracks_h, sizeof(pixelTrack::TrackSoA), cudaMemcpyHostToDevice)); + + // Run the tests + pixelTrack::TrackSoA *tracks_d = reinterpret_cast(mem); + testTrackSoAHeterogeneousT::runKernels(tracks_d, tracks_h.view()); + + // Copy SoA data back to host + auto ret = cms::cuda::make_host_unique(tracks_h.bufferSize(), stream); + cudaCheck(cudaMemcpy(ret.get(), + tracks_h.buffer().get(), + TrackSoAHeterogeneousT_test<>::computeDataSize(tracks_h.stride()), + cudaMemcpyDeviceToHost)); + + // Copy tracks_d back to tracks_h + cudaCheck(cudaMemcpy(&tracks_h, mem, sizeof(pixelTrack::TrackSoA), cudaMemcpyDeviceToHost)); + + // Create a view to access the copied data + TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(), tracks_h.stride()); TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout); - std::cout << "pt" << "\t" << "eta" << "\t" <<"chi2" << "\t" << "quality" << "\t" << "nLayers" << std::endl; - for(int i = 0; i < soaSize; ++i){ - std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << std::endl; + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + for (int i = 0; i < tracks_h.stride(); ++i) { + std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" + << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t" + << tracks_h.hitIndices.off[i] << std::endl; } + cudaCheck(cudaFree(mem)); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu 
b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 3f95e5ed5fe3f..80b8e1c7ce140 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -1,37 +1,43 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" namespace testTrackSoAHeterogeneousT { - __global__ void fill(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { - //assert(tracks); + __global__ void fill(pixelTrack::TrackSoA* __restrict__ tracks, pixelTrack::TrackSoAView tracks_view) { + assert(tracks); int i = threadIdx.x; - if (i >= soaSize) - return; - tracks[i].pt() = (float)i; - tracks[i].eta() = (float)i; - tracks[i].chi2() = (float)i; - tracks[i].quality() = (uint8_t)i; - tracks[i].nLayers() = i % 128; + for (int j = i; j < tracks->stride(); j += blockDim.x) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + tracks_view[j].quality() = (uint8_t)j % 256; + tracks_view[j].nLayers() = j % 128; + tracks->hitIndices.off[j] = j; + } } - __global__ void verify(pixelTrack::TrackSoAConstView tracks, uint32_t soaSize) { - //assert(tracks); + __global__ void verify(pixelTrack::TrackSoA* const __restrict__ tracks, pixelTrack::TrackSoAConstView tracks_view) { + assert(tracks); int i = threadIdx.x; - if (i >= soaSize) - return; - assert(abs(tracks[i].pt() - (float)i) < .0001); - assert(abs(tracks[i].eta() - (float)i) < .0001); - assert(abs(tracks[i].chi2() - (float)i) < .0001); - assert(tracks[i].quality() == i); - assert(tracks[i].nLayers() == i % 128); + if (i == 0) { + printf("Stride: %d, block dims: %d\n", tracks->stride(), blockDim.x); + } + for (int j = i; j < tracks->stride(); j += blockDim.x) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + 
assert(tracks_view[j].quality() == j % 256); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks->hitIndices.off[j] == j); + } } - void runKernels(pixelTrack::TrackSoAView tracks, uint32_t soaSize) { - //assert(tracks); - fill<<<1, 1024>>>(tracks, soaSize); - verify<<<1, 1024>>>(tracks, soaSize); + void runKernels(pixelTrack::TrackSoA* tracks, pixelTrack::TrackSoAView tracks_view) { + assert(tracks); + fill<<<1, 1024>>>(tracks, tracks_view); + verify<<<1, 1024>>>(tracks, tracks_view); } -} // namespace testTrackingRecHit2D +} // namespace testTrackSoAHeterogeneousT From e4db5066a690c9f9d3b59c18639304e6c79fd59b Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Tue, 18 Oct 2022 18:12:27 +0200 Subject: [PATCH 045/110] Changing dataFormats in RecoPixelVertexing --- .../plugins/PixelTrackSoAFromCUDA.cc | 65 ++++++++++++++----- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 15 +++-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 23 ++++--- .../plugins/CAHitNtupletGeneratorOnGPU.h | 4 +- 4 files changed, 75 insertions(+), 32 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 57df7ae63b7a0..e43f6b028aa15 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -2,7 +2,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -32,15 +33,27 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer 
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + //edm::EDGetTokenT> tokenCUDA_; + //edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + //edm::EDGetTokenT>> tokenCUDA_; + edm::EDGetTokenT> tokenCUDA_; + //edm::EDPutTokenT::View> tokenSOA_; + edm::EDPutTokenT tokenSOA_; + + //cms::cuda::host::unique_ptr soa_; + //cms::cuda::host::unique_ptr soa_; + //TrackSoAHeterogeneousT_test<>::View soa_; + pixelTrack::TrackSoA soa_; + pixelTrack::TrackSoAView tmp_view_; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + //: tokenCUDA_(consumes>(iConfig.getParameter("src"))), + // tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + //tokenSOA_(produces::View>()) {} + tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -49,7 +62,7 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des descriptions.add("pixelTracksSoA", desc); } -void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, +/*void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); @@ -57,13 +70,33 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, auto const& inputData = ctx.get(inputDataWrapped); soa_ = inputData.toHostAsync(ctx.stream()); +}*/ + +void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire 
ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + //class_ = inputData.toHostAsync(ctx.stream()); + + pixelTrack::TrackSoA soa_(ctx.stream()); + cudaCheck(cudaMemcpy(&soa_,&inputData,sizeof(pixelTrack::TrackSoA),cudaMemcpyDeviceToHost)); + + auto retView = cms::cuda::make_host_unique(inputData.bufferSize(), ctx.stream()); + cudaCheck(cudaMemcpy(retView.get(),inputData.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(32768),cudaMemcpyDeviceToHost)); + TrackSoAHeterogeneousT_test<> tmp_layout(retView.get(),32768); + TrackSoAHeterogeneousT_test<>::View tmp_view_(tmp_layout); + } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa_; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.view().nTracks(); + //auto tsoa = soa_; + //auto maxTracks = tsoa.stride(); + auto maxTracks = 32768; + auto nTracks = tmp_view_.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 @@ -71,13 +104,13 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i } #ifdef PIXEL_DEBUG_PRODUCE - std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; - std::cout << "found " << nTracks << " tracks in cpu SoA at " << &tsoa << std::endl; + std::cout << "size of SoA " << sizeof(soa_) << " stride " << maxTracks << std::endl; + std::cout << "found " << nTracks << " tracks in cpu SoA at " << &soa_ << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = soa_.nHits(it); + assert(nHits == int(soa_.hitIndices.size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... 
nt++; @@ -86,9 +119,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i #endif // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(soa_)); + iEvent.emplace(tokenSOA_, std::move(soa_));//, std::move(ret)); // view - assert(!soa_); + //assert(!soa_); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 72c482c6189db..c9831afc01067 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -20,7 +20,8 @@ #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::global::EDProducer<> { @@ -40,9 +41,11 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { edm::ESGetToken tokenField_; edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + //edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + //edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; @@ -52,10 +55,12 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + //tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + //tokenTrackCPU_ = produces(); + 
tokenTrackCPU_ = produces(); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 4a5689f572e47..6c5fdb36a9d46 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -20,6 +20,8 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" + #include "CAHitNtupletGeneratorOnGPU.h" namespace { @@ -184,13 +186,15 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, +/*PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); - - auto* soa = tracks.get(); - assert(soa); + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ + pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const { + pixelTrack::TrackSoA tracks(stream); + auto* soa = &tracks; CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.setCounters(m_counters); @@ -217,11 +221,12 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH return tracks; } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); +pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { + //PixelTrackHeterogeneous tracks(std::make_unique()); + pixelTrack::TrackSoA tracks; - auto* soa = tracks.get(); 
- assert(soa); + auto* soa = &tracks; + //assert(soa); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 36212298aac2f..ff13d09c1361a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,9 +47,9 @@ class CAHitNtupletGeneratorOnGPU { void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + pixelTrack::TrackSoA makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + pixelTrack::TrackSoA makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; From ba895c004bdf3153ca02d2e2b584fe44873211fc Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 19 Oct 2022 18:17:38 +0200 Subject: [PATCH 046/110] HitContainer added as SOA_SCALAR --- .../interface/TrackSoAHeterogeneousT_test.h | 27 +++++++++++-------- .../Track/test/TrackSoAHeterogeneous_test.cpp | 3 ++- .../Track/test/TrackSoAHeterogeneous_test.cu | 2 ++ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index cfb895f0c40b6..998f63e608244 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -1,6 +1,7 @@ #ifndef CUDADataFormats_Track_TrackHeterogeneousT_H #define CUDADataFormats_Track_TrackHeterogeneousT_H +#include #include #include @@ -19,6 +20,15 @@ namespace pixelTrack { 
auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; return static_cast(qp); } + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + } // namespace pixelTrack using Vector5f = Eigen::Matrix; @@ -26,6 +36,7 @@ using Vector15f = Eigen::Matrix; using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; +using HitContainer = cms::cuda::OneToManyAssoc; GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(uint8_t, quality), @@ -35,7 +46,9 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(float, pt), SOA_EIGEN_COLUMN(Vector5f, state), SOA_EIGEN_COLUMN(Vector15f, covariance), - SOA_SCALAR(int, nTracks)) + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) // Previous TrajectoryStateSoAT class methods namespace pixelTrack { @@ -110,7 +123,7 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection; + // using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... 
@@ -142,19 +155,11 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection; using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; - using HitContainer = TrackSoA::HitContainer; + // using HitContainer = TrackSoA::HitContainer; } // namespace pixelTrack diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 431dbd4577297..d40ec10af2fa4 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -58,8 +58,9 @@ int main() { for (int i = 0; i < tracks_h.stride(); ++i) { std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t" - << tracks_h.hitIndices.off[i] << std::endl; + << tmp_view.hitIndices().off[i] << std::endl; } + cudaCheck(cudaFree(mem)); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 80b8e1c7ce140..3ca5fc4994257 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -13,6 +13,7 @@ namespace testTrackSoAHeterogeneousT { tracks_view[j].chi2() = (float)j; tracks_view[j].quality() = (uint8_t)j % 256; tracks_view[j].nLayers() = j % 128; + tracks_view.hitIndices().off[j] = j; tracks->hitIndices.off[j] = j; } } @@ -30,6 +31,7 @@ namespace testTrackSoAHeterogeneousT { assert(abs(tracks_view[j].chi2() - (float)j) < .0001); assert(tracks_view[j].quality() == j % 256); assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == j); assert(tracks->hitIndices.off[j] == j); } } From 8d927933354b07c8eac74355a285bdfd82f18793 Mon Sep 17 00:00:00 2001 From: Breno 
Orzari Date: Wed, 19 Oct 2022 18:36:07 +0200 Subject: [PATCH 047/110] Changing objects type in plugins files to pixelTrack::TrackSoA --- .../plugins/PixelTrackDumpCUDA.cc | 22 ++++++++++++------- .../plugins/PixelTrackProducerFromSoA.cc | 13 ++++++----- .../plugins/CAHitNtupletGeneratorKernels.h | 3 ++- .../plugins/CAHitNtupletGeneratorOnGPU.h | 3 ++- .../PixelTriplets/plugins/GPUCACell.h | 3 ++- .../PixelTriplets/plugins/HelixFitOnGPU.h | 3 ++- .../plugins/PixelVertexProducerCUDA.cc | 20 +++++++++++------ .../plugins/gpuVertexFinder.cc | 21 ++++++++++-------- .../plugins/gpuVertexFinder.h | 4 +++- 9 files changed, 58 insertions(+), 34 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index f3d6022e21654..59489c8e11f5f 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,7 +1,8 @@ #include #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" @@ -30,9 +31,11 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; + //edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUTrack_; edm::EDGetTokenT> tokenGPUVertex_; - edm::EDGetTokenT tokenSoATrack_; + //edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; edm::EDGetTokenT tokenSoAVertex_; }; 
@@ -40,11 +43,13 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); + //consumes>(iConfig.getParameter("pixelTrackSrc")); + consumes>(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = consumes>(iConfig.getParameter("pixelVertexSrc")); } else { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + //tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -66,7 +71,8 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - auto const* tsoa = tracks.get(); + //auto const* tsoa = tracks.get(); + auto const* tsoa = &tracks; assert(tsoa); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); @@ -74,8 +80,8 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, assert(vsoa); } else { - auto const* tsoa = iEvent.get(tokenSoATrack_).get(); - assert(tsoa); + auto const& tsoa = iEvent.get(tokenSoATrack_); + assert(tsoa.buffer()); auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); assert(vsoa); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index e6d49cde90d6a..212d2571c09c7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,7 +27,8 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include 
"CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "storeTracks.h" @@ -35,7 +36,7 @@ /** * This class creates "leagcy" reco::Track - * objects from the output of SoA CA. + * objects from the output of SoA CA. */ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { public: @@ -54,7 +55,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + //const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -67,7 +69,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + //tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -152,7 +155,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); + const auto &tsoa = iEvent.get(tokenTrack_); auto const *quality = tsoa.qualityData(); // auto const &fit = tsoa.stateAtBS; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index fcab52e96d210..372c7ccd3f96c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,7 +3,8 @@ // #define 
GPU_DEBUG -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "GPUCACell.h" // #define DUMP_GPU_TK_TUPLES diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ff13d09c1361a..ad64ae19037a3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -3,7 +3,8 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 4ec7069ac8e1b..a0c3930d1a739 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -14,7 +14,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "CAConstants.h" class GPUCACell { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 67a180c53e887..7a356cf2d7dea 100644 
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,7 +1,8 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 34b0ed9e29fc1..f240e77727293 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -16,6 +16,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "gpuVertexFinder.h" @@ -35,9 +36,11 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; + //edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUTrack_; edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; + //edm::EDGetTokenT tokenCPUTrack_; + edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; const gpuVertexFinder::Producer gpuAlgo_; @@ -62,10 +65,12 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); + //consumes>(conf.getParameter("pixelTrackSrc")); + 
consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + //tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); tokenCPUVertex_ = produces(); } } @@ -97,11 +102,12 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + //edm::Handle> hTracks; + edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); + auto const* tracks = &ctx.get(*hTracks); assert(tracks); @@ -111,7 +117,7 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); + auto const* tracks = &iEvent.get(tokenCPUTrack_); assert(tracks); #ifdef PIXVERTEX_DEBUG_PRODUCE diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..2fbd44147db33 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -7,6 +7,8 @@ #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" + #undef PIXVERTEX_DEBUG_PRODUCE namespace gpuVertexFinder { @@ -17,28 +19,29 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; + using TkSoAConstView = pixelTrack::TrackSoAConstView; - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* 
pws, float ptMin, float ptMax) { + __global__ void loadTracks(TkSoA const* ptracks, TkSoAConstView ptracksView, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { assert(ptracks); assert(soa); auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; + //auto const& fit = tracks.stateAtBS; auto const* quality = tracks.qualityData(); auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + for (int idx = first, nt = ptracksView.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = tracks.nHits(idx); assert(nHits >= 3); // initialize soa... soa->idv[idx] = -1; - if (tracks.isTriplet(idx)) + if (pixelTrack::utilities::isTriplet(ptracksView,idx)) continue; // no triplets if (quality[idx] < pixelTrack::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = ptracksView[idx].pt(); if (pt < ptMin) continue; @@ -49,8 +52,8 @@ namespace gpuVertexFinder { auto& data = *pws; auto it = atomicAdd(&data.ntrks, 1); data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); + data.zt[it] = pixelTrack::utilities::zip(ptracksView,idx); + data.ezt2[it] = ptracksView[idx].covariance()(14); data.ptt2[it] = pt * pt; } } @@ -121,11 +124,11 @@ namespace gpuVertexFinder { init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + loadTracks<<>>(tksoa, tksoa->view(), soa, ws_d.get(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + loadTracks(tksoa, tksoa->view(), soa, ws_d.get(), ptMin, ptMax); #endif #ifdef __CUDACC__ diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 2b6a8107d927f..66b70409b58fd 100644 --- 
a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -5,7 +5,8 @@ #include #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" namespace gpuVertexFinder { @@ -43,6 +44,7 @@ namespace gpuVertexFinder { using ZVertices = ZVertexSoA; using WorkSpace = gpuVertexFinder::WorkSpace; using TkSoA = pixelTrack::TrackSoA; + using TkSoAConstView = pixelTrack::TrackSoAConstView; Producer(bool oneKernel, bool useDensity, From d1a14c2d79491ec68e302777da9238a3db60aff7 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 19 Oct 2022 19:52:33 +0200 Subject: [PATCH 048/110] Refactored Track class, updated test --- .../interface/TrackSoAHeterogeneousT_test.h | 39 +++++++------------ .../Track/test/TrackSoAHeterogeneous_test.cpp | 10 ++--- .../Track/test/TrackSoAHeterogeneous_test.cu | 30 +++++++------- 3 files changed, 35 insertions(+), 44 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 998f63e608244..298bd276390b0 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -107,13 +107,27 @@ namespace pixelTrack { cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); } } + + __host__ __device__ inline int computeNumberOfLayers(TrackSoAConstView tracks, int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = phase1PixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = phase1PixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + + __host__ __device__ inline int 
nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); } } // namespace utilities } // namespace pixelTrack template class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { public: - // using cms::cuda::PortableDeviceCollection>::PortableDeviceCollection; TrackSoAHeterogeneousT() = default; explicit TrackSoAHeterogeneousT(cudaStream_t stream) @@ -123,34 +137,13 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... private: public: - // TODO: static did not work; using reinterpret_cast constexpr Quality const *qualityData() const { return reinterpret_cast(view().quality()); } constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = phase1PixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = phase1PixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - HitContainer hitIndices; - HitContainer detIndices; }; namespace pixelTrack { @@ -159,8 +152,6 @@ namespace pixelTrack { using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; - // using HitContainer = TrackSoA::HitContainer; - } // namespace pixelTrack #endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index d40ec10af2fa4..34704f16c1840 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -1,3 +1,4 @@ +#include 
#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -6,7 +7,7 @@ namespace testTrackSoAHeterogeneousT { - void runKernels(pixelTrack::TrackSoA *tracks, pixelTrack::TrackSoAView tracks_view); + void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize); } int main() { @@ -28,8 +29,7 @@ int main() { cudaCheck(cudaMemcpy(mem, &tracks_h, sizeof(pixelTrack::TrackSoA), cudaMemcpyHostToDevice)); // Run the tests - pixelTrack::TrackSoA *tracks_d = reinterpret_cast(mem); - testTrackSoAHeterogeneousT::runKernels(tracks_d, tracks_h.view()); + testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), tracks_h->metadata().size()); // Copy SoA data back to host auto ret = cms::cuda::make_host_unique(tracks_h.bufferSize(), stream); @@ -38,7 +38,6 @@ int main() { TrackSoAHeterogeneousT_test<>::computeDataSize(tracks_h.stride()), cudaMemcpyDeviceToHost)); - // Copy tracks_d back to tracks_h cudaCheck(cudaMemcpy(&tracks_h, mem, sizeof(pixelTrack::TrackSoA), cudaMemcpyDeviceToHost)); // Create a view to access the copied data @@ -55,7 +54,8 @@ int main() { << "nLayers" << "\t" << "hitIndices off" << std::endl; - for (int i = 0; i < tracks_h.stride(); ++i) { + // for (int i = 0; i < tracks_h.stride(); ++i) { + for (int i = 0; i < 10; ++i) { std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t" << tmp_view.hitIndices().off[i] << std::endl; diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 3ca5fc4994257..38c7ab61eeece 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -3,43 +3,43 @@ namespace testTrackSoAHeterogeneousT { - __global__ 
void fill(pixelTrack::TrackSoA* __restrict__ tracks, pixelTrack::TrackSoAView tracks_view) { - assert(tracks); - + __global__ void fill(pixelTrack::TrackSoAView tracks_view) { int i = threadIdx.x; - for (int j = i; j < tracks->stride(); j += blockDim.x) { + if (i == 0) { + tracks_view.nTracks() = 420; + } + + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { tracks_view[j].pt() = (float)j; tracks_view[j].eta() = (float)j; tracks_view[j].chi2() = (float)j; tracks_view[j].quality() = (uint8_t)j % 256; tracks_view[j].nLayers() = j % 128; tracks_view.hitIndices().off[j] = j; - tracks->hitIndices.off[j] = j; } } - __global__ void verify(pixelTrack::TrackSoA* const __restrict__ tracks, pixelTrack::TrackSoAConstView tracks_view) { - assert(tracks); - + __global__ void verify(pixelTrack::TrackSoAView tracks_view) { int i = threadIdx.x; + if (i == 0) { - printf("Stride: %d, block dims: %d\n", tracks->stride(), blockDim.x); + printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x); + assert(tracks_view.nTracks() == 420); } - for (int j = i; j < tracks->stride(); j += blockDim.x) { + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { assert(abs(tracks_view[j].pt() - (float)j) < .0001); assert(abs(tracks_view[j].eta() - (float)j) < .0001); assert(abs(tracks_view[j].chi2() - (float)j) < .0001); assert(tracks_view[j].quality() == j % 256); assert(tracks_view[j].nLayers() == j % 128); assert(tracks_view.hitIndices().off[j] == j); - assert(tracks->hitIndices.off[j] == j); } } - void runKernels(pixelTrack::TrackSoA* tracks, pixelTrack::TrackSoAView tracks_view) { - assert(tracks); - fill<<<1, 1024>>>(tracks, tracks_view); - verify<<<1, 1024>>>(tracks, tracks_view); + void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize) { + fill<<<1, 1024>>>(tracks_view); + cudaDeviceSynchronize(); + verify<<<1, 1024>>>(tracks_view); } } // namespace testTrackSoAHeterogeneousT From 
4739777264d8137ee5031c5e6d1d7b19e605d463 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 11:22:03 +0200 Subject: [PATCH 049/110] Cleanup test for TrackSoA --- .../Track/test/TrackSoAHeterogeneous_test.cpp | 33 +++++++------------ .../Track/test/TrackSoAHeterogeneous_test.cu | 2 +- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 34704f16c1840..28dbd0c9b029b 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -7,7 +7,7 @@ namespace testTrackSoAHeterogeneousT { - void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize); + void runKernels(pixelTrack::TrackSoAView tracks_view); } int main() { @@ -16,33 +16,26 @@ int main() { cudaStream_t stream; cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - // inner scope to deallocate memory before destroying the stream + // Inner scope to deallocate memory before destroying the stream { // Instantiate tracks on host. Portabledevicecollection allocates // SoA on device automatically. - int dev = cms::cuda::currentDevice(); - pixelTrack::TrackSoA tracks_h(stream); - - // Make a copy of tracks_h to device, so that we can - // modify hitIndices. 
- void *mem = cms::cuda::allocate_device(dev, sizeof(pixelTrack::TrackSoA), stream); - cudaCheck(cudaMemcpy(mem, &tracks_h, sizeof(pixelTrack::TrackSoA), cudaMemcpyHostToDevice)); + pixelTrack::TrackSoA tracks(stream); + uint32_t soaSize = tracks.bufferSize(); // SoA Layout size (bytes) + uint32_t soaNumElements = tracks->metadata().size(); // Length of each SoA array in elements // Run the tests - testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), tracks_h->metadata().size()); + testTrackSoAHeterogeneousT::runKernels(tracks.view()); // Copy SoA data back to host - auto ret = cms::cuda::make_host_unique(tracks_h.bufferSize(), stream); - cudaCheck(cudaMemcpy(ret.get(), - tracks_h.buffer().get(), - TrackSoAHeterogeneousT_test<>::computeDataSize(tracks_h.stride()), - cudaMemcpyDeviceToHost)); - - cudaCheck(cudaMemcpy(&tracks_h, mem, sizeof(pixelTrack::TrackSoA), cudaMemcpyDeviceToHost)); + auto tracks_h_soa = cms::cuda::make_host_unique(soaSize, stream); + cudaCheck(cudaMemcpy(tracks_h_soa.get(), tracks.const_buffer().get(), soaSize, cudaMemcpyDeviceToHost)); // Create a view to access the copied data - TrackSoAHeterogeneousT_test<> tmp_layout(ret.get(), tracks_h.stride()); + TrackSoAHeterogeneousT_test<> tmp_layout(tracks_h_soa.get(), soaNumElements); TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout); + + // Print results std::cout << "pt" << "\t" << "eta" @@ -54,14 +47,12 @@ int main() { << "nLayers" << "\t" << "hitIndices off" << std::endl; - // for (int i = 0; i < tracks_h.stride(); ++i) { + for (int i = 0; i < 10; ++i) { std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t" << tmp_view.hitIndices().off[i] << std::endl; } - - cudaCheck(cudaFree(mem)); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 
38c7ab61eeece..4e3f7ee6c9388 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -36,7 +36,7 @@ namespace testTrackSoAHeterogeneousT { } } - void runKernels(pixelTrack::TrackSoAView tracks_view, uint32_t soaSize) { + void runKernels(pixelTrack::TrackSoAView tracks_view) { fill<<<1, 1024>>>(tracks_view); cudaDeviceSynchronize(); verify<<<1, 1024>>>(tracks_view); From c3599aeb170c648b8484c5950d6d40ded8e3594d Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 11:23:02 +0200 Subject: [PATCH 050/110] Added TODO --- CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 4e3f7ee6c9388..9c59d867629b2 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -19,6 +19,7 @@ namespace testTrackSoAHeterogeneousT { } } + // TODO: Using TrackSoAConstView fails to assert hitIndices correctly __global__ void verify(pixelTrack::TrackSoAView tracks_view) { int i = threadIdx.x; From f131e237bcf0e0c86c3cf0efa84b24d6be3133d5 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 11:25:44 +0200 Subject: [PATCH 051/110] Docstring for test --- .../Track/test/TrackSoAHeterogeneous_test.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 28dbd0c9b029b..f473fd2023b8f 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -1,3 +1,15 @@ +/** + Simple test of the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. 
+ + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + */ + #include #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" From 0e110223163d718e17a81bd3d068be79d195ffbc Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 11:27:20 +0200 Subject: [PATCH 052/110] More details in docstring --- CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index f473fd2023b8f..9bfd445bd786c 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -1,5 +1,5 @@ /** - Simple test of the pixelTrack::TrackSoA data structure + Simple test for the pixelTrack::TrackSoA data structure which inherits from PortableDeviceCollection. Creates an instance of the class (automatically allocates @@ -7,7 +7,10 @@ the CUDA kernels which: - Fill the SoA with data. - Verify that the data written is correct. - + + Then, the SoA data are copied back to Host, where + a temporary host-side view (tmp_view) is created using + the same Layout to access the data on host and print it. 
*/ #include From ffd6f3ef202bc1f9a71b18bf9f5f47b22f844f9e Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 11:38:15 +0200 Subject: [PATCH 053/110] Multiplicity kernels --- .../plugins/CAHitNtupletGeneratorKernels.cu | 8 ++++---- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 15 ++++++--------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 168ba3b0c8144..c5f3ff3a1b649 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -4,8 +4,8 @@ template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; + auto *tuples_d = &tracks_d->hitIndices(); + auto *detId_d = &tracks_d->detIndices(); auto *quality_d = tracks_d->qualityData(); // zero tuples @@ -102,8 +102,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); + kernel_countMultiplicity<<>>(tracks_d->view(), + device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity<<>>( tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index f38c042ed15c2..34079fe3c714b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -375,12 +375,11 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const * } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - TkSoAConstView tracks_view, +__global__ void kernel_countMultiplicity(TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) @@ -393,12 +392,10 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - TkSoAConstView tracks_view, - caConstants::TupleMultiplicity *tupleMultiplicity) { +__global__ void kernel_fillMultiplicity(TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) From d21b49f65b1a409c48d791567dd8e8435d64475f Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 12:12:18 +0200 Subject: [PATCH 054/110] Replacing hitIndices in several parts in PixelTriplets --- .../plugins/CAHitNtupletGeneratorKernels.cu | 22 ++++++------ 
.../CAHitNtupletGeneratorKernelsImpl.h | 36 ++++++++++--------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index c5f3ff3a1b649..abf4c28d4ee6f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -4,12 +4,12 @@ template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! - auto *tuples_d = &tracks_d->hitIndices(); - auto *detId_d = &tracks_d->detIndices(); + // auto *tuples_d = &tracks_d->hitIndices(); + auto *detId_d = tracks_d->view().detIndices(); auto *quality_d = tracks_d->qualityData(); // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(tracks_d->view().hitIndices(), cudaStream); int32_t nhits = hh.nHits(); @@ -70,9 +70,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tuples_d, + tracks_d->view(), device_hitTuple_apc_, - quality_d, params_.minHitsPerNtuplet_); cudaCheck(cudaGetLastError()); @@ -87,9 +86,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, + tracks_d->view().hitIndices()); - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + kernel_fillHitDetIndices<<>>( + tracks_d->view().hitIndices(), hh.view(), tracks_d->view().detIndices()); cudaCheck(cudaGetLastError()); kernel_fillNLayers<<>>(tracks_d, tracks_d->view(), device_hitTuple_apc_); cudaCheck(cudaGetLastError()); @@ -105,8 +106,8 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_countMultiplicity<<>>(tracks_d->view(), device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); + kernel_fillMultiplicity<<>>(tracks_d->view(), + device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -233,8 +234,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>( - tuples_d, tracks_d->view(), quality_d, params_.cuts_); + kernel_classifyTracks<<>>(tracks_d->view(), quality_d, params_.cuts_); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 34079fe3c714b..360aed23e90e9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -338,9 +338,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, - HitContainer *foundNtuplets, + pixelTrack::TrackSoaView tracks_view, cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen auto const &hh = *hhp; @@ -358,8 +357,15 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, if (doit) { GPUCACell::TmpTuple stack; stack.reset(); - thisCell.find_ntuplets<6>( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + thisCell.find_ntuplets<6>(hh, + cells, 
+ *cellTracks, + tracks_view.hitIndices(), + *apc, + tracks_view.qualityData(), + stack, + minHitsPerNtuplet, + pid < 3); assert(stack.empty()); // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } @@ -412,14 +418,13 @@ __global__ void kernel_fillMultiplicity(TkSoAConstView tracks_view, caConstants: Supply both the original TkSoA and the TkSoAView which contains the SoA Data */ -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoAView tracks_view, +__global__ void kernel_classifyTracks(TkSoAView tracks_view, Quality *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard @@ -440,7 +445,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks_view[it].chi2()); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif continue; } @@ -477,7 +482,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, #ifdef NTUPLE_FIT_DEBUG printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", it, - tuples->size(it), + tracks_view.hitIndices().size(it), tracks_view[it].pt(), tracks_view[it].eta(), tracks_view[it].chi2()); @@ -561,19 +566,16 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples /* Needs both TkSoA and TkSoAView for accessing SoA, computeNumberOfLayers(), nHits(), stride() */ -__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, - TkSoAView tracks_view, - cms::cuda::AtomicPairCounter *apc) { - auto 
&tracks = *ptracks; +__global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { auto first = blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + auto ntracks = std::min(apc->get().m, tracks_view.metadata().size() - 1); if (0 == first) tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + auto nHits = pixelTrack::nHits(tracks_view, idx); assert(nHits >= 3); - tracks_view[idx].nLayers() = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = pixelTrack::computeNumberOfLayers(tracks_view, idx); } } From 7eb507103b69b61bff4bd397a7e9b1b668623ef0 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 12:18:32 +0200 Subject: [PATCH 055/110] Classify, checkOverflows kernels --- .../plugins/CAHitNtupletGeneratorKernels.cu | 8 ++--- .../CAHitNtupletGeneratorKernelsImpl.h | 32 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index abf4c28d4ee6f..1bd491d53f509 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -225,7 +225,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr template <> void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
- auto const *tuples_d = &tracks_d->hitIndices; + // auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); int32_t nhits = hh.nHits(); @@ -259,13 +259,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>(tuples_d, device_hitToTuple_.get()); + kernel_countHitInTracks<<>>(tracks_d->view(), device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, device_hitToTuple_.get()); + kernel_fillHitInTracks<<>>(tracks_d->view(), device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -297,7 +297,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, + kernel_checkOverflows<<>>(tracks_d->view(), device_tupleMultiplicity_.get(), device_hitToTuple_.get(), device_hitTuple_apc_, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 360aed23e90e9..6833296cc89d5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -43,7 +43,7 @@ namespace { } // namespace -__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, +__global__ void kernel_checkOverflows(pixelTrack::TrackSoAView tracks_view, 
caConstants::TupleMultiplicity const *tupleMultiplicity, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -76,16 +76,16 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, nHits, hitToTuple->totOnes()); if (apc->get().m < caConstants::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + assert(tracks_view.hitIndices().size(apc->get().m) == 0); + assert(tracks_view.hitIndices().size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 7) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) > 7) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + assert(tracks_view.hitIndices().size(idx) <= caConstants::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) assert(int(*ih) < nHits); } #endif @@ -524,24 +524,24 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, +__global__ void kernel_countHitInTracks(pixelTrack::TrackSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if 
(tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->count(*h); } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, +__global__ void kernel_fillHitInTracks(pixelTrack::TrackSoAView tracks_view, // TODO: Make ConstView CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->fill(*h, idx); } } From 4794b6cc23803e81fcfe898a949264666f6ffcfc Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Thu, 20 Oct 2022 15:36:38 +0200 Subject: [PATCH 056/110] Confiscating everything from class --- .../interface/TrackSoAHeterogeneousT_test.h | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 298bd276390b0..a9b2bb987fcec 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -29,6 +29,8 @@ namespace pixelTrack { constexpr uint32_t maxNumber() { return 32 * 1024; } #endif + using HitContainer = cms::cuda::OneToManyAssoc; + } // namespace pixelTrack using Vector5f = Eigen::Matrix; @@ -36,7 +38,7 @@ using Vector15f = Eigen::Matrix; using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; -using 
HitContainer = cms::cuda::OneToManyAssoc; +using HitContainer = pixelTrack::HitContainer; GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_COLUMN(uint8_t, quality), @@ -55,6 +57,8 @@ namespace pixelTrack { namespace utilities { using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) { @@ -121,6 +125,9 @@ namespace pixelTrack { return nl; } + __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) { return reinterpret_cast(tracks.quality()); } + __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { return reinterpret_cast(tracks.quality()); } + __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); } } // namespace utilities } // namespace pixelTrack @@ -133,18 +140,7 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>(S, stream) {} - static constexpr int32_t stride() { return S; } - - using Quality = pixelTrack::Quality; - using hindex_type = uint32_t; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... 
-private: -public: - constexpr Quality const *qualityData() const { return reinterpret_cast(view().quality()); } - constexpr Quality *qualityData() { return reinterpret_cast(view().quality()); } -}; +} namespace pixelTrack { From 8de9dfe22d5f2acac5027ac7177b4ee5d69dbfc7 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 20 Oct 2022 17:10:32 +0200 Subject: [PATCH 057/110] Removed unused names --- CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index a9b2bb987fcec..7814cfd32ae4d 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -35,9 +35,6 @@ namespace pixelTrack { using Vector5f = Eigen::Matrix; using Vector15f = Eigen::Matrix; - -using Vector5d = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; using HitContainer = pixelTrack::HitContainer; GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, From 53f4a91b6a840455b3ca8a7f14885a2951a0f973 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 21 Oct 2022 10:59:29 +0200 Subject: [PATCH 058/110] Simplified View alias, fixed calls in KernelsImpl, comments --- .../interface/TrackSoAHeterogeneousT_test.h | 32 ++++++++++++------- .../plugins/PixelTrackProducerFromSoA.cc | 6 ++-- .../CAHitNtupletGeneratorKernelsImpl.h | 4 +-- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 7814cfd32ae4d..e5485588ecb8a 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -33,6 +33,8 @@ namespace pixelTrack { } // namespace pixelTrack +// Aliases in order to not confuse the GENERATE_SOA_LAYOUT +// macro with 
weird colons and angled brackets. using Vector5f = Eigen::Matrix; using Vector15f = Eigen::Matrix; using HitContainer = pixelTrack::HitContainer; @@ -49,11 +51,12 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, SOA_SCALAR(HitContainer, hitIndices), SOA_SCALAR(HitContainer, detIndices)) -// Previous TrajectoryStateSoAT class methods +// Previous TrajectoryStateSoAT class methods. +// They operate on View and ConstView of the TrackSoA. namespace pixelTrack { namespace utilities { - using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; - using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; + using TrackSoAView = TrackSoAHeterogeneousT_test<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView; using Quality = pixelTrack::Quality; using hindex_type = uint32_t; // State at the Beam spot @@ -109,7 +112,8 @@ namespace pixelTrack { } } - __host__ __device__ inline int computeNumberOfLayers(TrackSoAConstView tracks, int32_t i) { + // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer + __host__ __device__ inline int computeNumberOfLayers(TrackSoAView tracks, int32_t i) { auto pdet = tracks.detIndices().begin(i); int nl = 1; auto ol = phase1PixelTopology::getLayer(*pdet); @@ -121,11 +125,17 @@ namespace pixelTrack { } return nl; } + __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); } - __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) { return reinterpret_cast(tracks.quality()); } - __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { return reinterpret_cast(tracks.quality()); } + // Casts quality SoA data (uint8_t) to pixelTrack::Quality. 
This is required + // to use the data as an enum instead of a plain uint8_t + __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) { + return reinterpret_cast(tracks.quality()); + } + __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { + return reinterpret_cast(tracks.quality()); + } - __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); } } // namespace utilities } // namespace pixelTrack @@ -134,16 +144,16 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>(S, stream) {} - -} +}; namespace pixelTrack { using TrackSoA = TrackSoAHeterogeneousT; - using TrackSoAView = cms::cuda::PortableDeviceCollection>::View; - using TrackSoAConstView = cms::cuda::PortableDeviceCollection>::ConstView; + using TrackSoAView = TrackSoAHeterogeneousT_test<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView; } // namespace pixelTrack diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index e6d49cde90d6a..c5d31764b0fcb 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -154,9 +154,9 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, const auto &tsoa = *iEvent.get(tokenTrack_); - auto const *quality = tsoa.qualityData(); + auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); // auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.hitIndices; + auto const &hitIndices = tsoa.view().hitIndices(); auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -173,7 +173,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for 
(const auto &it : sortIdxs) { - auto nHits = tsoa.nHits(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 6833296cc89d5..cdf1ab2193be1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -338,7 +338,7 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, - pixelTrack::TrackSoaView tracks_view, + TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen @@ -575,7 +575,7 @@ __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairC for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = pixelTrack::nHits(tracks_view, idx); assert(nHits >= 3); - tracks_view[idx].nLayers() = pixelTrack::computeNumberOfLayers(tracks_view, idx); + tracks_view[idx].nLayers() = pixelTrack::utilities::computeNumberOfLayers(tracks_view, idx); } } From 2a16c84609773436654a7532e25a8197ac530cf5 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 21 Oct 2022 13:07:35 +0200 Subject: [PATCH 059/110] I DID IT --- .../plugins/CAHitNtupletGeneratorKernels.cc | 37 +++++++------- .../CAHitNtupletGeneratorKernelsImpl.h | 51 +++++++++---------- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 8 +-- 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index a34e0f280dd9d..80497d3dd706b 100644 --- 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -79,14 +79,14 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr template <> void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); + // auto *tuples_d = tracks_d->view().hitIndices(); + // auto *detId_d = tracks_d->view().detIndices(); + // auto *quality_d = tracks_d->qualityData(); // assert(tuples_d && quality_d); // TODO Find equivalent for View // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&tracks_d->view().hitIndices(), cudaStream); auto nhits = hh.nHits(); @@ -119,23 +119,22 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tuples_d, + tracks_d->view(), device_hitTuple_apc_, - quality_d, params_.minHitsPerNtuplet_); if (params_.doStats_) kernel_mark_used(device_theCells_.get(), device_nCells_); - cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_d->view().hitIndices()); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, tracks_d->view(), device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_d->view(), hh.view()); + kernel_fillNLayers(tracks_d->view(), device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); + kernel_countMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get()); 
cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, tracks_d->view(), device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -146,10 +145,10 @@ template <> void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { int32_t nhits = hh.nHits(); - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + // auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view()); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d->view(), quality_d, params_.cuts_); + kernel_classifyTracks(tracks_d->view(), quality_d, params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -161,9 +160,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_d->view(), device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_d->view(), device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) @@ -184,7 +183,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, + kernel_checkOverflows(tracks_d->view(), device_tupleMultiplicity_.get(), device_hitToTuple_.get(), device_hitTuple_apc_, @@ -202,7 +201,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // counters (add flag???) std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); + kernel_doStatsForTracks(tracks_d->view(), quality_d, counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -211,7 +210,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh.view(), tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index cdf1ab2193be1..a3a8eb97a43d7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -43,7 +43,7 @@ namespace { } // namespace -__global__ void kernel_checkOverflows(pixelTrack::TrackSoAView tracks_view, +__global__ void kernel_checkOverflows(TkSoAView tracks_view, caConstants::TupleMultiplicity const *tupleMultiplicity, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -362,7 +362,7 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, *cellTracks, tracks_view.hitIndices(), *apc, - tracks_view.qualityData(), + pixelTrack::utilities::qualityData(tracks_view), stack, minHitsPerNtuplet, pid < 3); @@ -508,12 +508,12 @@ __global__ void kernel_classifyTracks(TkSoAView tracks_view, } } -__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, +__global__ void kernel_doStatsForTracks(TkSoAView tracks_view, Quality const *__restrict__ quality, 
CAHitNtupletGeneratorKernelsGPU::Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; //guard if (quality[idx] < pixelTrack::Quality::loose) continue; @@ -524,7 +524,7 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(pixelTrack::TrackSoAView tracks_view, +__global__ void kernel_countHitInTracks(TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -535,7 +535,7 @@ __global__ void kernel_countHitInTracks(pixelTrack::TrackSoAView tracks_view, } } -__global__ void kernel_fillHitInTracks(pixelTrack::TrackSoAView tracks_view, // TODO: Make ConstView +__global__ void kernel_fillHitInTracks(TkSoAView tracks_view, // TODO: Make ConstView CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -546,20 +546,18 @@ __global__ void kernel_fillHitInTracks(pixelTrack::TrackSoAView tracks_view, // } } -__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { +__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, TrackingRecHit2DSOAView const *__restrict__ hhp) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - 
hitDetIndices->off[idx] = tuples->off[idx]; + for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices auto const &hh = *hhp; auto nhits = hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tracks_view.hitIndices().content[idx] < nhits); + tracks_view.detIndices().content[idx] = hh.detectorIndex(tracks_view.hitIndices().content[idx]); } } @@ -573,7 +571,7 @@ __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairC if (0 == first) tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = pixelTrack::nHits(tracks_view, idx); + auto nHits = pixelTrack::utilities::nHits(tracks_view, idx); assert(nHits >= 3); tracks_view[idx].nLayers() = pixelTrack::utilities::computeNumberOfLayers(tracks_view, idx); } @@ -859,7 +857,6 @@ __global__ void kernel_simpleTripletCleaner( } __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, TkSoAConstView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, @@ -867,11 +864,11 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res int iev) { constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; + // auto const &foundNtuplets = *ptuples; int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x 
* gridDim.x) { - auto nh = foundNtuplets.size(i); + for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; i += blockDim.x * gridDim.x) { + auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; if (tracks_view[i].quality() < loose) @@ -889,13 +886,13 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res pixelTrack::utilities::zip(tracks_view, i), // asinhf(fit_results[i].par(3)), tracks_view[i].chi2(), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + hh.zGlobal(*tracks_view.hitIndices().begin(i)), + hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 1)), + hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 2)), + nh > 3 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 3))) : 0, + nh > 4 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 4))) : 0, + nh > 5 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 5))) : 0, + nh > 6 ? 
hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + nh - 1))) : 0); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 6c5fdb36a9d46..8aa12a2fe5283 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -240,7 +240,7 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view()); + fitter.allocateOnGPU(&(soa->view().hitIndices()), kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); @@ -255,9 +255,9 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU #endif // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.view().nTracks(); + + auto maxTracks = soa->view().metadata().size(); + auto nTracks = soa->view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 From bd40481320d3126a376b930fcc40313762f5be20 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 21 Oct 2022 15:18:25 +0200 Subject: [PATCH 060/110] MOstly ported? 
Unused class rules messages --- CUDADataFormats/Track/src/classes_def.xml | 8 +++---- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 15 ++++++------- .../plugins/CAHitNtupletGeneratorKernels.cu | 22 ++++++++----------- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 18 +++++++-------- .../plugins/CAHitNtupletGeneratorOnGPU.h | 4 ++-- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 6 ++--- .../PixelTriplets/plugins/HelixFitOnGPU.h | 2 +- 7 files changed, 34 insertions(+), 41 deletions(-) diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 9c80ae91baf29..0255d34cf80d5 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,6 @@ - - - - + + + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index c9831afc01067..c539f74b85af8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -40,12 +40,13 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool onGPU_; edm::ESGetToken tokenField_; + // GPU edm::EDGetTokenT> tokenHitGPU_; - //edm::EDPutTokenT> tokenTrackGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; + + // CPU edm::EDGetTokenT tokenHitCPU_; - //edm::EDPutTokenT tokenTrackCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; @@ -55,12 +56,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - //tokenTrackGPU_ = produces>(); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - //tokenTrackCPU_ = produces(); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 1bd491d53f509..b9a77bd48737d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -4,12 +4,10 @@ template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! - // auto *tuples_d = &tracks_d->hitIndices(); - auto *detId_d = tracks_d->view().detIndices(); - auto *quality_d = tracks_d->qualityData(); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view()); // zero tuples - cms::cuda::launchZero(tracks_d->view().hitIndices(), cudaStream); + cms::cuda::launchZero(&(tracks_d->view().hitIndices()), cudaStream); int32_t nhits = hh.nHits(); @@ -87,12 +85,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, - tracks_d->view().hitIndices()); + &tracks_d->view().hitIndices()); - kernel_fillHitDetIndices<<>>( - tracks_d->view().hitIndices(), hh.view(), tracks_d->view().detIndices()); + kernel_fillHitDetIndices<<>>(tracks_d->view(), hh.view()); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d, tracks_d->view(), device_hitTuple_apc_); + kernel_fillNLayers<<>>(tracks_d->view(), device_hitTuple_apc_); cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) @@ -225,8 +222,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr template <> void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
- // auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view()); int32_t nhits = hh.nHits(); @@ -318,7 +314,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + kernel_doStatsForTracks<<>>(tracks_d->view(), quality_d, counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -334,11 +330,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA ++iev; for (int k = 0; k < 20000; k += 500) { kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev); + hh.view(), tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev); + hh.view(), tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 8aa12a2fe5283..5dd3de3e232f8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -190,9 +190,9 @@ void CAHitNtupletGeneratorOnGPU::endJob() { float bfield, cudaStream_t stream) const { PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ - pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float 
bfield, - cudaStream_t stream) const { +pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const { pixelTrack::TrackSoA tracks(stream); auto* soa = &tracks; @@ -204,7 +204,7 @@ void CAHitNtupletGeneratorOnGPU::endJob() { kernels.launchKernels(hits_d, soa, stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa->view()); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { @@ -218,10 +218,10 @@ void CAHitNtupletGeneratorOnGPU::endJob() { std::cout << "finished building pixel tracks on GPU" << std::endl; #endif - return tracks; + return tracks.view(); } -pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { +pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { //PixelTrackHeterogeneous tracks(std::make_unique()); pixelTrack::TrackSoA tracks; @@ -236,11 +236,11 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU kernels.launchKernels(hits_d, soa, nullptr); if (0 == hits_d.nHits()) - return tracks; + return tracks.view(); // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->view().hitIndices()), kernels.tupleMultiplicity(), soa->view()); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); @@ -264,5 +264,5 @@ pixelTrack::TrackSoA CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU << " candidates"; } - return tracks; + return tracks.view(); } diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ff13d09c1361a..0c5b9531fed0c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,9 +47,9 @@ class CAHitNtupletGeneratorOnGPU { void beginJob(); void endJob(); - pixelTrack::TrackSoA makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; - pixelTrack::TrackSoA makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + pixelTrack::TrackSoAView makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 624934645338b..f757f574f6142 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,10 +1,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoAView helix_fit_results) { - tuples_ = tuples; +void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView helix_fit_results) { + tuples_ = &helix_fit_results.hitIndices(); tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 67a180c53e887..9bda40749c052 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -50,7 +50,7 @@ class HelixFitOnGPU { void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA); + void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA); void deallocateOnGPU(); private: From 21eb31b0a35f91bb75eb197c876e8ff8552143e8 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 21 Oct 2022 16:52:02 +0200 Subject: [PATCH 061/110] PixelTrackSoAFromCUDA adapted to new formats --- CUDADataFormats/Track/src/classes.h | 2 +- CUDADataFormats/Track/src/classes_def.xml | 4 + .../plugins/PixelTrackSoAFromCUDA.cc | 76 ++++++++----------- 3 files changed, 35 insertions(+), 47 deletions(-) diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 97c116f6c88d3..5870985315f14 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 0255d34cf80d5..9f320a3833ff0 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -3,4 +3,8 @@ + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index e43f6b028aa15..7cffdcb80a273 100644 --- 
a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -33,27 +33,16 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - //edm::EDGetTokenT> tokenCUDA_; - //edm::EDPutTokenT tokenSOA_; - - //edm::EDGetTokenT>> tokenCUDA_; - edm::EDGetTokenT> tokenCUDA_; - //edm::EDPutTokenT::View> tokenSOA_; - edm::EDPutTokenT tokenSOA_; - - //cms::cuda::host::unique_ptr soa_; - //cms::cuda::host::unique_ptr soa_; - //TrackSoAHeterogeneousT_test<>::View soa_; - pixelTrack::TrackSoA soa_; - pixelTrack::TrackSoAView tmp_view_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + pixelTrack::TrackSoAView soa_view_h; + pixelTrack::TrackSoALayout soa_layout_h; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - //: tokenCUDA_(consumes>(iConfig.getParameter("src"))), - // tokenSOA_(produces()) {} - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - //tokenSOA_(produces::View>()) {} - tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -62,41 +51,36 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des descriptions.add("pixelTracksSoA", desc); } -/*void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - soa_ = inputData.toHostAsync(ctx.stream()); -}*/ - void 
PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + auto const& soa_layout_d = ctx.get(inputDataWrapped); // Layout of data on device - //class_ = inputData.toHostAsync(ctx.stream()); + auto soa_buffer_h = cms::cuda::make_host_unique(soa_layout_d.metadata().byteSize(), ctx.stream()); - pixelTrack::TrackSoA soa_(ctx.stream()); - cudaCheck(cudaMemcpy(&soa_,&inputData,sizeof(pixelTrack::TrackSoA),cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpyAsync(soa_buffer_h.get(), + soa_layout_d.metadata().data(), + soa_layout_d.metadata().byteSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); + pixelTrack::TrackSoALayout soa_layout_h(soa_buffer_h.get(), soa_layout_d.metadata().size()); + pixelTrack::TrackSoAView soa_view_h(soa_layout_h); - auto retView = cms::cuda::make_host_unique(inputData.bufferSize(), ctx.stream()); - cudaCheck(cudaMemcpy(retView.get(),inputData.buffer().get(),TrackSoAHeterogeneousT_test<>::computeDataSize(32768),cudaMemcpyDeviceToHost)); - TrackSoAHeterogeneousT_test<> tmp_layout(retView.get(),32768); - TrackSoAHeterogeneousT_test<>::View tmp_view_(tmp_layout); + // // Allocate enough host memory to fit the SoA data in the input view + // auto soa_buffer_host = cms::cuda::make_host_unique(soa_.layout()., ctx.stream()); + // // Copy data from the view on device to host memory + // cudaCheck(cudaMemcpy(soa_buffer_host.get(), soa_.buffer().get(), soa_.metadata().byteSize(), cudaMemcpyDeviceToHost)); + // TrackSoAHeterogeneousT_test<> soa_layout(soa_buffer_host.get(), soa_.metadata().size()); + // TrackSoAHeterogeneousT_test<>::View soa_host_view_(soa_layout); // Store the host-side view } void 
PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // check that the fixed-size SoA does not overflow - //auto tsoa = soa_; - //auto maxTracks = tsoa.stride(); - auto maxTracks = 32768; - auto nTracks = tmp_view_.nTracks(); + auto maxTracks = soa_layout_h.metadata().size(); + auto nTracks = soa_view_h.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 @@ -104,13 +88,13 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i } #ifdef PIXEL_DEBUG_PRODUCE - std::cout << "size of SoA " << sizeof(soa_) << " stride " << maxTracks << std::endl; - std::cout << "found " << nTracks << " tracks in cpu SoA at " << &soa_ << std::endl; + std::cout << "size of SoA " << soa_layout_h.metadata().byteSize() << " stride " << maxTracks << std::endl; + std::cout << "found " << nTracks << " tracks in cpu SoA at " << soa_layout_h.metadata().data() << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = soa_.nHits(it); - assert(nHits == int(soa_.hitIndices.size(it))); + auto nHits = pixelTrack::utilities::nHits(soa_view_h, it); + assert(nHits == int(soa_view_h.hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -119,7 +103,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i #endif // DO NOT make a copy (actually TWO....) 
- iEvent.emplace(tokenSOA_, std::move(soa_));//, std::move(ret)); // view + iEvent.emplace(tokenSOA_, std::move(soa_view_h)); //, std::move(ret)); // view //assert(!soa_); } From 2234467c25abad84e0b92948dd74067e1d2cc216 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 21 Oct 2022 16:57:40 +0200 Subject: [PATCH 062/110] Adding copyToHost function --- .../Track/interface/TrackSoAHeterogeneousT_test.h | 12 ++++++++++++ .../Track/test/TrackSoAHeterogeneous_test.cpp | 8 ++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index e5485588ecb8a..630c5b70bb22d 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -11,6 +11,10 @@ #include "DataFormats/SoATemplate/interface/SoALayout.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" namespace pixelTrack { enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; @@ -147,11 +151,19 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>(S, stream) {} + + // Copy data from device to host + __host__ cms::cuda::host::unique_ptr copyToHost(cudaStream_t stream) { + auto tracks_h_soa = cms::cuda::make_host_unique(bufferSize(), stream); + cudaCheck(cudaMemcpy(tracks_h_soa.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost)); + return tracks_h_soa; + } }; namespace pixelTrack { using TrackSoA = TrackSoAHeterogeneousT; + using TrackSoALayout = 
TrackSoAHeterogeneousT_test<>; using TrackSoAView = TrackSoAHeterogeneousT_test<>::View; using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView; diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 9bfd445bd786c..4be8343e3474d 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -7,7 +7,7 @@ the CUDA kernels which: - Fill the SoA with data. - Verify that the data written is correct. - + Then, the SoA data are copied back to Host, where a temporary host-side view (tmp_view) is created using the same Layout to access the data on host and print it. @@ -36,17 +36,13 @@ int main() { // Instantiate tracks on host. Portabledevicecollection allocates // SoA on device automatically. pixelTrack::TrackSoA tracks(stream); - uint32_t soaSize = tracks.bufferSize(); // SoA Layout size (bytes) uint32_t soaNumElements = tracks->metadata().size(); // Length of each SoA array in elements // Run the tests testTrackSoAHeterogeneousT::runKernels(tracks.view()); - // Copy SoA data back to host - auto tracks_h_soa = cms::cuda::make_host_unique(soaSize, stream); - cudaCheck(cudaMemcpy(tracks_h_soa.get(), tracks.const_buffer().get(), soaSize, cudaMemcpyDeviceToHost)); - // Create a view to access the copied data + auto tracks_h_soa = tracks.copyToHost(stream); TrackSoAHeterogeneousT_test<> tmp_layout(tracks_h_soa.get(), soaNumElements); TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout); From f98898e2fef9ad932da862d65c1859f749b84a91 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 21 Oct 2022 17:50:34 +0200 Subject: [PATCH 063/110] CAHitNtuplet outputs modified to Layout(GPU) or View(CPU) --- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 6 ++++-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 10 ++++++---- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h | 6 +++++- 3 files changed, 15 
insertions(+), 7 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index c539f74b85af8..219dc21ec93d9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -41,10 +41,12 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { edm::ESGetToken tokenField_; // GPU + // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; // CPU + // Produces a view on CPU, which is used by PixelTrackProducerFromSoA edm::EDGetTokenT tokenHitCPU_; edm::EDPutTokenT tokenTrackCPU_; @@ -56,7 +58,7 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 5dd3de3e232f8..7233e0e241fcc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -190,10 +190,12 @@ void CAHitNtupletGeneratorOnGPU::endJob() { float bfield, cudaStream_t stream) const { PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ -pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { +pixelTrack::TrackSoALayout CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const { pixelTrack::TrackSoA tracks(stream); + auto 
soaNumElements = tracks->metadata().size(); + TrackSoAHeterogeneousT_test<> tmp_layout(tracks.buffer().get(), soaNumElements); auto* soa = &tracks; CAHitNtupletGeneratorKernelsGPU kernels(m_params); @@ -218,7 +220,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec std::cout << "finished building pixel tracks on GPU" << std::endl; #endif - return tracks.view(); + return tmp_layout; } pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 0c5b9531fed0c..621503f3e22a4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,8 +47,12 @@ class CAHitNtupletGeneratorOnGPU { void beginJob(); void endJob(); - pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + // On GPU + pixelTrack::TrackSoALayout makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const; + // On CPU pixelTrack::TrackSoAView makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: From d566db8155ead426f7e9dfd91e8170b824f09015 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 21 Oct 2022 18:15:44 +0200 Subject: [PATCH 064/110] Changing products to soa views --- .../plugins/PixelTrackSoAFromCUDA.cc | 22 +++++++++---------- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 4 ++-- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 6 ++--- .../plugins/CAHitNtupletGeneratorOnGPU.h | 2 +- .../PixelTriplets/src/classes.h | 1 + .../PixelTriplets/src/classes_def.xml | 4 ++++ .../plugins/PixelVertexProducerCUDA.cc | 11 +++++----- 7 files changed, 27 insertions(+), 23 deletions(-) diff --git 
a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 7cffdcb80a273..e31f195578f35 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -33,15 +33,15 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; + edm::EDGetTokenT tokenCUDA_; edm::EDPutTokenT tokenSOA_; pixelTrack::TrackSoAView soa_view_h; - pixelTrack::TrackSoALayout soa_layout_h; + //pixelTrack::TrackSoALayout soa_layout_h; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + : tokenCUDA_(consumes(iConfig.getParameter("src"))), tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -54,11 +54,11 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& soa_layout_d = ctx.get(inputDataWrapped); // Layout of data on device + soa_view_h = iEvent.get(tokenCUDA_); + //cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + //auto const& soa_view_h = ctx.get(inputDataWrapped); // Layout of data on device - auto soa_buffer_h = cms::cuda::make_host_unique(soa_layout_d.metadata().byteSize(), ctx.stream()); + /*auto soa_buffer_h = cms::cuda::make_host_unique(soa_layout_d.metadata().byteSize(), ctx.stream()); 
cudaCheck(cudaMemcpyAsync(soa_buffer_h.get(), soa_layout_d.metadata().data(), @@ -66,7 +66,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, cudaMemcpyDeviceToHost, ctx.stream())); pixelTrack::TrackSoALayout soa_layout_h(soa_buffer_h.get(), soa_layout_d.metadata().size()); - pixelTrack::TrackSoAView soa_view_h(soa_layout_h); + pixelTrack::TrackSoAView soa_view_h(soa_layout_h);*/ // // Allocate enough host memory to fit the SoA data in the input view // auto soa_buffer_host = cms::cuda::make_host_unique(soa_.layout()., ctx.stream()); @@ -79,7 +79,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // check that the fixed-size SoA does not overflow - auto maxTracks = soa_layout_h.metadata().size(); + auto maxTracks = soa_view_h.metadata().size(); auto nTracks = soa_view_h.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { @@ -88,8 +88,8 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i } #ifdef PIXEL_DEBUG_PRODUCE - std::cout << "size of SoA " << soa_layout_h.metadata().byteSize() << " stride " << maxTracks << std::endl; - std::cout << "found " << nTracks << " tracks in cpu SoA at " << soa_layout_h.metadata().data() << std::endl; + std::cout << " stride " << maxTracks << std::endl; + std::cout << "found " << nTracks << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 219dc21ec93d9..502717f263a90 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -43,7 +43,7 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { // GPU // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA edm::EDGetTokenT> tokenHitGPU_; - 
edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT tokenTrackGPU_; // CPU // Produces a view on CPU, which is used by PixelTrackProducerFromSoA @@ -58,7 +58,7 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 7233e0e241fcc..6a8de7fc49f66 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -190,12 +190,10 @@ void CAHitNtupletGeneratorOnGPU::endJob() { float bfield, cudaStream_t stream) const { PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ -pixelTrack::TrackSoALayout CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, +pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const { pixelTrack::TrackSoA tracks(stream); - auto soaNumElements = tracks->metadata().size(); - TrackSoAHeterogeneousT_test<> tmp_layout(tracks.buffer().get(), soaNumElements); auto* soa = &tracks; CAHitNtupletGeneratorKernelsGPU kernels(m_params); @@ -220,7 +218,7 @@ pixelTrack::TrackSoALayout CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingR std::cout << "finished building pixel tracks on GPU" << std::endl; #endif - return tmp_layout; + return tracks.view(); } pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 621503f3e22a4..85457f30fd19d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -48,7 +48,7 @@ class CAHitNtupletGeneratorOnGPU { void endJob(); // On GPU - pixelTrack::TrackSoALayout makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; diff --git a/RecoPixelVertexing/PixelTriplets/src/classes.h b/RecoPixelVertexing/PixelTriplets/src/classes.h index 4f495027ac186..db84e140b26de 100644 --- a/RecoPixelVertexing/PixelTriplets/src/classes.h +++ b/RecoPixelVertexing/PixelTriplets/src/classes.h @@ -1,5 +1,6 @@ #include "RecoPixelVertexing/PixelTriplets/interface/IntermediateHitTriplets.h" #include "DataFormats/Common/interface/Wrapper.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include diff --git a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml index ea89a65a45dbb..78018a50bfff3 100644 --- a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml +++ b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml @@ -1,4 +1,8 @@ + + + + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 34b0ed9e29fc1..5b316a53a691e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -16,6 +16,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "gpuVertexFinder.h" @@ -35,9 
+36,9 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUTrack_; edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; + edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; const gpuVertexFinder::Producer gpuAlgo_; @@ -62,10 +63,10 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); + consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); tokenCPUVertex_ = produces(); } } @@ -97,7 +98,7 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; From d544dc3308fe1bb803656f4d074bcc7561c82c63 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Tue, 25 Oct 2022 16:43:57 +0200 Subject: [PATCH 065/110] Solving issue with ctx.emplace --- RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc | 4 ++-- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 502717f263a90..d8a634328af7a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -43,7 +43,7 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { // GPU // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA edm::EDGetTokenT> 
tokenHitGPU_; - edm::EDPutTokenT tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; // CPU // Produces a view on CPU, which is used by PixelTrackProducerFromSoA @@ -58,7 +58,7 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 85457f30fd19d..6b9a00ef9757f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -3,7 +3,8 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" From 656d9d1acd28e332958046806f7f6b9452e1bae4 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 25 Oct 2022 17:54:35 +0200 Subject: [PATCH 066/110] PixelVertexProducer adapted to new inputs --- .../PixelTriplets/src/classes_def.xml | 4 +-- .../plugins/PixelVertexProducerCUDA.cc | 29 ++++++++--------- .../plugins/gpuVertexFinder.cc | 31 +++++++++---------- .../plugins/gpuVertexFinder.h | 6 ++-- 4 files changed, 32 insertions(+), 38 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml index 78018a50bfff3..405eedfe74760 100644 --- a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml +++ 
b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml @@ -3,6 +3,6 @@ - - + + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 5b316a53a691e..16b3267a326ce 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -36,7 +36,7 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUTrack_; edm::EDPutTokenT tokenGPUVertex_; edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; @@ -63,7 +63,7 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); + consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); @@ -98,40 +98,37 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); + auto tracks_view = ctx.get(*hTracks); - assert(tracks); - - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks_view, ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); - assert(tracks); + auto tracks_view = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE 
- auto const& tsoa = *tracks; - auto maxTracks = tsoa.stride(); - std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + auto maxTracks = tracks_view.metadata().size(); + // std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = pixelTrack::utilities::nHits(tracks_view, it); + assert(nHits == int(tracks_view.hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; } - std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; + // std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks_view, ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..fe2f00f91b495 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -18,27 +18,24 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { - assert(ptracks); + __global__ void loadTracks(TkSoAView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); + auto 
const* quality = pixelTrack::utilities::qualityData(tracks_view); auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = pixelTrack::utilities::nHits(tracks_view, idx); assert(nHits >= 3); // initialize soa... soa->idv[idx] = -1; - if (tracks.isTriplet(idx)) + if (pixelTrack::utilities::isTriplet(tracks_view, idx)) continue; // no triplets if (quality[idx] < pixelTrack::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = tracks_view[idx].pt(); if (pt < ptMin) continue; @@ -49,8 +46,8 @@ namespace gpuVertexFinder { auto& data = *pws; auto it = atomicAdd(&data.ntrks, 1); data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); + data.zt[it] = pixelTrack::utilities::zip(tracks_view, idx); + data.ezt2[it] = tracks_view[idx].covariance()(14); data.ptt2[it] = pt * pt; } } @@ -95,19 +92,19 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertexHeterogeneous Producer::make(TkSoAView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE ZVertexHeterogeneous vertices(std::make_unique()); #endif - assert(tksoa); + // assert(tksoa); auto* soa = 
vertices.get(); assert(soa); @@ -120,12 +117,12 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; + loadTracks<<>>(tracks_view, soa, ws_d.get(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + loadTracks(tracks_view, soa, ws_d.get(), ptMin, ptMax); #endif #ifdef __CUDACC__ diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 2b6a8107d927f..514c9b6a881fd 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -11,6 +11,7 @@ namespace gpuVertexFinder { using ZVertices = ZVertexSoA; using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; // workspace used in the vertex reco algos struct WorkSpace { @@ -42,7 +43,6 @@ namespace gpuVertexFinder { public: using ZVertices = ZVertexSoA; using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoA; Producer(bool oneKernel, bool useDensity, @@ -64,8 +64,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const; + ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const; + ZVertexHeterogeneous make(TkSoAView tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; From 3cdfab43ba24dcd3874c616afa29296d153ffae3 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 26 Oct 2022 12:15:35 
+0200 Subject: [PATCH 067/110] Removed duplicate entries in classes_def --- RecoPixelVertexing/PixelTriplets/src/classes_def.xml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml index 405eedfe74760..ea89a65a45dbb 100644 --- a/RecoPixelVertexing/PixelTriplets/src/classes_def.xml +++ b/RecoPixelVertexing/PixelTriplets/src/classes_def.xml @@ -1,8 +1,4 @@ - - - - From f01e173eb624eaf16237ec5d1dd207e819484a5e Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Wed, 26 Oct 2022 14:37:49 +0200 Subject: [PATCH 068/110] Changing PixelTrackProducerFromSoA to use view --- .../plugins/PixelTrackProducerFromSoA.cc | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index c5d31764b0fcb..9e4839ec8b644 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,7 +27,8 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "storeTracks.h" @@ -35,7 +36,7 @@ /** * This class creates "leagcy" reco::Track - * objects from the output of SoA CA. + * objects from the output of SoA CA. 
*/ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { public: @@ -54,7 +55,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + //const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -67,7 +69,8 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + //tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -152,12 +155,16 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); + //const auto &tsoa = *iEvent.get(tokenTrack_); + auto tsoa = iEvent.get(tokenTrack_); - auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); + //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); // auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.view().hitIndices(); - auto nTracks = tsoa.view().nTracks(); + //auto const &hitIndices = tsoa.view().hitIndices(); + //auto nTracks = tsoa.view().nTracks(); + auto const *quality = pixelTrack::utilities::qualityData(tsoa); + auto const hitIndices = tsoa.hitIndices(); + auto nTracks = tsoa.nTracks(); tracks.reserve(nTracks); @@ -167,13 +174,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); 
std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { - return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + //return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + return tsoa[i1].pt() > tsoa[i2].pt(); }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); + //auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); + auto nHits = pixelTrack::utilities::nHits(tsoa, it); assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) @@ -190,12 +199,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float chi2 = tsoa.view()[it].chi2(); - float phi = pixelTrack::utilities::phi(tsoa.view(), it); + //float chi2 = tsoa.view()[it].chi2(); + //float phi = pixelTrack::utilities::phi(tsoa.view(), it); + float chi2 = tsoa[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa, it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); + //pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa, ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); From 598ea08fc25579899b027fb317d31666c9d33340 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Wed, 26 Oct 2022 15:07:30 +0200 Subject: [PATCH 069/110] Changing layout name to avoid underscore --- .../interface/TrackSoAHeterogeneousT_test.h | 16 ++++++++-------- .../Track/test/TrackSoAHeterogeneous_test.cpp | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index 
630c5b70bb22d..f7edf60840fce 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -43,7 +43,7 @@ using Vector5f = Eigen::Matrix; using Vector15f = Eigen::Matrix; using HitContainer = pixelTrack::HitContainer; -GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, +GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousLayout, SOA_COLUMN(uint8_t, quality), SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit SOA_COLUMN(int8_t, nLayers), @@ -59,8 +59,8 @@ GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousT_test, // They operate on View and ConstView of the TrackSoA. namespace pixelTrack { namespace utilities { - using TrackSoAView = TrackSoAHeterogeneousT_test<>::View; - using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView; + using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; using Quality = pixelTrack::Quality; using hindex_type = uint32_t; // State at the Beam spot @@ -144,13 +144,13 @@ namespace pixelTrack { } // namespace pixelTrack template -class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { +class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { public: TrackSoAHeterogeneousT() = default; // Constructor which specifies the SoA size explicit TrackSoAHeterogeneousT(cudaStream_t stream) - : PortableDeviceCollection>(S, stream) {} + : PortableDeviceCollection>(S, stream) {} // Copy data from device to host __host__ cms::cuda::host::unique_ptr copyToHost(cudaStream_t stream) { @@ -163,9 +163,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection; - using TrackSoALayout = TrackSoAHeterogeneousT_test<>; - using TrackSoAView = TrackSoAHeterogeneousT_test<>::View; - using TrackSoAConstView = TrackSoAHeterogeneousT_test<>::ConstView; + using TrackSoALayout = TrackSoAHeterogeneousLayout<>; + using 
TrackSoAView = TrackSoAHeterogeneousLayout<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; } // namespace pixelTrack diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 4be8343e3474d..db26e83428f56 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -43,8 +43,8 @@ int main() { // Create a view to access the copied data auto tracks_h_soa = tracks.copyToHost(stream); - TrackSoAHeterogeneousT_test<> tmp_layout(tracks_h_soa.get(), soaNumElements); - TrackSoAHeterogeneousT_test<>::View tmp_view(tmp_layout); + TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements); + TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout); // Print results std::cout << "pt" From 11acc8dad5fe537c3ec877716a2bd561b1affac8 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 26 Oct 2022 15:34:40 +0200 Subject: [PATCH 070/110] Correct instantiation of tracks on host-side --- .../interface/TrackSoAHeterogeneousT_test.h | 5 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 48 +++++++++-------- .../plugins/CAHitNtupletGeneratorKernels.cu | 54 ++++++++++--------- .../plugins/CAHitNtupletGeneratorKernels.h | 4 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 29 +++++----- 5 files changed, 75 insertions(+), 65 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h index f7edf60840fce..323b41226bee0 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h @@ -153,8 +153,9 @@ class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection>(S, stream) {} // Copy data from device to host - __host__ cms::cuda::host::unique_ptr copyToHost(cudaStream_t stream) { - auto tracks_h_soa = 
cms::cuda::make_host_unique(bufferSize(), stream); + // Copy data from device to host + __host__ std::unique_ptr copyToHost(cudaStream_t stream) { + auto tracks_h_soa = std::make_unique(bufferSize()); cudaCheck(cudaMemcpy(tracks_h_soa.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost)); return tracks_h_soa; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 80497d3dd706b..cdefeab9e36b7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -78,7 +78,9 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr } template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // auto *tuples_d = tracks_d->view().hitIndices(); // auto *detId_d = tracks_d->view().detIndices(); // auto *quality_d = tracks_d->qualityData(); @@ -86,7 +88,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * // assert(tuples_d && quality_d); // TODO Find equivalent for View // zero tuples - cms::cuda::launchZero(&tracks_d->view().hitIndices(), cudaStream); + cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); auto nhits = hh.nHits(); @@ -119,22 +121,22 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tracks_d->view(), + tracks_view, device_hitTuple_apc_, params_.minHitsPerNtuplet_); if (params_.doStats_) kernel_mark_used(device_theCells_.get(), device_nCells_); - cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_d->view().hitIndices()); + cms::cuda::finalizeBulk(device_hitTuple_apc_, 
&tracks_view.hitIndices()); - kernel_fillHitDetIndices(tracks_d->view(), hh.view()); - kernel_fillNLayers(tracks_d->view(), device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_view, hh.view()); + kernel_fillNLayers(tracks_view, device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); - kernel_countMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get()); + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); + kernel_countMultiplicity(tracks_view, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tracks_d->view(), device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_view, device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -142,13 +144,15 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * } template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { int32_t nhits = hh.nHits(); // auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view()); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // classify tracks based on kinematics - kernel_classifyTracks(tracks_d->view(), quality_d, params_.cuts_); + kernel_classifyTracks(tracks_view, quality_d, params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -156,34 +160,34 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // 
remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tracks_d->view(), device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_view, device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tracks_d->view(), device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_view, device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { kernel_rejectDuplicate( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner( - hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } } if (params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tracks_d->view(), + kernel_checkOverflows(tracks_view, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), device_hitTuple_apc_, @@ -201,7 +205,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // counters (add flag???) std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tracks_d->view(), quality_d, counters_); + kernel_doStatsForTracks(tracks_view, quality_d, counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -210,7 +214,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tracks_d->view(), device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh.view(), tracks_view, device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index b9a77bd48737d..9cbdcae1a13d8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -2,12 +2,14 @@ #include template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // these are pointer on GPU! 
- auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view()); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // zero tuples - cms::cuda::launchZero(&(tracks_d->view().hitIndices()), cudaStream); + cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream); int32_t nhits = hh.nHits(); @@ -68,7 +70,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tracks_d->view(), + tracks_view, device_hitTuple_apc_, params_.minHitsPerNtuplet_); cudaCheck(cudaGetLastError()); @@ -85,26 +87,24 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, - &tracks_d->view().hitIndices()); + &tracks_view.hitIndices()); - kernel_fillHitDetIndices<<>>(tracks_d->view(), hh.view()); + kernel_fillHitDetIndices<<>>(tracks_view, hh.view()); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d->view(), device_hitTuple_apc_); + kernel_fillNLayers<<>>(tracks_view, device_hitTuple_apc_); cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); blockSize = 128; numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>(tracks_d->view(), - device_tupleMultiplicity_.get()); + kernel_countMultiplicity<<>>(tracks_view, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>(tracks_d->view(), - device_tupleMultiplicity_.get()); + 
kernel_fillMultiplicity<<>>(tracks_view, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -220,9 +220,11 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr } template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // these are pointer on GPU! - auto *quality_d = pixelTrack::utilities::qualityData(tracks_d->view()); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); int32_t nhits = hh.nHits(); @@ -230,7 +232,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>(tracks_d->view(), quality_d, params_.cuts_); + kernel_classifyTracks<<>>(tracks_view, quality_d, params_.cuts_); cudaCheck(cudaGetLastError()); @@ -245,7 +247,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d->view(), params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -255,13 +257,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>(tracks_d->view(), device_hitToTuple_.get()); + kernel_countHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); assert((hitToTupleView_.assoc == 
device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tracks_d->view(), device_hitToTuple_.get()); + kernel_fillHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -273,17 +275,17 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_rejectDuplicate<<>>( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner<<>>( - hh.view(), tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner<<>>( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner<<>>( - tracks_d->view(), params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -293,7 +295,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tracks_d->view(), + kernel_checkOverflows<<>>(tracks_view, 
device_tupleMultiplicity_.get(), device_hitToTuple_.get(), device_hitTuple_apc_, @@ -314,7 +316,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tracks_d->view(), quality_d, counters_); + kernel_doStatsForTracks<<>>(tracks_view, quality_d, counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -330,11 +332,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA ++iev; for (int k = 0; k < 20000; k += 500) { kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tracks_d->view(), device_hitToTuple_.get(), k, k + 500, iev); + hh.view(), tracks_view, device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tracks_d->view(), device_hitToTuple_.get(), 20000, 1000000, iev); + hh.view(), tracks_view, device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index fcab52e96d210..5a82798905b13 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -186,9 +186,9 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void launchKernels(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void 
classifyTuples(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 6a8de7fc49f66..f4ab7d3e83504 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -191,8 +191,8 @@ void CAHitNtupletGeneratorOnGPU::endJob() { cudaStream_t stream) const { PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { + float bfield, + cudaStream_t stream) const { pixelTrack::TrackSoA tracks(stream); auto* soa = &tracks; @@ -201,7 +201,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec kernels.allocateOnGPU(hits_d.nHits(), stream); kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); + kernels.launchKernels(hits_d, soa->view(), stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); @@ -210,7 +210,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d, soa->view(), stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -223,9 +223,12 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { //PixelTrackHeterogeneous 
tracks(std::make_unique()); - pixelTrack::TrackSoA tracks; + // pixelTrack::TrackSoA tracks; - auto* soa = &tracks; + auto tracks_h_soa = + std::make_unique(TrackSoAHeterogeneousLayout<>::computeDataSize(pixelTrack::maxNumber())); + TrackSoAHeterogeneousLayout<> tracks_layout(tracks_h_soa.get(), pixelTrack::maxNumber()); + TrackSoAHeterogeneousLayout<>::View tracks_view(tracks_layout); //assert(soa); CAHitNtupletGeneratorKernelsCPU kernels(m_params); @@ -233,14 +236,14 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D kernels.allocateOnGPU(hits_d.nHits(), nullptr); kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); + kernels.launchKernels(hits_d, tracks_view, nullptr); if (0 == hits_d.nHits()) - return tracks.view(); + return tracks_view; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks_view); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); @@ -248,7 +251,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, soa, nullptr); + kernels.classifyTuples(hits_d, tracks_view, nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; @@ -256,13 +259,13 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D // check that the fixed-size SoA does not overflow - auto maxTracks = soa->view().metadata().size(); - auto nTracks = soa->view().nTracks(); + auto maxTracks = tracks_view.metadata().size(); + auto nTracks = tracks_view.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel 
tracks truncated to " << maxTracks - 1 << " candidates"; } - return tracks.view(); + return tracks_view; } From 34bf18b58c4571601209d443d86d1c0f11ee1777 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Mon, 31 Oct 2022 15:48:29 +0100 Subject: [PATCH 071/110] Creating two PortableCollections: host and device --- ...ogeneousT_test.h => PixelTrackUtilities.h} | 35 ++----------------- .../interface/TrackSoAHeterogeneousDevice.h | 35 +++++++++++++++++++ .../interface/TrackSoAHeterogeneousHost.h | 32 +++++++++++++++++ .../Track/src/TrackSoAHeterogeneous_t_test.cc | 3 +- CUDADataFormats/Track/src/classes.h | 3 +- .../Track/test/TrackSoAHeterogeneous_t.cpp | 3 +- .../Track/test/TrackSoAHeterogeneous_test.cpp | 35 ++++++++++++------- .../Track/test/TrackSoAHeterogeneous_test.cu | 17 +++++---- 8 files changed, 109 insertions(+), 54 deletions(-) rename CUDADataFormats/Track/interface/{TrackSoAHeterogeneousT_test.h => PixelTrackUtilities.h} (80%) create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h similarity index 80% rename from CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h rename to CUDADataFormats/Track/interface/PixelTrackUtilities.h index 323b41226bee0..08ed721b20052 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -1,20 +1,10 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include -#include +#ifndef CUDADataFormats_Track_PixelTrackUtilities_h +#define CUDADataFormats_Track_PixelTrackUtilities_h #include #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include 
"DataFormats/SoATemplate/interface/SoALayout.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" -#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" namespace pixelTrack { enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; @@ -143,31 +133,12 @@ namespace pixelTrack { } // namespace utilities } // namespace pixelTrack -template -class TrackSoAHeterogeneousT : public cms::cuda::PortableDeviceCollection> { -public: - TrackSoAHeterogeneousT() = default; - - // Constructor which specifies the SoA size - explicit TrackSoAHeterogeneousT(cudaStream_t stream) - : PortableDeviceCollection>(S, stream) {} - - // Copy data from device to host - // Copy data from device to host - __host__ std::unique_ptr copyToHost(cudaStream_t stream) { - auto tracks_h_soa = std::make_unique(bufferSize()); - cudaCheck(cudaMemcpy(tracks_h_soa.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost)); - return tracks_h_soa; - } -}; - namespace pixelTrack { - using TrackSoA = TrackSoAHeterogeneousT; using TrackSoALayout = TrackSoAHeterogeneousLayout<>; using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; } // namespace pixelTrack -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H +#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..cbafb46c9e099 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -0,0 +1,35 @@ +#ifndef 
CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" + +template +class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + //TrackSoAHeterogeneousDevice() = default; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} + + // Copy data from device to host + __host__ void copyToHost(cms::cuda::host::unique_ptr &host_ptr, cudaStream_t stream) { + cudaCheck(cudaMemcpy(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost)); + } +}; + +namespace pixelTrack { + + using TrackSoADevice = TrackSoAHeterogeneousDevice; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..276ddabcc39d4 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -0,0 +1,32 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" + +template +class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + TrackSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} + + + +}; + +namespace pixelTrack { + + using TrackSoAHost = TrackSoAHeterogeneousHost; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc index b15debe3cb72b..24792bb6350f8 100644 --- a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc +++ b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc @@ -1 +1,2 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 5870985315f14..338598e28ebf5 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp index 
9708b689dd05b..b3d62ffa810f6 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include #include diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index db26e83428f56..be7d5fc7e6c1c 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -14,7 +14,8 @@ */ #include -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" @@ -22,7 +23,7 @@ namespace testTrackSoAHeterogeneousT { - void runKernels(pixelTrack::TrackSoAView tracks_view); + void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream); } int main() { @@ -35,16 +36,24 @@ int main() { { // Instantiate tracks on host. Portabledevicecollection allocates // SoA on device automatically. 
- pixelTrack::TrackSoA tracks(stream); - uint32_t soaNumElements = tracks->metadata().size(); // Length of each SoA array in elements + // pixelTrack::TrackSoADevice tracks(stream); + // uint32_t soaNumElements = tracks->metadata().size(); // Length of each SoA array in elements + // + // // Run the tests + // testTrackSoAHeterogeneousT::runKernels(tracks.view()); + // + // // Create a view to access the copied data + // auto tracks_h_soa = tracks.copyToHost(stream); + // TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements); + // TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout); - // Run the tests - testTrackSoAHeterogeneousT::runKernels(tracks.view()); + // pixelTrack::TrackSoAHost tracks_h(stream); + // pixelTrack::TrackSoADevice tracks_d(stream); + // testTrackSoAHeterogeneousT::runKernels(tracks_d.view()); + // tracks_d.copyToHost(tracks_h.buffer(), stream); - // Create a view to access the copied data - auto tracks_h_soa = tracks.copyToHost(stream); - TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements); - TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout); + pixelTrack::TrackSoAHost tracks_h(stream); + testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), stream); // Print results std::cout << "pt" @@ -60,9 +69,9 @@ int main() { << "hitIndices off" << std::endl; for (int i = 0; i < 10; ++i) { - std::cout << tmp_view[i].pt() << "\t" << tmp_view[i].eta() << "\t" << tmp_view[i].chi2() << "\t" - << (int)tmp_view[i].quality() << "\t" << (int)tmp_view[i].nLayers() << "\t" - << tmp_view.hitIndices().off[i] << std::endl; + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() << "\t" + << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; } } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu 
b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index 9c59d867629b2..b7602fb790752 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -1,5 +1,7 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" namespace testTrackSoAHeterogeneousT { @@ -20,7 +22,7 @@ namespace testTrackSoAHeterogeneousT { } // TODO: Using TrackSoAConstView fails to assert hitIndices correctly - __global__ void verify(pixelTrack::TrackSoAView tracks_view) { + __global__ void verify(pixelTrack::TrackSoAConstView tracks_view) { int i = threadIdx.x; if (i == 0) { @@ -37,10 +39,13 @@ namespace testTrackSoAHeterogeneousT { } } - void runKernels(pixelTrack::TrackSoAView tracks_view) { - fill<<<1, 1024>>>(tracks_view); - cudaDeviceSynchronize(); - verify<<<1, 1024>>>(tracks_view); + void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream) { + fill<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + verify<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); } } // namespace testTrackSoAHeterogeneousT From 45ace5b170a606cc9851822311be355cd7ddb63e Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 16:31:48 +0100 Subject: [PATCH 072/110] Updating utilities to use references instead of instances --- .../Track/interface/PixelTrackUtilities.h | 24 +++++++++---------- .../Track/test/TrackSoAHeterogeneous_test.cpp | 4 ++-- .../Track/test/TrackSoAHeterogeneous_test.cu | 5 ++-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git 
a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h index 08ed721b20052..e5b44edb3b752 100644 --- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -55,21 +55,21 @@ namespace pixelTrack { using hindex_type = uint32_t; // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip - __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) { + __host__ __device__ inline float charge(TrackSoAConstView &tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } - __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } + __host__ __device__ inline float phi(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); } - __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } + __host__ __device__ inline float tip(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); } - __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + __host__ __device__ inline float zip(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); } - __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { return tracks[i].nLayers() == 3; } + __host__ __device__ inline bool isTriplet(TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; } template __host__ __device__ inline void copyFromCircle( - TrackSoAView tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { tracks[i].state() << cp.template cast(), lp.template cast(); tracks[i].state()(2) = tracks[i].state()(2) * b; @@ -89,7 +89,7 @@ namespace pixelTrack { } template - __host__ __device__ inline void copyFromDense(TrackSoAView 
tracks, V5 const &v, M5 const &cov, int32_t i) { + __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, V5 const &v, M5 const &cov, int32_t i) { tracks[i].state() = v.template cast(); for (int j = 0, ind = 0; j < 5; ++j) for (auto k = j; k < 5; ++k) @@ -97,7 +97,7 @@ namespace pixelTrack { } template - __host__ __device__ inline void copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) { + __host__ __device__ inline void copyToDense(TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) { v = tracks[i].state().template cast(); for (int j = 0, ind = 0; j < 5; ++j) { cov(j, j) = tracks[i].covariance()(ind++); @@ -107,7 +107,7 @@ namespace pixelTrack { } // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer - __host__ __device__ inline int computeNumberOfLayers(TrackSoAView tracks, int32_t i) { + __host__ __device__ inline int computeNumberOfLayers(TrackSoAView &tracks, int32_t i) { auto pdet = tracks.detIndices().begin(i); int nl = 1; auto ol = phase1PixelTopology::getLayer(*pdet); @@ -119,14 +119,14 @@ namespace pixelTrack { } return nl; } - __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); } + __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } // Casts quality SoA data (uint8_t) to pixelTrack::Quality. 
This is required // to use the data as an enum instead of a plain uint8_t - __host__ __device__ inline const Quality *qualityData(TrackSoAConstView tracks) { + __host__ __device__ inline const Quality *qualityData(TrackSoAConstView &tracks) { return reinterpret_cast(tracks.quality()); } - __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { + __host__ __device__ inline Quality *qualityData(TrackSoAView &tracks) { return reinterpret_cast(tracks.quality()); } diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index be7d5fc7e6c1c..ef04698c3a104 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -69,8 +69,8 @@ int main() { << "hitIndices off" << std::endl; for (int i = 0; i < 10; ++i) { - std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() << "\t" - << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" << tracks_h.view().hitIndices().off[i] << std::endl; } } diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu index b7602fb790752..8273f011ace80 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -21,8 +21,8 @@ namespace testTrackSoAHeterogeneousT { } } - // TODO: Using TrackSoAConstView fails to assert hitIndices correctly - __global__ void verify(pixelTrack::TrackSoAConstView tracks_view) { + // TODO: Use TrackSoAConstView when https://github.com/cms-sw/cmssw/pull/39919 is merged + __global__ void verify(pixelTrack::TrackSoAView 
tracks_view) { int i = threadIdx.x; if (i == 0) { @@ -43,6 +43,7 @@ namespace testTrackSoAHeterogeneousT { fill<<<1, 1024, 0, stream>>>(tracks_view); cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); + verify<<<1, 1024, 0, stream>>>(tracks_view); cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); From eff7567607df8d3f00a6e8f2632c68e934f42524 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 16:35:34 +0100 Subject: [PATCH 073/110] Cleanup test --- .../Track/test/TrackSoAHeterogeneous_test.cpp | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index ef04698c3a104..0ad6863d4f8c7 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -34,26 +34,15 @@ int main() { // Inner scope to deallocate memory before destroying the stream { - // Instantiate tracks on host. Portabledevicecollection allocates + // Instantiate tracks on device. PortableDeviceCollection allocates // SoA on device automatically. 
- // pixelTrack::TrackSoADevice tracks(stream); - // uint32_t soaNumElements = tracks->metadata().size(); // Length of each SoA array in elements - // - // // Run the tests - // testTrackSoAHeterogeneousT::runKernels(tracks.view()); - // - // // Create a view to access the copied data - // auto tracks_h_soa = tracks.copyToHost(stream); - // TrackSoAHeterogeneousLayout<> tmp_layout(tracks_h_soa.get(), soaNumElements); - // TrackSoAHeterogeneousLayout<>::View tmp_view(tmp_layout); - - // pixelTrack::TrackSoAHost tracks_h(stream); - // pixelTrack::TrackSoADevice tracks_d(stream); - // testTrackSoAHeterogeneousT::runKernels(tracks_d.view()); - // tracks_d.copyToHost(tracks_h.buffer(), stream); + pixelTrack::TrackSoADevice tracks_d(stream); + testTrackSoAHeterogeneousT::runKernels(tracks_d.view(), stream); + // Instantate tracks on host. This is where the data will be + // copied to from device. pixelTrack::TrackSoAHost tracks_h(stream); - testTrackSoAHeterogeneousT::runKernels(tracks_h.view(), stream); + tracks_d.copyToHost(tracks_h.buffer(), stream); // Print results std::cout << "pt" From 8aeb647f3cd9760d74ccfed0a55a9d4fd8c43b43 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 16:40:46 +0100 Subject: [PATCH 074/110] Minor cleanup, comments, namespace usage --- CUDADataFormats/Track/interface/PixelTrackUtilities.h | 6 +++--- .../Track/interface/TrackSoAHeterogeneousDevice.h | 5 +---- .../Track/interface/TrackSoAHeterogeneousHost.h | 8 +------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h index e5b44edb3b752..1c7ffe22711e8 100644 --- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -119,7 +119,7 @@ namespace pixelTrack { } return nl; } - __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } + 
__host__ __device__ inline int nHits(TrackSoAView &tracks, int i) { return tracks.detIndices().size(i); } // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required // to use the data as an enum instead of a plain uint8_t @@ -134,11 +134,11 @@ namespace pixelTrack { } // namespace pixelTrack namespace pixelTrack { - + // Common types for both Host and Device code using TrackSoALayout = TrackSoAHeterogeneousLayout<>; using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; } // namespace pixelTrack -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h +#endif diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h index cbafb46c9e099..a77643de29001 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -6,10 +6,7 @@ #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" -//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" template class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { @@ -28,7 +25,7 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection; + using TrackSoADevice = TrackSoAHeterogeneousDevice; } // namespace pixelTrack diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h index 276ddabcc39d4..a4b18134066a3 100644 --- 
a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -5,10 +5,7 @@ #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Common/interface/PortableHostCollection.h" -//#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" -//#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" template class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { @@ -18,14 +15,11 @@ class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection>(S, stream) {} - - - }; namespace pixelTrack { - using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoAHost = TrackSoAHeterogeneousHost; } // namespace pixelTrack From 672aab19a7333dceba0bcf0f5e33305b575b91be Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 17:06:45 +0100 Subject: [PATCH 075/110] Updating products in modules --- .../plugins/PixelTrackProducerFromSoA.cc | 7 +-- .../plugins/PixelTrackSoAFromCUDA.cc | 53 +++++++------------ .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 12 ++--- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 40 ++++++-------- .../plugins/CAHitNtupletGeneratorOnGPU.h | 11 ++-- 5 files changed, 52 insertions(+), 71 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 9e4839ec8b644..5ffa051c27cfc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,8 +27,9 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -//#include 
"CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "storeTracks.h" @@ -56,7 +57,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; //const edm::EDGetTokenT tokenTrack_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index e31f195578f35..594081963bb90 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -2,8 +2,9 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -33,16 +34,15 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - 
edm::EDGetTokenT tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - pixelTrack::TrackSoAView soa_view_h; - //pixelTrack::TrackSoALayout soa_layout_h; + pixelTrack::TrackSoAHost tracks_h; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -54,33 +54,18 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - soa_view_h = iEvent.get(tokenCUDA_); - //cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - //auto const& soa_view_h = ctx.get(inputDataWrapped); // Layout of data on device - - /*auto soa_buffer_h = cms::cuda::make_host_unique(soa_layout_d.metadata().byteSize(), ctx.stream()); - - cudaCheck(cudaMemcpyAsync(soa_buffer_h.get(), - soa_layout_d.metadata().data(), - soa_layout_d.metadata().byteSize(), - cudaMemcpyDeviceToHost, - ctx.stream())); - pixelTrack::TrackSoALayout soa_layout_h(soa_buffer_h.get(), soa_layout_d.metadata().size()); - pixelTrack::TrackSoAView soa_view_h(soa_layout_h);*/ - - // // Allocate enough host memory to fit the SoA data in the input view - // auto soa_buffer_host = cms::cuda::make_host_unique(soa_.layout()., ctx.stream()); - - // // Copy data from the view on device to host memory - // cudaCheck(cudaMemcpy(soa_buffer_host.get(), soa_.buffer().get(), soa_.metadata().byteSize(), cudaMemcpyDeviceToHost)); - // TrackSoAHeterogeneousT_test<> soa_layout(soa_buffer_host.get(), soa_.metadata().size()); - // TrackSoAHeterogeneousT_test<>::View 
soa_host_view_(soa_layout); // Store the host-side view + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const tracks_d = ctx.get(inputDataWrapped); // Tracks on device + + pixelTrack::TrackSoAHost tracks_h(ctx.stream()); + tracks_d.copyToHost(tracks_h.buffer(), ctx.stream()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // check that the fixed-size SoA does not overflow - auto maxTracks = soa_view_h.metadata().size(); - auto nTracks = soa_view_h.nTracks(); + auto maxTracks = tracks_h.view().metadata().size(); + auto nTracks = tracks_h.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 @@ -93,8 +78,8 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = pixelTrack::utilities::nHits(soa_view_h, it); - assert(nHits == int(soa_view_h.hitIndices().size(it))); + auto nHits = pixelTrack::utilities::nHits(tracks_h.view(), it); + assert(nHits == int(tracks_h.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -103,7 +88,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i #endif // DO NOT make a copy (actually TWO....) 
- iEvent.emplace(tokenSOA_, std::move(soa_view_h)); //, std::move(ret)); // view + iEvent.emplace(tokenSOA_, std::move(tracks_h)); //, std::move(ret)); // view //assert(!soa_); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index d8a634328af7a..2e48865b682bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -20,8 +20,8 @@ #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::global::EDProducer<> { @@ -43,12 +43,12 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { // GPU // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; // CPU // Produces a view on CPU, which is used by PixelTrackProducerFromSoA edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; @@ -58,10 +58,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f4ab7d3e83504..180711886c8d1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -19,8 +19,8 @@ #include "FWCore/Utilities/interface/isFinite.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" - -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -190,10 +190,10 @@ void CAHitNtupletGeneratorOnGPU::endJob() { float bfield, cudaStream_t stream) const { PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ -pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { - pixelTrack::TrackSoA tracks(stream); +pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const { + pixelTrack::TrackSoADevice tracks(stream); auto* soa = &tracks; CAHitNtupletGeneratorKernelsGPU kernels(m_params); @@ -218,32 +218,26 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRec std::cout << "finished building pixel tracks on GPU" << std::endl; #endif - return tracks.view(); + return tracks; } -pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - //PixelTrackHeterogeneous tracks(std::make_unique()); - // pixelTrack::TrackSoA tracks; - - auto tracks_h_soa = - std::make_unique(TrackSoAHeterogeneousLayout<>::computeDataSize(pixelTrack::maxNumber())); - TrackSoAHeterogeneousLayout<> tracks_layout(tracks_h_soa.get(), 
pixelTrack::maxNumber()); - TrackSoAHeterogeneousLayout<>::View tracks_view(tracks_layout); - //assert(soa); +pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { + pixelTrack::TrackSoADevice tracks(stream); + auto* soa = &tracks; CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), nullptr); kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, tracks_view, nullptr); + kernels.launchKernels(hits_d, tracks.view(), nullptr); if (0 == hits_d.nHits()) - return tracks_view; + return tracks; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks_view); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); @@ -251,7 +245,7 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, tracks_view, nullptr); + kernels.classifyTuples(hits_d, tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; @@ -259,13 +253,13 @@ pixelTrack::TrackSoAView CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2D // check that the fixed-size SoA does not overflow - auto maxTracks = tracks_view.metadata().size(); - auto nTracks = tracks_view.nTracks(); + auto maxTracks = tracks.view().metadata().size(); + auto nTracks = tracks.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 << " candidates"; } - return tracks_view; + return tracks; } diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 6b9a00ef9757f..323ad0d071f0c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -3,8 +3,8 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -29,7 +29,8 @@ class CAHitNtupletGeneratorOnGPU { using hindex_type = TrackingRecHit2DSOAView::hindex_type; using Quality = pixelTrack::Quality; - using OutputSoAView = pixelTrack::TrackSoAView; + using OutputSoAHost = pixelTrack::TrackSoAHost; + using OutputSoADevice = pixelTrack::TrackSoADevice; using HitContainer = pixelTrack::HitContainer; using Tuple = HitContainer; @@ -49,12 +50,12 @@ class CAHitNtupletGeneratorOnGPU { void endJob(); // On GPU - pixelTrack::TrackSoAView makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + pixelTrack::TrackSoADevice makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; // On CPU - pixelTrack::TrackSoAView makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + pixelTrack::TrackSoAHost makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; From be09c9aa34ed9a18e47cb482032185e40318a8bb Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 17:17:30 +0100 Subject: [PATCH 076/110] GPUCACell, PixelTrackDumpCUDA fix include --- 
.../Track/interface/PixelTrackHeterogeneous.h | 6 ------ .../plugins/PixelTrackDumpCUDA.cc | 20 +++++++++---------- .../PixelTriplets/plugins/GPUCACell.h | 4 ++-- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index c0e5c99b6fd28..73ec80e6322a2 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -1,10 +1,4 @@ #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h #define CUDADataFormats_Track_PixelTrackHeterogeneous_h -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -//#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" - -using PixelTrackHeterogeneous = HeterogeneousSoA; - #endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index 59489c8e11f5f..f97dfecfff370 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,8 +1,8 @@ #include #include "CUDADataFormats/Common/interface/Product.h" -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" @@ -31,11 +31,12 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID 
streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - //edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUTrack_; + // GPU + edm::EDGetTokenT> tokenGPUTrack_; edm::EDGetTokenT> tokenGPUVertex_; - //edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoATrack_; + + // CPU + edm::EDGetTokenT tokenSoATrack_; edm::EDGetTokenT tokenSoAVertex_; }; @@ -43,13 +44,11 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = - //consumes>(iConfig.getParameter("pixelTrackSrc")); - consumes>(iConfig.getParameter("pixelTrackSrc")); + consumes>(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = consumes>(iConfig.getParameter("pixelVertexSrc")); } else { - //tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -71,7 +70,6 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - //auto const* tsoa = tracks.get(); auto const* tsoa = &tracks; assert(tsoa); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index a0c3930d1a739..0e1c322c051f8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -14,8 +14,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include 
"CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CAConstants.h" class GPUCACell { From a985dc6f28084712bf9bf60e95d00e073ac9ac90 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 17:18:05 +0100 Subject: [PATCH 077/110] Removed unused file --- RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 0e1c322c051f8..b448e16a35e4b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -15,7 +15,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + #include "CAConstants.h" class GPUCACell { From 5c4e8c953af09c51b587a5f84c732d30d2bcf7e6 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 17:22:25 +0100 Subject: [PATCH 078/110] Updated CAHitNtuplet.. 
inclusions --- .../PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h | 6 ++---- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index d7901041902d3..529f7de99ea98 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,8 +3,8 @@ // #define GPU_DEBUG -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + #include "GPUCACell.h" // #define DUMP_GPU_TK_TUPLES @@ -35,7 +35,6 @@ namespace cAHitNtupletGenerator { using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; using TkSoAView = pixelTrack::TrackSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; @@ -176,7 +175,6 @@ class CAHitNtupletGeneratorKernels { using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; using TkSoAView = pixelTrack::TrackSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index a3a8eb97a43d7..afe4aaa11f70b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -15,7 +15,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CAConstants.h" #include "CAHitNtupletGeneratorKernels.h" @@ -30,7 +30,6 @@ using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; -using TkSoA = pixelTrack::TrackSoA; using TkSoAView = pixelTrack::TrackSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; From 9c1885a7ab00408ff47ad4d4ac5a644de70fb253 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Mon, 31 Oct 2022 17:41:33 +0100 Subject: [PATCH 079/110] Updating RecoPixelVertexing/PixelVertexFinding to Host/Device --- .../plugins/PixelVertexProducerCUDA.cc | 27 ++++++++++--------- .../plugins/gpuVertexFinder.cc | 9 ++++--- .../plugins/gpuVertexFinder.h | 9 +++---- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 16b3267a326ce..7d8ea3485c447 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -16,7 +16,8 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "gpuVertexFinder.h" @@ -36,9 +37,9 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; - 
edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUTrack_; edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; + edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; const gpuVertexFinder::Producer gpuAlgo_; @@ -63,10 +64,10 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); + consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); tokenCPUVertex_ = produces(); } } @@ -98,29 +99,29 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto tracks_view = ctx.get(*hTracks); + auto &tracks = ctx.get(*hTracks); - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks_view, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto tracks_view = iEvent.get(tokenCPUTrack_); + auto & tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE - auto maxTracks = tracks_view.metadata().size(); + auto maxTracks = tracks.view().metadata().size(); // std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = pixelTrack::utilities::nHits(tracks_view, it); - assert(nHits == int(tracks_view.hitIndices().size(it))); + auto nHits = 
pixelTrack::utilities::nHits(tracks.view(), it); + assert(nHits == int(tracks.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -128,7 +129,7 @@ void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, // std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks_view, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index a476c95a5e78a..66de3fe8c99f7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -7,7 +7,7 @@ #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #undef PIXVERTEX_DEBUG_PRODUCE @@ -19,9 +19,10 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; + //using TkSoAView = pixelTrack::TrackSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; - __global__ void loadTracks(TkSoAView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { + __global__ void loadTracks(TkSoAConstView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { assert(soa); auto const* quality = pixelTrack::utilities::qualityData(tracks_view); @@ -95,13 +96,13 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, 
float ptMax) const { + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); #else - ZVertexHeterogeneous Producer::make(TkSoAView tracks_view, float ptMin, float ptMax) const { + ZVertexHeterogeneous Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index a890c53b20cb8..98bb9d75530d4 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -6,13 +6,12 @@ #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" //#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" namespace gpuVertexFinder { using ZVertices = ZVertexSoA; - using TkSoA = pixelTrack::TrackSoA; - using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; // workspace used in the vertex reco algos struct WorkSpace { @@ -65,8 +64,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAView tracks_view, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoAView tracks_view, float ptMin, float ptMax) const; + ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const; + ZVertexHeterogeneous make(TkSoAConstView tracks_view, float ptMin, float ptMax) 
const; private: const bool oneKernel_; From 5b98c63b0ded588e3eb4bef013a2a3ff01bb8cf2 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Mon, 31 Oct 2022 17:49:49 +0100 Subject: [PATCH 080/110] Changing nHits to ConstView --- CUDADataFormats/Track/interface/PixelTrackUtilities.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h index 1c7ffe22711e8..e202dced07307 100644 --- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -119,7 +119,7 @@ namespace pixelTrack { } return nl; } - __host__ __device__ inline int nHits(TrackSoAView &tracks, int i) { return tracks.detIndices().size(i); } + __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required // to use the data as an enum instead of a plain uint8_t From f6c5e8ae554696c2afd302172b120aa789a10b0b Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 17:50:59 +0100 Subject: [PATCH 081/110] Fixes for HelixFit, classes_def --- .../Track/interface/TrackSoAHeterogeneousDevice.h | 2 +- CUDADataFormats/Track/src/classes_def.xml | 14 ++++++-------- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 3 +-- .../PixelTriplets/plugins/HelixFitOnGPU.h | 3 +-- RecoPixelVertexing/PixelTriplets/src/classes.h | 1 - 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h index a77643de29001..611f98d7d9dae 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -11,7 +11,7 @@ template class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { public: - 
//TrackSoAHeterogeneousDevice() = default; + TrackSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this // Constructor which specifies the SoA size explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 9f320a3833ff0..c4337b0b7ee06 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,10 +1,8 @@ - - - - - - - - + + + + + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 180711886c8d1..37454b9065f0a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -222,8 +222,7 @@ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingR } pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - pixelTrack::TrackSoADevice tracks(stream); - auto* soa = &tracks; + pixelTrack::TrackSoAHost tracks; CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 1a1283b9079c9..9fd2112476c9d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,8 +1,7 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h -//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" diff --git a/RecoPixelVertexing/PixelTriplets/src/classes.h b/RecoPixelVertexing/PixelTriplets/src/classes.h index db84e140b26de..4f495027ac186 100644 --- a/RecoPixelVertexing/PixelTriplets/src/classes.h +++ b/RecoPixelVertexing/PixelTriplets/src/classes.h @@ -1,6 +1,5 @@ #include "RecoPixelVertexing/PixelTriplets/interface/IntermediateHitTriplets.h" #include "DataFormats/Common/interface/Wrapper.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT_test.h" #include From 800e2a6daa26e6b9732481ab97468b420d94e1cd Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 31 Oct 2022 18:14:10 +0100 Subject: [PATCH 082/110] Fixing types, still not compiling --- .../interface/TrackSoAHeterogeneousDevice.h | 2 +- .../plugins/PixelTrackProducerFromSoA.cc | 30 ++++++++----------- .../plugins/PixelTrackSoAFromCUDA.cc | 2 +- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h index 611f98d7d9dae..aaf4035d460e5 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -18,7 +18,7 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection>(S, stream) {} // Copy data from device to host - __host__ void copyToHost(cms::cuda::host::unique_ptr &host_ptr, cudaStream_t stream) { + __host__ void copyToHost(cms::cuda::host::unique_ptr &host_ptr, cudaStream_t stream) const { cudaCheck(cudaMemcpy(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost)); } }; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 5ffa051c27cfc..6a38ba45e96d9 100644 
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -56,7 +56,6 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - //const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; @@ -70,8 +69,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - //tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -156,16 +154,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - //const auto &tsoa = *iEvent.get(tokenTrack_); - auto tsoa = iEvent.get(tokenTrack_); + const auto &tsoa = *iEvent.get(tokenTrack_); //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); // auto const &fit = tsoa.stateAtBS; //auto const &hitIndices = tsoa.view().hitIndices(); //auto nTracks = tsoa.view().nTracks(); - auto const *quality = pixelTrack::utilities::qualityData(tsoa); - auto const hitIndices = tsoa.hitIndices(); - auto nTracks = tsoa.nTracks(); + auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); + auto const hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -175,15 +172,14 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); 
std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { - //return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); - return tsoa[i1].pt() > tsoa[i2].pt(); + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - //auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); - auto nHits = pixelTrack::utilities::nHits(tsoa, it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); + assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) @@ -200,15 +196,13 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - //float chi2 = tsoa.view()[it].chi2(); - //float phi = pixelTrack::utilities::phi(tsoa.view(), it); - float chi2 = tsoa[it].chi2(); - float phi = pixelTrack::utilities::phi(tsoa, it); + float chi2 = tsoa.view()[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - //pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); - pixelTrack::utilities::copyToDense(tsoa, ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 594081963bb90..d06b988d4a5f5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -56,7 +56,7 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = 
iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const tracks_d = ctx.get(inputDataWrapped); // Tracks on device + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device pixelTrack::TrackSoAHost tracks_h(ctx.stream()); tracks_d.copyToHost(tracks_h.buffer(), ctx.stream()); From 8eff092fc3af646ad936c84cef46b7a2bbf6e0dd Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Tue, 1 Nov 2022 10:56:29 +0100 Subject: [PATCH 083/110] Const reference everything --- .../Track/interface/PixelTrackUtilities.h | 16 ++++++++-------- .../plugins/PixelTrackProducerFromSoA.cc | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h index e202dced07307..828a8261ca259 100644 --- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -59,13 +59,13 @@ namespace pixelTrack { return std::copysign(1.f, tracks[i].state()(2)); } - __host__ __device__ inline float phi(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); } + __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); } - __host__ __device__ inline float tip(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); } + __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); } - __host__ __device__ inline float zip(TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); } + __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); } - __host__ __device__ inline bool isTriplet(TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; } + __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; } 
template __host__ __device__ inline void copyFromCircle( @@ -97,7 +97,7 @@ namespace pixelTrack { } template - __host__ __device__ inline void copyToDense(TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) { + __host__ __device__ inline void copyToDense(const TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) { v = tracks[i].state().template cast(); for (int j = 0, ind = 0; j < 5; ++j) { cov(j, j) = tracks[i].covariance()(ind++); @@ -119,14 +119,14 @@ namespace pixelTrack { } return nl; } - __host__ __device__ inline int nHits(TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } + __host__ __device__ inline int nHits(const TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } // Casts quality SoA data (uint8_t) to pixelTrack::Quality. This is required // to use the data as an enum instead of a plain uint8_t - __host__ __device__ inline const Quality *qualityData(TrackSoAConstView &tracks) { + __host__ __device__ inline const Quality *qualityData(const TrackSoAConstView &tracks) { return reinterpret_cast(tracks.quality()); } - __host__ __device__ inline Quality *qualityData(TrackSoAView &tracks) { + __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { return reinterpret_cast(tracks.quality()); } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 6a38ba45e96d9..ec5be3b7f05c7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -154,7 +154,8 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); + //const auto &tsoa = *iEvent.get(tokenTrack_); + auto & tsoa = iEvent.get(tokenTrack_); //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); // auto const &fit = 
tsoa.stateAtBS; From a55739365ba1e681efcf214317d962ebf2acb184 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 1 Nov 2022 12:09:34 +0100 Subject: [PATCH 084/110] Updating RecoTauTag --- .../Track/interface/PixelTrackUtilities.h | 2 +- .../HLTProducers/src/L2TauTagNNProducer.cc | 49 ++++++++++--------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h index 828a8261ca259..4208dfe93f69c 100644 --- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -55,7 +55,7 @@ namespace pixelTrack { using hindex_type = uint32_t; // State at the Beam spot // phi,tip,1/pt,cotan(theta),zip - __host__ __device__ inline float charge(TrackSoAConstView &tracks, int32_t i) { + __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) { return std::copysign(1.f, tracks[i].state()(2)); } diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index 34e04b0f7aedb..db650684e7578 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -45,10 +45,11 @@ #include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include 
"CUDADataFormats/Vertex/interface/ZVertexSoA.h" #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" @@ -179,16 +180,16 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, const ZVertexSoA& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); std::pair impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -208,7 +209,7 @@ class L2TauNNProducer : public edm::stream::EDProducer geometryToken_; const edm::ESGetToken bFieldToken_; const edm::EDGetTokenT pataVerticesToken_; - const edm::EDGetTokenT pataTracksToken_; + const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; const float fractionSumPt2_; @@ -293,7 +294,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd geometryToken_(esConsumes()), bFieldToken_(esConsumes()), pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), - pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), + pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), fractionSumPt2_(cfg.getParameter("fractionSumPt2")), @@ -570,31 +571,31 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { - const auto 
maxTracks = patatracks_tsoa.stride(); + const auto maxTracks = patatracks_tsoa.view().metadata().size(); const int nv = patavtx_soa.nvFinal; trkGood.clear(); trkGood.reserve(maxTracks); vtxGood.clear(); vtxGood.reserve(nv); - auto const* quality = patatracks_tsoa.qualityData(); + auto const* quality = pixelTrack::utilities::qualityData(patatracks_tsoa.view()); // No need to sort either as the algorithms is just using the max (not even the location, just the max value of pt2sum). std::vector pTSquaredSum(nv, 0); std::vector nTrkAssociated(nv, 0); for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) { - auto nHits = patatracks_tsoa.nHits(trk_idx); + auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.view(), trk_idx); if (nHits == 0) { break; } int vtx_ass_to_track = patavtx_soa.idv[trk_idx]; if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { - auto patatrackPt = patatracks_tsoa.pt[trk_idx]; + auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; - if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) { + if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) { patatrackPt = std::min(patatrackPt, trackPtMax_); pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt; } @@ -617,15 +618,15 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, } std::pair L2TauNNProducer::impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi) { - auto const& fit = patatracks_tsoa.stateAtBS; + // auto const& fit = patatracks_tsoa.stateAtBS; /* dxy and dz */ riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, 
ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); float sp = std::sin(patatrackPhi); @@ -650,7 +651,7 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, const ZVertexSoA& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { @@ -675,14 +676,14 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const float tauPhi = allTaus[tau_idx]->phi(); for (const auto it : trkGood) { - const float patatrackPt = patatracks_tsoa.pt[it]; + const float patatrackPt = patatracks_tsoa.const_view()[it].pt(); if (patatrackPt <= 0) continue; - const float patatrackPhi = patatracks_tsoa.phi(it); - const float patatrackEta = patatracks_tsoa.eta(it); - const float patatrackCharge = patatracks_tsoa.charge(it); - const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it); - const auto nHits = patatracks_tsoa.nHits(it); + const float patatrackPhi = pixelTrack::utilities::phi(patatracks_tsoa.const_view(), it); + const float patatrackEta = patatracks_tsoa.const_view()[it].eta(); + const float patatrackCharge = pixelTrack::utilities::charge(patatracks_tsoa.const_view(), it); + const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2(); + const auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.const_view(), it); if (nHits <= 0) continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; @@ -763,7 +764,7 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset const auto eeCal = event.getHandle(eeToken_); const auto hbhe = event.getHandle(hbheToken_); const auto ho = event.getHandle(hoToken_); - const auto& patatracks_SoA = *event.get(pataTracksToken_); + auto& patatracks_SoA = event.get(pataTracksToken_); const auto& vertices_SoA = *event.get(pataVerticesToken_); 
const auto bsHandle = event.getHandle(beamSpotToken_); From 44663213ef8a764b8bba20831a10704d1ce9a9da Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 1 Nov 2022 13:21:48 +0100 Subject: [PATCH 085/110] DQM/MonitorTrackSoA adapted --- .../plugins/SiPixelPhase1MonitorTrackSoA.cc | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc index 622895ba07bcc..b4c996afc7055 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc @@ -21,7 +21,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" // for string manipulations #include @@ -34,7 +35,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; std::string topFolderName_; bool useQualityCut_; pixelTrack::Quality minQuality_; @@ -62,7 +63,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); //"SiPixelHeterogeneous/PixelTrackSoA"; useQualityCut_ = iConfig.getParameter("useQualityCut"); minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality")); @@ 
-78,23 +79,24 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoa = *((tsoaHandle.product())->get()); - auto maxTracks = tsoa.stride(); - auto const* quality = tsoa.qualityData(); + auto& tsoa = *tsoaHandle.product(); + auto maxTracks = tsoa.view().metadata().size(); + auto const* quality = pixelTrack::utilities::qualityData(tsoa.const_view()); int32_t nTracks = 0; int32_t nLooseAndAboveTracks = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - auto nLayers = tsoa.nLayers(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.const_view(), it); + auto nLayers = tsoa.view()[it].nLayers(); if (nHits == 0) break; // this is a guard - float pt = tsoa.pt(it); + float pt = tsoa.view()[it].pt(); if (!(pt > 0.)) continue; // fill the quality for all tracks - pixelTrack::Quality qual = tsoa.quality(it); + // pixelTrack::Quality qual = tsoa.view()[it].quality(); + pixelTrack::Quality qual = quality[it]; hquality->Fill(int(qual)); nTracks++; @@ -102,11 +104,11 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: continue; // fill parameters only for quality >= loose - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); - float zip = tsoa.zip(it); - float eta = tsoa.eta(it); - float tip = tsoa.tip(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa.const_view(), it); + float zip = pixelTrack::utilities::zip(tsoa.const_view(), it); + float eta = tsoa.view()[it].eta(); + float tip = pixelTrack::utilities::tip(tsoa.const_view(), it); hchi2->Fill(chi2); hChi2VsPhi->Fill(phi, chi2); From cf50b7d62d324454bc67d9f12b35294aa4372e60 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 1 Nov 2022 13:39:49 +0100 Subject: [PATCH 086/110] Fixed utilities function calls arguments --- .../plugins/SiPixelPhase1CompareTrackSoA.cc | 94 +++++++++++-------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git 
a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index 7b12f694d4e8c..dedff1f758e8f 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareTrackSoA // Class: SiPixelPhase1CompareTrackSoA // -/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc +/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -20,7 +20,9 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" // for string manipulations #include @@ -71,8 +73,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackCPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; const pixelTrack::Quality minQuality_; @@ -113,8 +115,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) - : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), + 
tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), @@ -138,12 +140,12 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoaCPU = *tsoaHandleCPU->get(); - auto const& tsoaGPU = *tsoaHandleGPU->get(); - auto maxTracksCPU = tsoaCPU.stride(); //this should be same for both? - auto maxTracksGPU = tsoaGPU.stride(); //this should be same for both? - auto const* qualityCPU = tsoaCPU.qualityData(); - auto const* qualityGPU = tsoaGPU.qualityData(); + auto& tsoaCPU = *tsoaHandleCPU.product(); + auto& tsoaGPU = *tsoaHandleGPU.product(); + auto maxTracksCPU = tsoaCPU.view().metadata().size(); //this should be same for both? + auto maxTracksGPU = tsoaGPU.view().metadata().size(); //this should be same for both? + auto const* qualityCPU = pixelTrack::utilities::qualityData(tsoaCPU.view()); + auto const* qualityGPU = pixelTrack::utilities::qualityData(tsoaGPU.view()); int32_t nTracksCPU = 0; int32_t nTracksGPU = 0; int32_t nLooseAndAboveTracksCPU = 0; @@ -153,9 +155,9 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? 
std::vector looseTrkidxGPU; for (int32_t jt = 0; jt < maxTracksGPU; ++jt) { - if (tsoaGPU.nHits(jt) == 0) + if (pixelTrack::utilities::nHits(tsoaGPU.view(), jt) == 0) break; // this is a guard - if (!(tsoaGPU.pt(jt) > 0.)) + if (!(tsoaGPU.view()[jt].pt() > 0.)) continue; nTracksGPU++; if (useQualityCut_ && qualityGPU[jt] < minQuality_) @@ -166,9 +168,18 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Now loop over CPU tracks//nested loop for loose gPU tracks for (int32_t it = 0; it < maxTracksCPU; ++it) { - if (tsoaCPU.nHits(it) == 0) + float chi2CPU = tsoaCPU.view()[it].chi2(); + int nHitsCPU = pixelTrack::utilities::nHits(tsoaCPU.view(), it); + int8_t nLayersCPU = tsoaCPU.view()[it].nLayers(); + float ptCPU = tsoaCPU.view()[it].pt(); + float etaCPU = tsoaCPU.view()[it].eta(); + float phiCPU = pixelTrack::utilities::phi(tsoaCPU.view(), it); + float zipCPU = pixelTrack::utilities::zip(tsoaCPU.view(), it); + float tipCPU = pixelTrack::utilities::tip(tsoaCPU.view(), it); + + if (nHitsCPU == 0) break; // this is a guard - if (!(tsoaCPU.pt(it) > 0.)) + if (!(ptCPU > 0.)) continue; nTracksCPU++; if (useQualityCut_ && qualityCPU[it] < minQuality_) @@ -178,12 +189,10 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: const int32_t notFound = -1; int32_t closestTkidx = notFound; float mindr2 = dr2cut_; - float etacpu = tsoaCPU.eta(it); - float phicpu = tsoaCPU.phi(it); for (auto gid : looseTrkidxGPU) { - float etagpu = tsoaGPU.eta(gid); - float phigpu = tsoaGPU.phi(gid); - float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu); + float etaGPU = tsoaGPU.view()[gid].eta(); + float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), gid); + float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU); if (dr2 > dr2cut_) continue; // this is arbitrary if (mindr2 > dr2) { @@ -192,27 +201,36 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: } } - 
hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it)); //all CPU tk - hphi_z_tkAllCPU_->Fill(phicpu, tsoaCPU.zip(it)); + hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU); //all CPU tk + hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU); if (closestTkidx == notFound) continue; nLooseAndAboveTracksCPU_matchedGPU++; - hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx)); - hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx)); - hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx)); - hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx)); - hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx)); - hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx)); - htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx)); - hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx)); - hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx)); - hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx))); - hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx)); - hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it)); //matched to gpu - hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it)); + float chi2GPU = tsoaGPU.view()[closestTkidx].chi2(); + int nHitsGPU = pixelTrack::utilities::nHits(tsoaGPU.view(), closestTkidx); + int8_t nLayersGPU = tsoaGPU.view()[closestTkidx].nLayers(); + float ptGPU = tsoaGPU.view()[closestTkidx].pt(); + float etaGPU = tsoaGPU.view()[closestTkidx].eta(); + float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), closestTkidx); + float zipGPU = pixelTrack::utilities::zip(tsoaGPU.view(), closestTkidx); + float tipGPU = pixelTrack::utilities::tip(tsoaGPU.view(), closestTkidx); + + hchi2_->Fill(chi2CPU, chi2GPU); + hnHits_->Fill(nHitsCPU, nHitsGPU); + hnLayers_->Fill(nLayersCPU, nLayersGPU); + hpt_->Fill(ptCPU, ptGPU); + hptLogLog_->Fill(ptCPU, ptGPU); + heta_->Fill(etaCPU, etaGPU); + hphi_->Fill(phiCPU, phiGPU); +
hz_->Fill(zipCPU, zipGPU); + htip_->Fill(tipCPU, tipGPU); + hptdiffMatched_->Fill(ptCPU - ptGPU); + hetadiffMatched_->Fill(etaCPU - etaGPU); + hphidiffMatched_->Fill(reco::deltaPhi(phiCPU, phiGPU)); + hzdiffMatched_->Fill(zipCPU - zipGPU); + hpt_eta_tkAllCPUMatched_->Fill(etaCPU, ptCPU); //matched to gpu + hphi_z_tkAllCPUMatched_->Fill(phiCPU, zipCPU); } hnTracks_->Fill(nTracksCPU, nTracksGPU); hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU); From ebb47181ea6ebfffd78312a44ac9c708c0142007 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 1 Nov 2022 13:55:41 +0100 Subject: [PATCH 087/110] SeedProducer adapted to new Data format --- .../plugins/SeedProducerFromSoA.cc | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 0e5823fc46c46..a5cc27c338ebe 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -1,4 +1,6 @@ -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -45,7 +47,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { // Event data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; // Event setup tokens const edm::ESGetToken idealMagneticFieldToken_; const edm::ESGetToken trackerDigiGeometryToken_; @@ -55,7 +57,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { 
SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("src"))), + tokenTrack_(consumes(iConfig.getParameter("src"))), idealMagneticFieldToken_(esConsumes()), trackerDigiGeometryToken_(esConsumes()), trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))), @@ -89,16 +91,16 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - const auto& tsoa = *(iEvent.get(tokenTrack_)); + auto & tsoa = iEvent.get(tokenTrack_); - auto const* quality = tsoa.qualityData(); - auto const& fit = tsoa.stateAtBS; - auto const& detIndices = tsoa.detIndices; - auto maxTracks = tsoa.stride(); + auto const* quality = pixelTrack::utilities::qualityData(tsoa.view()); + //auto const& fit = tsoa.stateAtBS; + auto const& detIndices = tsoa.view().detIndices(); + auto maxTracks = tsoa.view().metadata().size(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(),it); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... @@ -120,11 +122,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // mind: this values are respect the beamspot! 
- float phi = tsoa.phi(it); + float phi = pixelTrack::utilities::phi(tsoa.view(),it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(),ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); From 9ace379c4e88131602f962153830d7e9cefad519 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Tue, 1 Nov 2022 16:20:14 +0100 Subject: [PATCH 088/110] Fix tracks SoA instantiation on host --- .../PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 37454b9065f0a..9e778bae66158 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -222,7 +222,7 @@ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingR } pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - pixelTrack::TrackSoAHost tracks; + pixelTrack::TrackSoAHost tracks(nullptr); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); From 33a9741ffdfae24d42c50e38d8a29da362a2a8c2 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Wed, 2 Nov 2022 17:04:54 +0100 Subject: [PATCH 089/110] Fixed segfault due to using local variable instead of the class attribute --- .../Track/interface/TrackSoAHeterogeneousDevice.h | 3 ++- .../plugins/PixelTrackSoAFromCUDA.cc | 14 ++++++-------- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 9 +++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h 
b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h index aaf4035d460e5..b79f8d959720c 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -19,7 +19,8 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection &host_ptr, cudaStream_t stream) const { - cudaCheck(cudaMemcpy(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaGetLastError()); } }; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index d06b988d4a5f5..283e5b0292464 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -17,6 +17,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE @@ -56,10 +57,9 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device - - pixelTrack::TrackSoAHost tracks_h(ctx.stream()); - tracks_d.copyToHost(tracks_h.buffer(), ctx.stream()); + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h = pixelTrack::TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + 
tracks_d.copyToHost(tracks_h.buffer(), ctx.stream()); // Copy data from Device to Host } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { @@ -67,6 +67,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i auto maxTracks = tracks_h.view().metadata().size(); auto nTracks = tracks_h.view().nTracks(); assert(nTracks < maxTracks); + if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 << " candidates"; @@ -86,11 +87,8 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i } assert(nTracks == nt); #endif - // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(tracks_h)); //, std::move(ret)); // view - - //assert(!soa_); + iEvent.emplace(tokenSOA_, std::move(tracks_h)); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index afe4aaa11f70b..f8657cf7bae89 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -380,8 +380,8 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const * } } -__global__ void kernel_countMultiplicity(TkSoAConstView tracks_view, - caConstants::TupleMultiplicity *tupleMultiplicity) { +// TODO: change arg type to TkSoAConstview +__global__ void kernel_countMultiplicity(TkSoAView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = tracks_view.hitIndices().size(it); @@ -397,7 +397,8 @@ __global__ void kernel_countMultiplicity(TkSoAConstView tracks_view, } } -__global__ void 
kernel_fillMultiplicity(TkSoAConstView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { +// TODO: change arg type to TkSoAConstview +__global__ void kernel_fillMultiplicity(TkSoAView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = tracks_view.hitIndices().size(it); @@ -856,7 +857,7 @@ __global__ void kernel_simpleTripletCleaner( } __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - TkSoAConstView tracks_view, + TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, From afdec7a7e2d9acfba46d8421866136607028fbfe Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 3 Nov 2022 10:37:38 +0100 Subject: [PATCH 090/110] Cleanup, removed unused includes --- .../Track/interface/PixelTrackHeterogeneous.h | 4 - .../Track/interface/TrackSoAHeterogeneousT.h | 107 ------------------ .../plugins/PixelTrackProducerFromSoA.cc | 10 +- .../plugins/BrokenLineFitOnGPU.h | 1 - .../plugins/CAHitNtupletGeneratorKernels.cc | 7 -- .../CAHitNtupletGeneratorKernelsImpl.h | 2 - .../plugins/CAHitNtupletGeneratorOnGPU.cc | 4 - .../plugins/PixelVertexProducerCUDA.cc | 4 +- .../plugins/gpuVertexFinder.cc | 7 +- .../plugins/gpuVertexFinder.h | 1 - 10 files changed, 6 insertions(+), 141 deletions(-) delete mode 100644 CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h delete mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h deleted file mode 100644 index 73ec80e6322a2..0000000000000 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef 
CUDADataFormats_Track_PixelTrackHeterogeneous_h -#define CUDADataFormats_Track_PixelTrackHeterogeneous_h - -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h deleted file mode 100644 index 356ea3eddeb7f..0000000000000 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -namespace pixelTrack { - enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; - constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; - const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; - inline Quality qualityByName(std::string const &name) { - auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; - return static_cast(qp); - } -} // namespace pixelTrack - -template -class TrackSoAHeterogeneousT { -public: - static constexpr int32_t stride() { return S; } - - using Quality = pixelTrack::Quality; - using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... 
-private: - eigenSoA::ScalarSoA quality_; - -public: - constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } - constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - eigenSoA::ScalarSoA nLayers; - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = phase1PixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = phase1PixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoAT stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - -private: - int nTracks_; -}; - -namespace pixelTrack { - -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif - - using TrackSoA = TrackSoAHeterogeneousT; - using TrajectoryState = TrajectoryStateSoAT; - using HitContainer = TrackSoA::HitContainer; - -} // namespace pixelTrack - -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index ec5be3b7f05c7..36d3dd8c3dcc7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -106,7 +106,6 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const { - // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity }; reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality, reco::TrackBase::undefQuality, reco::TrackBase::discarded, @@ -154,13 +153,7 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - //const auto &tsoa = *iEvent.get(tokenTrack_); - auto & tsoa = iEvent.get(tokenTrack_); - - //auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); - // auto const &fit = tsoa.stateAtBS; - //auto const &hitIndices = tsoa.view().hitIndices(); - //auto nTracks = tsoa.view().nTracks(); + auto &tsoa = iEvent.get(tokenTrack_); auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); auto 
const hitIndices = tsoa.view().hitIndices(); auto nTracks = tsoa.view().nTracks(); @@ -246,7 +239,6 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // filter??? tracks.emplace_back(track.release(), hits); } - // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; // store tracks storeTracks(iEvent, tracks, httopo); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index aefde7ac602b1..2b2d93cf7415a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -8,7 +8,6 @@ #include -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index cdefeab9e36b7..65a3f3a8dff4c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -81,12 +81,6 @@ template <> void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoAView tracks_view, cudaStream_t cudaStream) { - // auto *tuples_d = tracks_d->view().hitIndices(); - // auto *detId_d = tracks_d->view().detIndices(); - // auto *quality_d = tracks_d->qualityData(); - - // assert(tuples_d && quality_d); // TODO Find equivalent for View - // zero tuples cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); @@ -149,7 +143,6 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, cudaStream_t cudaStream) { int32_t nhits = hh.nHits(); - // 
auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // classify tracks based on kinematics kernel_classifyTracks(tracks_view, quality_d, params_.cuts_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index f8657cf7bae89..4f2272db13354 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -630,10 +630,8 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, HitContainer const *__restrict__ tuples, Quality *__restrict__ quality, bool dupPassThrough) { - // constexpr auto bad = (uint8_t)pixelTrack::Quality::bad; constexpr auto dup = pixelTrack::Quality::dup; constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = (uint8_t)pixelTrack::Quality::strict; // quality to mark rejected auto const reject = dupPassThrough ? 
loose : dup; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 9e778bae66158..4893ebdcc828f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -186,10 +186,6 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } } -/*PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));*/ pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 7d8ea3485c447..9dd8a016dc02d 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -111,12 +111,11 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto & tracks = iEvent.get(tokenCPUTrack_); + auto& tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE auto maxTracks = tracks.view().metadata().size(); - // std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { @@ -126,7 +125,6 @@ void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, break; // this is a guard: maybe we need to move to nTracks... 
nt++; } - // std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 66de3fe8c99f7..c92060f8ba2cc 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -19,7 +19,6 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - //using TkSoAView = pixelTrack::TrackSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; __global__ void loadTracks(TkSoAConstView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { @@ -96,7 +95,10 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const { + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, + TkSoAConstView tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE @@ -108,7 +110,6 @@ namespace gpuVertexFinder { #endif // PIXVERTEX_DEBUG_PRODUCE ZVertexHeterogeneous vertices(std::make_unique()); #endif - // assert(tksoa); auto* soa = vertices.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 98bb9d75530d4..8c542607812b9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -5,7 +5,6 @@ #include #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -//#include 
"CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" namespace gpuVertexFinder { From 182ffb802d8452c2443b75f7228ee9ae302ebf46 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Thu, 3 Nov 2022 14:45:48 +0100 Subject: [PATCH 091/110] Changing dataformats to ZVertex{Device/Host} --- .../interface/ZVertexSoAHeterogeneousDevice.h | 33 +++++++++++++++ .../interface/ZVertexSoAHeterogeneousHost.h | 26 ++++++++++++ .../Vertex/interface/ZVertexUtilities.h | 41 +++++++++++++++++++ CUDADataFormats/Vertex/src/classes.h | 4 +- CUDADataFormats/Vertex/src/classes_def.xml | 10 +++-- 5 files changed, 109 insertions(+), 5 deletions(-) create mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h create mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h create mode 100644 CUDADataFormats/Vertex/interface/ZVertexUtilities.h diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..47cb8af2b4cc6 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -0,0 +1,33 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + ZVertexSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} + + // Copy 
data from device to host + __host__ void copyToHost(cms::cuda::host::unique_ptr &host_ptr, cudaStream_t stream) const { + cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaGetLastError()); + } +}; + +namespace ZVertex { + + using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; + +} // namespace ZVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..e751e2da8f5de --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + ZVertexSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace ZVertex { + + using ZVertexSoAHost = ZVertexSoAHeterogeneousHost; + +} // namespace ZVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h new file mode 100644 index 0000000000000..4c5dece118f50 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -0,0 +1,41 @@ +#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h +#define CUDADataFormats_Vertex_ZVertexUtilities_h + +//#include +//#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + +// Previous TrajectoryStateSoAT class methods. +// They operate on View and ConstView of the TrackSoA. +namespace ZVertex { + namespace utilities { + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + + } // namespace utilities +} // namespace pixelTrack + +namespace ZVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + +} // namespace pixelTrack + +#endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 7931beaa8f4bd..6f087ecb2cf46 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,9 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git 
a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index ea633080af9af..58616cbb534fa 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,6 +1,8 @@ - - - - + + + + + + From 8cb5f20a618077b43608e8035c259fbb3281bec1 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Thu, 3 Nov 2022 15:03:31 +0100 Subject: [PATCH 092/110] Fixing headers in ZVertexUtilities.h --- CUDADataFormats/Vertex/interface/ZVertexUtilities.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h index 4c5dece118f50..05ed34e2e8d69 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -1,9 +1,7 @@ #ifndef CUDADataFormats_Vertex_ZVertexUtilities_h #define CUDADataFormats_Vertex_ZVertexUtilities_h -//#include -//#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include #include "DataFormats/SoATemplate/interface/SoALayout.h" GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, @@ -16,8 +14,8 @@ GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, SOA_COLUMN(uint16_t, sortInd), SOA_SCALAR(uint32_t, nvFinal)) -// Previous TrajectoryStateSoAT class methods. -// They operate on View and ConstView of the TrackSoA. +// Previous ZVertexSoA class methods. +// They operate on View and ConstView of the ZVertexSoA. 
namespace ZVertex { namespace utilities { using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; @@ -28,7 +26,7 @@ namespace ZVertex { __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } } // namespace utilities -} // namespace pixelTrack +} // namespace ZVertex namespace ZVertex { // Common types for both Host and Device code @@ -36,6 +34,6 @@ namespace ZVertex { using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; -} // namespace pixelTrack +} // namespace ZVertex #endif From 107cc41317c237fb15e611146018f54a4d1f3eab Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 3 Nov 2022 16:04:06 +0100 Subject: [PATCH 093/110] Changed input to correct type --- .../plugins/SiPixelPhase1CompareTrackSoA.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index dedff1f758e8f..36c045582c942 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -74,7 +74,7 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { private: const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; const pixelTrack::Quality minQuality_; @@ -113,10 +113,12 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // // constructors // +// Note that the GPU TrackSoA is also of type TrackSoAHost, as the data have +// been copied from Device to Host SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - 
tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), From b683719c459eeb34caf2c39edb85b2a3a800735a Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 3 Nov 2022 16:15:33 +0100 Subject: [PATCH 094/110] Adapted DQM modules to new Vertex type --- .../plugins/SiPixelPhase1CompareVertexSoA.cc | 43 ++++++++++--------- .../plugins/SiPixelPhase1MonitorVertexSoA.cc | 24 +++++------ 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc index 0113ea50973d8..68b553c45a48a 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc @@ -18,7 +18,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { @@ -31,8 +31,9 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexCPU_; + // Note that this has been copied from device to host, hence is a HostCollection + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string 
topFolderName_; const float dzCut_; @@ -54,8 +55,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} @@ -64,8 +65,8 @@ SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::Paramete // -- Analyze // void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - const auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_); - const auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_); + auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_); + auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_); if (not vsoaHandleCPU or not vsoaHandleGPU) { edm::LogWarning out("SiPixelPhase1CompareTrackSoA"); if (not vsoaHandleCPU) { @@ -78,10 +79,10 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoaCPU = *vsoaHandleCPU->get(); - int nVerticesCPU = vsoaCPU.nvFinal; - auto const& vsoaGPU = *vsoaHandleGPU->get(); - int nVerticesGPU = vsoaGPU.nvFinal; + auto& vsoaCPU = *vsoaHandleCPU->get(); + int nVerticesCPU = vsoaCPU.view().nvFinal(); + auto& vsoaGPU = *vsoaHandleGPU->get(); + int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; @@ -97,22 +98,22 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: } for (int ivc = 0; ivc < nVerticesCPU; ivc++) { - auto 
sic = vsoaCPU.sortInd[ivc]; - auto zc = vsoaCPU.zv[sic]; + auto sic = vsoaCPU.view()[ivc].sortInd(); + auto zc = vsoaCPU.view()[sic].zv(); auto xc = x0 + dxdz * zc; auto yc = y0 + dydz * zc; zc += z0; - auto ndofCPU = vsoaCPU.ndof[sic]; - auto chi2CPU = vsoaCPU.chi2[sic]; + auto ndofCPU = vsoaCPU.view()[sic].ndof(); + auto chi2CPU = vsoaCPU.view()[sic].chi2(); const int32_t notFound = -1; int32_t closestVtxidx = notFound; float mindz = dzCut_; for (int ivg = 0; ivg < nVerticesGPU; ivg++) { - auto sig = vsoaGPU.sortInd[ivg]; - auto zgc = vsoaGPU.zv[sig] + z0; + auto sig = vsoaGPU.view()[ivg].sortInd(); + auto zgc = vsoaGPU.view()[sig].zv() + z0; auto zDist = std::abs(zc - zgc); //insert some matching condition if (zDist > dzCut_) @@ -125,12 +126,12 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: if (closestVtxidx == notFound) continue; - auto zg = vsoaGPU.zv[closestVtxidx]; + auto zg = vsoaGPU.view()[closestVtxidx].zv(); auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofGPU = vsoaGPU.ndof[closestVtxidx]; - auto chi2GPU = vsoaGPU.chi2[closestVtxidx]; + auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof(); + auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); hy_->Fill(yc - y0, yg - y0); @@ -140,7 +141,7 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: hzdiff_->Fill(zc - zg); hchi2_->Fill(chi2CPU, chi2GPU); hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU); - hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]); + hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2()); hntrks_->Fill(ndofCPU + 1, ndofGPU + 1); } hnVertex_->Fill(nVerticesCPU, nVerticesGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index af6c240a69172..23e93816981b3 100644 --- 
a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { @@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } @@ -61,14 +61,14 @@ SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::Paramete // -- Analyze // void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - const auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_); + auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_); if (!vsoaHandle.isValid()) { edm::LogWarning("SiPixelPhase1MonitorTrackSoA") << "No Vertex SoA found \n returning!" 
<< std::endl; return; } - auto const& vsoa = *((vsoaHandle.product())->get()); - int nVertices = vsoa.nvFinal; + auto& vsoa = *((vsoaHandle.product())->get()); + int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -82,18 +82,18 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: dydz = bs.dydz(); } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.sortInd[iv]; - auto z = vsoa.zv[si]; + auto si = vsoa.view()[iv].sortInd(); + auto z = vsoa.view()[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.ndof[si]; - hchi2->Fill(vsoa.chi2[si]); - hchi2oNdof->Fill(vsoa.chi2[si] / ndof); - hptv2->Fill(vsoa.ptv2[si]); + auto ndof = vsoa.view()[si].ndof(); + hchi2->Fill(vsoa.view()[si].chi2()); + hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); + hptv2->Fill(vsoa.view()[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); From 52ca1b07c6ce5184ea4a3be1da5a8a83ee634686 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 3 Nov 2022 16:27:44 +0100 Subject: [PATCH 095/110] Use alias from namespace --- .../plugins/SiPixelPhase1CompareVertexSoA.cc | 8 ++++---- .../plugins/SiPixelPhase1MonitorVertexSoA.cc | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc index 68b553c45a48a..d14aba06019bf 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc @@ -31,9 +31,9 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; + const 
edm::EDGetTokenT tokenSoAVertexCPU_; // Note that this has been copied from device to host, hence is a HostCollection - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string topFolderName_; const float dzCut_; @@ -55,8 +55,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index 23e93816981b3..914be969a9ff5 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = 
consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } From 91752db111d55964270713ac713d7ae576d6ae2e Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Thu, 3 Nov 2022 16:43:26 +0100 Subject: [PATCH 096/110] Fixed handle and instance creations --- .../plugins/SiPixelPhase1CompareVertexSoA.cc | 8 ++++---- .../plugins/SiPixelPhase1MonitorVertexSoA.cc | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc index d14aba06019bf..9172824631da2 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc @@ -65,8 +65,8 @@ SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::Paramete // -- Analyze // void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_); - auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_); + const auto& vsoaHandleCPU = iEvent.getHandle(tokenSoAVertexCPU_); + const auto& vsoaHandleGPU = iEvent.getHandle(tokenSoAVertexGPU_); if (not vsoaHandleCPU or not vsoaHandleGPU) { edm::LogWarning out("SiPixelPhase1CompareTrackSoA"); if (not vsoaHandleCPU) { @@ -79,9 +79,9 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto& vsoaCPU = *vsoaHandleCPU->get(); + auto& vsoaCPU = *vsoaHandleCPU; int nVerticesCPU = vsoaCPU.view().nvFinal(); - auto& vsoaGPU = *vsoaHandleGPU->get(); + auto& vsoaGPU = *vsoaHandleGPU; int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index 
914be969a9ff5..27e0df36a17a4 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { @@ -61,13 +61,13 @@ SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::Paramete // -- Analyze // void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_); + const auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_); if (!vsoaHandle.isValid()) { edm::LogWarning("SiPixelPhase1MonitorTrackSoA") << "No Vertex SoA found \n returning!" 
<< std::endl; return; } - auto& vsoa = *((vsoaHandle.product())->get()); + auto& vsoa = *vsoaHandle.product(); int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; From 1c1028dc7c78a44d85421990529a776cd90fccae Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Thu, 3 Nov 2022 17:18:20 +0100 Subject: [PATCH 097/110] Updating dataformats in vertexing to ZVertex{Device/Host} --- .../plugins/PixelTrackDumpCUDA.cc | 19 +++++++----- .../plugins/PixelVertexProducerCUDA.cc | 10 ++++--- .../plugins/PixelVertexProducerFromSoA.cc | 21 ++++++------- .../plugins/PixelVertexSoAFromCUDA.cc | 23 +++++++------- .../plugins/gpuClusterTracksByDensity.h | 17 ++++++----- .../plugins/gpuClusterTracksDBSCAN.h | 15 +++++----- .../plugins/gpuClusterTracksIterative.h | 15 +++++----- .../plugins/gpuFitVertices.h | 19 ++++++------ .../PixelVertexFinding/plugins/gpuSortByPt2.h | 14 ++++----- .../plugins/gpuSplitVertices.h | 19 ++++++------ .../plugins/gpuVertexFinder.cc | 30 +++++++++++-------- .../plugins/gpuVertexFinder.h | 21 +++++++------ 12 files changed, 122 insertions(+), 101 deletions(-) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index f97dfecfff370..a1acc6376e111 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -4,7 +4,9 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include 
"CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/Event.h" @@ -33,11 +35,11 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { const bool m_onGPU; // GPU edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT> tokenGPUVertex_; // CPU edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; }; PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) @@ -46,10 +48,10 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) tokenGPUTrack_ = consumes>(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + consumes>(iConfig.getParameter("pixelVertexSrc")); } else { tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -74,15 +76,16 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, assert(tsoa); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); - auto const* vsoa = vertices.get(); + //auto const* vsoa = vertices.get(); + auto const* vsoa = &vertices; assert(vsoa); } else { auto const& tsoa = iEvent.get(tokenSoATrack_); assert(tsoa.buffer()); - auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); - assert(vsoa); + auto const& vsoa = iEvent.get(tokenSoAVertex_); + assert(vsoa.buffer()); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 9dd8a016dc02d..45d1a9d52d99e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ 
-18,6 +18,8 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "gpuVertexFinder.h" @@ -38,9 +40,9 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; edm::EDGetTokenT> tokenGPUTrack_; - edm::EDPutTokenT tokenGPUVertex_; + edm::EDPutTokenT> tokenGPUVertex_; edm::EDGetTokenT tokenCPUTrack_; - edm::EDPutTokenT tokenCPUVertex_; + edm::EDPutTokenT tokenCPUVertex_; const gpuVertexFinder::Producer gpuAlgo_; @@ -65,10 +67,10 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) if (onGPU_) { tokenGPUTrack_ = consumes>(conf.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = produces(); + tokenGPUVertex_ = produces>(); } else { tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); - tokenCPUVertex_ = produces(); + tokenCPUVertex_ = produces(); } } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 029c619b42e58..61ec3f9a6a5be 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/OrphanHandle.h" #include "DataFormats/TrackReco/interface/Track.h" @@ -35,14 +36,14 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { 
private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenVertex_; edm::EDGetTokenT tokenBeamSpot_; edm::EDGetTokenT tokenTracks_; edm::EDGetTokenT tokenIndToEdm_; }; PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) - : tokenVertex_(consumes(conf.getParameter("src"))), + : tokenVertex_(consumes(conf.getParameter("src"))), tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), tokenTracks_(consumes(conf.getParameter("TrackCollection"))), tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { @@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv dydz = bs.dydz(); } - auto const &soa = *(iEvent.get(tokenVertex_).get()); + auto const &soa = iEvent.get(tokenVertex_); - int nv = soa.nvFinal; + int nv = soa.view().nvFinal(); #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "converting " << nv << " vertices " @@ -92,20 +93,20 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv std::set uind; // for verifing index consistency for (int j = nv - 1; j >= 0; --j) { - auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... assert(i < nv); uind.insert(i); assert(itrk.empty()); - auto z = soa.zv[i]; + auto z = soa.view()[i].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; reco::Vertex::Error err; - err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) = 1.f / soa.view()[i].wv(); err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) 
for (auto k = 0U; k < indToEdm.size(); ++k) { - if (soa.idv[k] == int16_t(i)) + if (soa.view()[k].idv() == int16_t(i)) itrk.push_back(k); } auto nt = itrk.size(); @@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv itrk.clear(); continue; } // remove outliers - (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt); + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); auto &v = (*vertexes).back(); v.reserve(itrk.size()); for (auto it : itrk) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index dc125878b1058..ef97c9a2b6ea7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -2,7 +2,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + ZVertex::ZVertexSoAHost zvertex_h; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - 
tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -50,16 +51,16 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - m_soa = inputData.toHostAsync(ctx.stream()); + auto const& zvertex_d = ctx.get(inputDataWrapped); + zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); + zvertex_d.copyToHost(zvertex_h.buffer(), ctx.stream()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // No copies.... 
- iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(zvertex_h)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..f920586117078 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -17,7 +17,7 @@ namespace gpuVertexFinder { // // based on Rodrighez&Laio algo // - __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, + __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster @@ -32,20 +32,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; + auto& __restrict__ data = pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; float const* __restrict__ zt = ws.zt; float const* __restrict__ ezt2 = ws.ezt2; - uint32_t& nvFinal = data.nvFinal; + uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate; uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv; - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -63,7 +64,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); @@ -197,7 
+198,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -219,7 +220,7 @@ namespace gpuVertexFinder { printf("found %d proto vertices\n", foundClusters); } - __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, + __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..0476cfbae5fef 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -14,7 +14,7 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have - __global__ void clusterTracksDBSCAN(ZVertices* pdata, + __global__ void clusterTracksDBSCAN(VtxSoAView pdata, WorkSpace* pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster @@ -28,20 +28,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; + auto& __restrict__ data = pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; float const* __restrict__ zt = ws.zt; float const* __restrict__ ezt2 = ws.ezt2; - uint32_t& nvFinal = data.nvFinal; + uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate; uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv; - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -59,7 +60,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -214,7 +215,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..230405c47366a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -14,7 +14,7 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... // enough for <10K tracks we have - __global__ void clusterTracksIterative(ZVertices* pdata, + __global__ void clusterTracksIterative(VtxSoAView pdata, WorkSpace* pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster @@ -28,20 +28,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; + auto& __restrict__ data = pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; float const* __restrict__ zt = ws.zt; float const* __restrict__ ezt2 = ws.ezt2; - uint32_t& nvFinal = data.nvFinal; + uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate; uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv; - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -59,7 +60,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -185,7 +186,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // 
propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..51364e78ee92e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -12,27 +12,28 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void fitVertices(ZVertices* pdata, + __device__ __forceinline__ void fitVertices(VtxSoAView pdata, WorkSpace* pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; + auto& __restrict__ data = pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; float const* __restrict__ zt = ws.zt; float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate; - int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv; - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); assert(nvFinal <= nvIntermediate); @@ -101,7 +102,7 @@ namespace gpuVertexFinder { printf("and %d noise\n", noise); } - __global__ void fitVerticesKernel(ZVertices* pdata, + __global__ void fitVerticesKernel(VtxSoAView pdata, WorkSpace* pws, float chi2Max // for outlier rejection ) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..c705fc1f4065e 100644 --- 
a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,16 +15,16 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { - auto& __restrict__ data = *pdata; + __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WorkSpace* pws) { + auto& __restrict__ data = pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; float const* __restrict__ ptt2 = ws.ptt2; - uint32_t const& nvFinal = data.nvFinal; + uint32_t const& nvFinal = data.nvFinal(); int32_t const* __restrict__ iv = ws.iv; - float* __restrict__ ptv2 = data.ptv2; - uint16_t* __restrict__ sortInd = data.sortInd; + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); // if (threadIdx.x == 0) // printf("sorting %d vertices\n",nvFinal); @@ -34,7 +34,7 @@ namespace gpuVertexFinder { // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data.idv[ws.itrk[i]] = iv[i]; + data[ws.itrk[i]].idv() = iv[i]; } // can be done asynchronoisly at the end of previous event @@ -66,7 +66,7 @@ namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void sortByPt2Kernel(VtxSoAView pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..ad72c489ed67e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,23 +12,24 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WorkSpace* pws, float maxChi2) { constexpr bool verbose = 
false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; + auto& __restrict__ data = pdata; auto& __restrict__ ws = *pws; auto nt = ws.ntrks; float const* __restrict__ zt = ws.zt; float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float const* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); - int32_t const* __restrict__ nn = data.ndof; + int32_t const* __restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv; - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); // one vertex per block @@ -130,7 +131,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WorkSpace* pws, float maxChi2) { splitVertices(pdata, pws, maxChi2); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index c92060f8ba2cc..f8755996c3980 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,5 +1,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include //TODO: understand why this is needed + #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" #include "gpuClusterTracksIterative.h" @@ -8,6 +10,7 @@ #include "gpuSplitVertices.h" #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #undef PIXVERTEX_DEBUG_PRODUCE @@ -20,9 +23,11 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this 
constexpr float maxChi2ForSplit = 9.f; using TkSoAConstView = pixelTrack::TrackSoAConstView; + using VtxSoAView = ZVertex::ZVertexSoAView; - __global__ void loadTracks(TkSoAConstView tracks_view, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { - assert(soa); + __global__ void loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WorkSpace* pws, float ptMin, float ptMax) { + //TODO: check if there is a way to assert this + //assert(soa); auto const* quality = pixelTrack::utilities::qualityData(tracks_view); auto first = blockIdx.x * blockDim.x + threadIdx.x; @@ -31,7 +36,7 @@ namespace gpuVertexFinder { assert(nHits >= 3); // initialize soa... - soa->idv[idx] = -1; + soa[idx].idv() = -1; if (pixelTrack::utilities::isTriplet(tracks_view, idx)) continue; // no triplets @@ -57,7 +62,7 @@ namespace gpuVertexFinder { // #define THREE_KERNELS #ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + __global__ void vertexFinderOneKernel(VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster @@ -75,7 +80,7 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, + __global__ void vertexFinderKernel1(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster @@ -87,7 +92,7 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); @@ -95,23 +100,24 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous 
Producer::makeAsync(cudaStream_t stream, + ZVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + ZVertex::ZVertexSoADevice vertices(stream); #else - ZVertexHeterogeneous Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const { + ZVertex::ZVertexSoAHost Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertex::ZVertexSoAHost vertices; #endif - auto* soa = vertices.get(); - assert(soa); + auto soa = vertices.view(); + //TODO: check if there is a way to assert this + //assert(soa); #ifdef __CUDACC__ auto ws_d = cms::cuda::make_device_unique(stream); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 8c542607812b9..b8a81ea04a03d 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,18 +4,21 @@ #include #include -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" namespace gpuVertexFinder { - using ZVertices = ZVertexSoA; + using VtxSoAView = ZVertex::ZVertexSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; // workspace 
used in the vertex reco algos struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; + static constexpr uint32_t MAXTRACKS = ZVertex::utilities::MAXTRACKS; + static constexpr uint32_t MAXVTX = ZVertex::utilities::MAXVTX; uint32_t ntrks; // number of "selected tracks" uint16_t itrk[MAXTRACKS]; // index of original track @@ -33,14 +36,14 @@ namespace gpuVertexFinder { } }; - __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { - pdata->init(); + __global__ void init(VtxSoAView pdata, WorkSpace* pws) { + ZVertex::utilities::init(pdata); pws->init(); } class Producer { public: - using ZVertices = ZVertexSoA; + using VtxSoAView = ZVertex::ZVertexSoAView; using WorkSpace = gpuVertexFinder::WorkSpace; Producer(bool oneKernel, @@ -63,8 +66,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoAConstView tracks_view, float ptMin, float ptMax) const; + ZVertex::ZVertexSoADevice makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const; + ZVertex::ZVertexSoAHost make(TkSoAConstView tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; From 152bb647d4539b9ef6b2bb7dceca7ee66b8f5c2e Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 4 Nov 2022 12:06:35 +0100 Subject: [PATCH 098/110] Removing copyToHost and --- .../Track/interface/TrackSoAHeterogeneousDevice.h | 7 +------ .../Track/interface/TrackSoAHeterogeneousHost.h | 2 +- CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp | 6 ++++-- .../Vertex/interface/ZVertexSoAHeterogeneousDevice.h | 7 +------ .../Vertex/interface/ZVertexSoAHeterogeneousHost.h | 2 +- .../PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc | 3 ++- .../plugins/CAHitNtupletGeneratorKernelsImpl.h | 3 +-- .../PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc 
| 7 ++++--- .../PixelVertexFinding/plugins/gpuVertexFinder.cc | 6 ++---- .../PixelVertexFinding/plugins/gpuVertexFinder.h | 3 +-- 10 files changed, 18 insertions(+), 28 deletions(-) diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h index b79f8d959720c..fb1c45f331d19 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H #define CUDADataFormats_Track_TrackHeterogeneousDevice_H -#include +#include #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" @@ -17,11 +17,6 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection>(S, stream) {} - // Copy data from device to host - __host__ void copyToHost(cms::cuda::host::unique_ptr &host_ptr, cudaStream_t stream) const { - cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaGetLastError()); - } }; namespace pixelTrack { diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h index a4b18134066a3..70427f2bfd559 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H #define CUDADataFormats_Track_TrackHeterogeneousHost_H -#include +#include #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Common/interface/PortableHostCollection.h" diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 0ad6863d4f8c7..0647296b9ef40 100644 --- 
a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -13,7 +13,7 @@ the same Layout to access the data on host and print it. */ -#include +#include #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" @@ -42,7 +42,9 @@ int main() { // Instantate tracks on host. This is where the data will be // copied to from device. pixelTrack::TrackSoAHost tracks_h(stream); - tracks_d.copyToHost(tracks_h.buffer(), stream); + //tracks_d.copyToHost(tracks_h.buffer(), stream); + cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaGetLastError()); // Print results std::cout << "pt" diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h index 47cb8af2b4cc6..d1ff67b042701 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H #define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H -#include +#include #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" @@ -17,11 +17,6 @@ class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) : PortableDeviceCollection>(S, stream) {} - // Copy data from device to host - __host__ void copyToHost(cms::cuda::host::unique_ptr &host_ptr, cudaStream_t stream) const { - cudaCheck(cudaMemcpyAsync(host_ptr.get(), const_buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream)); - 
cudaCheck(cudaGetLastError()); - } }; namespace ZVertex { diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h index e751e2da8f5de..4867c49d15bab 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H #define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H -#include +#include #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #include "CUDADataFormats/Common/interface/PortableHostCollection.h" diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 283e5b0292464..1dadeb9d0dcc1 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -59,7 +59,8 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device tracks_h = pixelTrack::TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream - tracks_d.copyToHost(tracks_h.buffer(), ctx.stream()); // Copy data from Device to Host + cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 4f2272db13354..75f52305ab39b 100644 --- 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -9,7 +9,6 @@ #include #include -#include #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -196,7 +195,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return tracks_view[it].nLayers() < 4 ? + return tracks_view[it].nLayers() < 4 ? std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets tracks_view[it].chi2(it); //chi2 for quads }; diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index ef97c9a2b6ea7..f373c95e02760 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -53,9 +53,10 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& zvertex_d = ctx.get(inputDataWrapped); - zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); - zvertex_d.copyToHost(zvertex_h.buffer(), ctx.stream()); + auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device + zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), zvertex_d.const_buffer().get(), zvertex_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { diff --git 
a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index f8755996c3980..0e6327c6ed05b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,6 +1,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include //TODO: understand why this is needed +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" @@ -9,9 +10,6 @@ #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" -#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" -#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" - #undef PIXVERTEX_DEBUG_PRODUCE namespace gpuVertexFinder { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index b8a81ea04a03d..d56d68470acd8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,11 +4,10 @@ #include #include -//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" -#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" namespace gpuVertexFinder { From 90b8e3af01e5aea45d64241afe6f35adcc5a9642 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 4 Nov 2022 12:14:41 +0100 Subject: [PATCH 099/110] Removing ZVertexHeterogeneous (not needed anymore) --- .../Vertex/interface/ZVertexHeterogeneous.h | 13 ---------- 
CUDADataFormats/Vertex/interface/ZVertexSoA.h | 26 ------------------- 2 files changed, 39 deletions(-) delete mode 100644 CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h delete mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoA.h diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h deleted file mode 100644 index 417a960951fb1..0000000000000 --- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H -#define CUDADataFormatsVertexZVertexHeterogeneous_H - -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -using ZVertexHeterogeneous = HeterogeneousSoA; -#ifndef __CUDACC__ -#include "CUDADataFormats/Common/interface/Product.h" -using ZVertexCUDAProduct = cms::cuda::Product; -#endif - -#endif diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h deleted file mode 100644 index e31b87f30fa11..0000000000000 --- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef CUDADataFormats_Vertex_ZVertexSoA_h -#define CUDADataFormats_Vertex_ZVertexSoA_h - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -// SOA for vertices -// These vertices are clusterized and fitted only along the beam line (z) -// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) -struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 32 * 1024; - static constexpr uint32_t MAXVTX = 1024; - - int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) - float zv[MAXVTX]; // output z-posistion of found vertices - float wv[MAXVTX]; // output weight (1/error^2) on the above - float chi2[MAXVTX]; // vertices chi2 - float ptv2[MAXVTX]; // 
vertices pt^2 - int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) - uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending - uint32_t nvFinal; // the number of vertices - - __host__ __device__ void init() { nvFinal = 0; } -}; - -#endif // CUDADataFormats_Vertex_ZVertexSoA_h From 3071132831016924fb83632827740df0b907057e Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 4 Nov 2022 12:32:41 +0100 Subject: [PATCH 100/110] Cleanup and updating dataformat in L2TauTagNNProducer.cc --- .../interface/ZVertexSoAHeterogeneousDevice.h | 2 +- .../interface/ZVertexSoAHeterogeneousHost.h | 2 +- CUDADataFormats/Vertex/src/classes.h | 1 - .../plugins/PixelTrackDumpCUDA.cc | 1 - .../HLTProducers/src/L2TauTagNNProducer.cc | 27 ++++++++++--------- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h index d1ff67b042701..b1b9779ddf400 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -25,4 +25,4 @@ namespace ZVertex { } // namespace pixelTrack -#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousT_H +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h index 4867c49d15bab..0c02356192c4e 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -23,4 +23,4 @@ namespace ZVertex { } // namespace ZVertex -#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousT_H +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 
6f087ecb2cf46..0340affffa06c 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,6 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index a1acc6376e111..6bf47b7302da1 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -4,7 +4,6 @@ #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -//#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index db650684e7578..aa8565e9aed1f 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -50,8 +50,9 @@ #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include 
"CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" namespace L2TauTagNNv1 { constexpr int nCellEta = 5; @@ -181,10 +182,10 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, const pixelTrack::TrackSoAHost& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const ZVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); - void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, + void selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa, const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); @@ -208,7 +209,7 @@ class L2TauNNProducer : public edm::stream::EDProducer eeToken_; const edm::ESGetToken geometryToken_; const edm::ESGetToken bFieldToken_; - const edm::EDGetTokenT pataVerticesToken_; + const edm::EDGetTokenT pataVerticesToken_; const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; @@ -293,7 +294,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd eeToken_(consumes(cfg.getParameter("eeInput"))), geometryToken_(esConsumes()), bFieldToken_(esConsumes()), - pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), + pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), @@ -570,12 +571,12 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } } -void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, +void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa, const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { 
const auto maxTracks = patatracks_tsoa.view().metadata().size(); - const int nv = patavtx_soa.nvFinal; + const int nv = patavtx_soa.view().nvFinal(); trkGood.clear(); trkGood.reserve(maxTracks); vtxGood.clear(); @@ -591,7 +592,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, if (nHits == 0) { break; } - int vtx_ass_to_track = patavtx_soa.idv[trk_idx]; + int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; @@ -607,7 +608,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, if (nv > 0) { const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_; for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) { - auto vtx_idx = patavtx_soa.sortInd[j]; + auto vtx_idx = patavtx_soa.view()[j].sortInd(); assert(vtx_idx < nv); if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac && pTSquaredSum[vtx_idx] > minSumPt2_) { @@ -652,7 +653,7 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, const pixelTrack::TrackSoAHost& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const ZVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { using NNInputs = L2TauTagNNv1::NNInputs; @@ -688,7 +689,7 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; - const int vtx_idx_assTrk = patavtx_soa.idv[it]; + const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) { std::tie(deta, dphi, eta_idx, phi_idx) = getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4()); @@ -765,7 +766,7 @@ 
void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset const auto hbhe = event.getHandle(hbheToken_); const auto ho = event.getHandle(hoToken_); auto& patatracks_SoA = event.get(pataTracksToken_); - const auto& vertices_SoA = *event.get(pataVerticesToken_); + auto& vertices_SoA = event.get(pataVerticesToken_); const auto bsHandle = event.getHandle(beamSpotToken_); auto const fieldESH = eventsetup.getHandle(bFieldToken_); From 551398bbfbcbf9f294aaa46b8a6aa12a7c2ff455 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 4 Nov 2022 12:45:55 +0100 Subject: [PATCH 101/110] Adapted test --- .../PixelVertexFinding/test/VertexFinder_t.h | 111 ++++++++++-------- 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..cf6fccf04ffc0 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,13 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" // TODO: included in order to compile Eigen columns first!!! 
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -23,7 +30,7 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ -__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, +__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster @@ -102,23 +109,26 @@ struct ClusterGenerator { }; // a macro SORRY -#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) #define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) -__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { - auto const& __restrict__ data = *pdata; +__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace const* pws) { auto const& __restrict__ ws = *pws; - printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); + printf("nt,nv %d %d,%d\n", ws.ntrks, pdata.nvFinal(), ws.nvIntermediate); } int main() { + cudaStream_t stream; #ifdef __CUDACC__ cms::cudatest::requireDevices(); + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); + // auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); + ZVertex::ZVertexSoADevice onGPU_d(stream); auto ws_d = cms::cuda::make_device_unique(1, nullptr); #else - auto onGPU_d = std::make_unique(); + stream = nullptr; + // auto onGPU_d = std::make_unique(); + ZVertex::ZVertexSoAHost onGPU_d(stream); auto ws_d = std::make_unique(); #endif @@ -135,10 +145,9 @@ int main() { gen(ev); #ifdef 
__CUDACC__ - init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get()); #else - onGPU_d->init(); - ws_d->init(); + gpuVertexFinder::init(onGPU_d.view(), ws_d.get()); #endif std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; @@ -168,30 +177,30 @@ int main() { uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.get(), ws_d.get()); - CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.get(), ws_d.get()); - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; + print(onGPU_d.view(), ws_d.get()); + CLUSTERIZE(onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]); + print(onGPU_d.view(), ws_d.get()); + 
fitVertices(onGPU_d.view(), ws_d.get(), 50.f); + nv = onGPU_d.view().nvFinal(); #endif if (nv == 0) { @@ -221,18 +230,18 @@ int main() { nn = hnn; ind = hind; #else - zv = onGPU_d->zv; - wv = onGPU_d->wv; - ptv2 = onGPU_d->ptv2; - nn = onGPU_d->ndof; - ind = onGPU_d->sortInd; + zv = onGPU_d.view().zv(); + wv = onGPU_d.view().wv(); + ptv2 = onGPU_d.view().ptv2(); + nn = onGPU_d.view().ndof(); + ind = onGPU_d.view().sortInd(); #endif #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -244,14 +253,14 @@ int main() { } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + fitVertices(onGPU_d.view(), 
ws_d.get(), 50.f); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -264,26 +273,26 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! - cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); + cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - splitVertices(onGPU_d.get(), ws_d.get(), 9.f); + splitVertices(onGPU_d.view(), ws_d.get(), 9.f); nv = ws_d->nvIntermediate; #endif std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 5000.f); cudaCheck(cudaGetLastError()); - cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.get()); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); - sortByPt2(onGPU_d.get(), ws_d.get()); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + fitVertices(onGPU_d.view(), ws_d.get(), 5000.f); + sortByPt2(onGPU_d.view(), ws_d.get()); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif if (nv == 0) { @@ -292,12 +301,12 @@ int main() { } #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), 
cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) From 71cef1bc5e4e9e3a5e8290fbb1619956e8d2b8d1 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 4 Nov 2022 14:45:59 +0100 Subject: [PATCH 102/110] Adding nullptr to Host collection --- .../PixelVertexFinding/plugins/gpuVertexFinder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 0e6327c6ed05b..b12926b95c707 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -111,7 +111,7 @@ namespace gpuVertexFinder { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertex::ZVertexSoAHost vertices; + ZVertex::ZVertexSoAHost vertices(nullptr); #endif auto soa = vertices.view(); //TODO: check if there is a way to assert this From 
ce16830e724b92375e972d34b1aa705208b45659 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 4 Nov 2022 15:12:24 +0100 Subject: [PATCH 103/110] Initial implementation for WorkSpace port --- .../interface/ZVertexSoAHeterogeneousHost.h | 3 +- .../Vertex/interface/ZVertexUtilities.h | 16 ++++----- .../plugins/WorkSpaceSoAHeterogeneousDevice.h | 24 +++++++++++++ .../plugins/WorkSpaceSoAHeterogeneousHost.h | 24 +++++++++++++ .../plugins/WorkSpaceUtilities.h | 36 +++++++++++++++++++ .../plugins/gpuVertexFinder.h | 32 ++++------------- 6 files changed, 97 insertions(+), 38 deletions(-) create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h index 0c02356192c4e..4c07bb3ffedb4 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -5,14 +5,13 @@ #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #include "CUDADataFormats/Common/interface/PortableHostCollection.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" template class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { public: ZVertexSoAHeterogeneousHost() = default; - // Constructor which specifies the SoA size + // Constructor which specifies the SoA size and CUDA stream explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) : PortableHostCollection>(S, stream) {} }; diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h index 05ed34e2e8d69..d0614abee91c9 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h +++ 
b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -6,7 +6,7 @@ GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, SOA_COLUMN(int16_t, idv), - SOA_COLUMN(float, zv), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(float, zv), SOA_COLUMN(float, wv), SOA_COLUMN(float, chi2), SOA_COLUMN(float, ptv2), @@ -17,8 +17,12 @@ GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, // Previous ZVertexSoA class methods. // They operate on View and ConstView of the ZVertexSoA. namespace ZVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + namespace utilities { - using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; static constexpr uint32_t MAXTRACKS = 32 * 1024; static constexpr uint32_t MAXVTX = 1024; @@ -28,12 +32,4 @@ namespace ZVertex { } // namespace utilities } // namespace ZVertex -namespace ZVertex { - // Common types for both Host and Device code - using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; - using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; - using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; - -} // namespace ZVertex - #endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..abe77cf84a777 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h @@ -0,0 +1,24 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h + +#include +#include "WorkSpaceUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include 
"CUDADataFormats/Vertex/interface/WorkSpaceUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +template +class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection { + WorkSpaceSoAHeterogeneousDevice() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace WorkSpace { + using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..5b893718a468d --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h @@ -0,0 +1,24 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h + +#include +#include "WorkSpaceUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/WorkSpaceUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection { + WorkSpaceSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace WorkSpace { + using WorkSpaceSoAHost = WorkSpaceSoAHeterogeneousHost; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h new file mode 100644 index 
0000000000000..a86ade097ec7c --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h @@ -0,0 +1,36 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpace_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpace_h + +#include +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +GENERATE_SOA_LAYOUT(WorkSpaceSoAHeterogeneousLayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + +// Methods that operate on View and ConstView of the WorkSpaceSoALayout. 
+namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoALayout = WorkSpaceSoAHeterogeneousLayout<>; + using WorkSpaceSoAView = WorkSpaceSoAHeterogeneousLayout<>::View; + using WorkSpaceSoAConstView = WorkSpaceSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + __host__ __device__ inline void init(WorkSpaceSoAView &workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + } // namespace utilities + } // namespace workSpace +} // namespace gpuVertexFinder + +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index d56d68470acd8..dfed3772dd2ec 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -8,43 +8,23 @@ #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "WorkSpaceUtilities.h" +#include "WorkSpaceSoAHeterogeneousHost.h" +#include "WorkSpaceSoAHeterogeneousDevice.h" namespace gpuVertexFinder { using VtxSoAView = ZVertex::ZVertexSoAView; using TkSoAConstView = pixelTrack::TrackSoAConstView; + using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView; - // workspace used in the vertex reco algos - struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertex::utilities::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertex::utilities::MAXVTX; - - uint32_t ntrks; // number of "selected tracks" - uint16_t itrk[MAXTRACKS]; // index of original track - float zt[MAXTRACKS]; // input track z at bs - float ezt2[MAXTRACKS]; // input error^2 on the above - float ptt2[MAXTRACKS]; // input pt^2 on the above - uint8_t izt[MAXTRACKS]; // interized z-position of input tracks - int32_t iv[MAXTRACKS]; // vertex index for each associated track - 
- uint32_t nvIntermediate; // the number of vertices after splitting pruning etc. - - __host__ __device__ void init() { - ntrks = 0; - nvIntermediate = 0; - } - }; - - __global__ void init(VtxSoAView pdata, WorkSpace* pws) { + __global__ void init(VtxSoAView pdata, WsSoAview pws) { ZVertex::utilities::init(pdata); - pws->init(); + gpuVertexFinder::workSpace::utilities::init(pws); } class Producer { public: - using VtxSoAView = ZVertex::ZVertexSoAView; - using WorkSpace = gpuVertexFinder::WorkSpace; - Producer(bool oneKernel, bool useDensity, bool useDBSCAN, From c1dfb684634190b48dd110728bfd3af1d3dd772e Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 4 Nov 2022 15:32:34 +0100 Subject: [PATCH 104/110] Adapt gpuVertexFinder.cc --- .../plugins/gpuVertexFinder.cc | 81 +++++++++---------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index b12926b95c707..baefe500d74d7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -20,10 +20,8 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - using TkSoAConstView = pixelTrack::TrackSoAConstView; - using VtxSoAView = ZVertex::ZVertexSoAView; - __global__ void loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WorkSpace* pws, float ptMin, float ptMax) { + __global__ void loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) { //TODO: check if there is a way to assert this //assert(soa); auto const* quality = pixelTrack::utilities::qualityData(tracks_view); @@ -49,19 +47,19 @@ namespace gpuVertexFinder { // clamp pt pt = std::min(pt, ptMax); - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = 
pixelTrack::utilities::zip(tracks_view, idx); - data.ezt2[it] = tracks_view[idx].covariance()(14); - data.ptt2[it] = pt * pt; + auto& data = pws; + auto it = atomicAdd(&data.ntrks(), 1); + data[it].itrk() = idx; + data[it].zt() = pixelTrack::utilities::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; } } // #define THREE_KERNELS #ifndef THREE_KERNELS __global__ void vertexFinderOneKernel(VtxSoAView pdata, - gpuVertexFinder::WorkSpace* pws, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -78,8 +76,8 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::VtxSoAView pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderKernel1(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -90,7 +88,7 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ void vertexFinderKernel2(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); @@ -99,9 +97,9 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ ZVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, - TkSoAConstView tracks_view, - float ptMin, - float ptMax) const { + TkSoAConstView tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE @@ -118,20 +116,20 @@ namespace gpuVertexFinder { //assert(soa); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = 
gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost(nullptr); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, stream>>>(soa, ws_d.view()); auto blockSize = 128; auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; - loadTracks<<>>(tracks_view, soa, ws_d.get(), ptMin, ptMax); + loadTracks<<>>(tracks_view, soa, ws_d.view(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else - init(soa, ws_d.get()); - loadTracks(tracks_view, soa, ws_d.get(), ptMin, ptMax); + init(soa, ws_d.view()); + loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -143,50 +141,51 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... 
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>( + soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... 
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max); } #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + fitVertices(soa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex! 
- splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); + splitVertices(soa, ws_d.view(), maxChi2ForSplit); + fitVertices(soa, ws_d.view(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.view()); #endif return vertices; From b4d1dbb80f8cc1291bfa3cdc2d0fdb64ae7599d3 Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 4 Nov 2022 15:36:18 +0100 Subject: [PATCH 105/110] Changing WorkSpace into WsSoAView --- .../plugins/gpuClusterTracksByDensity.h | 18 +++++++++--------- .../plugins/gpuClusterTracksDBSCAN.h | 16 ++++++++-------- .../plugins/gpuClusterTracksIterative.h | 16 ++++++++-------- .../plugins/gpuFitVertices.h | 16 ++++++++-------- .../PixelVertexFinding/plugins/gpuSortByPt2.h | 14 +++++++------- .../plugins/gpuSplitVertices.h | 16 ++++++++-------- 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f920586117078..4124f80e017eb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -18,7 +18,7 @@ namespace gpuVertexFinder { // based on Rodrighez&Laio algo // __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata, - gpuVertexFinder::WorkSpace* pws, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -33,17 +33,17 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; auto& __restrict__ data = pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); uint32_t& 
nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; + uint8_t* __restrict__ izt = ws.izt(); int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv; + int32_t* __restrict__ iv = ws.iv(); //TODO: check if there is a way to assert this //assert(pdata); @@ -221,7 +221,7 @@ namespace gpuVertexFinder { } __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, - gpuVertexFinder::WorkSpace* pws, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index 0476cfbae5fef..43e420a4c0cbc 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -15,7 +15,7 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have __global__ void clusterTracksDBSCAN(VtxSoAView pdata, - WorkSpace* pws, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -29,17 +29,17 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; auto& __restrict__ data = pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; + uint8_t* __restrict__ izt = ws.izt(); int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv; + int32_t* __restrict__ iv = ws.iv(); //TODO: check if there is a way to assert this //assert(pdata); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 230405c47366a..1b172cabf9318 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -15,7 +15,7 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have __global__ void clusterTracksIterative(VtxSoAView pdata, - WorkSpace* pws, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -29,17 +29,17 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; auto& __restrict__ data = pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; + uint8_t* __restrict__ izt = ws.izt(); int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv; + int32_t* __restrict__ iv = ws.iv(); //TODO: check if there is a way to assert this //assert(pdata); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 51364e78ee92e..7b926023b4e19 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -13,24 +13,24 @@ namespace gpuVertexFinder { __device__ __forceinline__ void fitVertices(VtxSoAView pdata, - WorkSpace* pws, + WsSoAView pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false auto& __restrict__ data = pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); float* __restrict__ zv = data.zv(); 
float* __restrict__ wv = data.wv(); float* __restrict__ chi2 = data.chi2(); uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvIntermediate = ws.nvIntermediate(); int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv; + int32_t* __restrict__ iv = ws.iv(); //TODO: check if there is a way to assert this //assert(pdata); @@ -103,7 +103,7 @@ namespace gpuVertexFinder { } __global__ void fitVerticesKernel(VtxSoAView pdata, - WorkSpace* pws, + WsSoAView pws, float chi2Max // for outlier rejection ) { fitVertices(pdata, pws, chi2Max); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index c705fc1f4065e..38eeac91c5161 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,14 +15,14 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WorkSpace* pws) { + __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WsSoAView pws) { auto& __restrict__ data = pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ ptt2 = ws.ptt2; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); uint32_t const& nvFinal = data.nvFinal(); - int32_t const* __restrict__ iv = ws.iv; + int32_t const* __restrict__ iv = ws.iv(); float* __restrict__ ptv2 = data.ptv2(); uint16_t* __restrict__ sortInd = data.sortInd(); @@ -34,7 +34,7 @@ namespace gpuVertexFinder { // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data[ws.itrk[i]].idv() = iv[i]; + data[ws[i].itrk(i)].idv() = iv[i]; } // can be done asynchronoisly at the end of previous event @@ -66,7 +66,7 @@ namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(VtxSoAView pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void 
sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index ad72c489ed67e..f90978811b839 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,21 +12,21 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WsSoAView pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false auto& __restrict__ data = pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); float* __restrict__ zv = data.zv(); float* __restrict__ wv = data.wv(); float const* __restrict__ chi2 = data.chi2(); uint32_t& nvFinal = data.nvFinal(); int32_t const* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv; + int32_t* __restrict__ iv = ws.iv(); //TODO: check if there is a way to assert this //assert(pdata); @@ -121,7 +121,7 @@ namespace gpuVertexFinder { // get a new global vertex __shared__ uint32_t igv; if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); + igv = atomicAdd(&ws.nvIntermediate(), 1); __syncthreads(); for (auto k = threadIdx.x; k < nq; k += blockDim.x) { if (1 == newV[k]) @@ -131,7 +131,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(VtxSoAView pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) { splitVertices(pdata, pws, 
maxChi2); } From 73d9392759d514c0b04550f6a35b479e6f5c3842 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 4 Nov 2022 15:45:17 +0100 Subject: [PATCH 106/110] Updated tests to use new WorkSpace definition --- .../PixelVertexFinding/test/VertexFinder_t.h | 82 ++++++++++--------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index cf6fccf04ffc0..ec392a1f4a8d8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -9,6 +9,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" // TODO: included in order to compile Eigen columns first!!! 
#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" @@ -31,7 +34,7 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ __global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, - gpuVertexFinder::WorkSpace* pws, + gpuVertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -108,12 +111,9 @@ struct ClusterGenerator { std::exponential_distribution ptGen; }; -// a macro SORRY -#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) - -__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WorkSpace const* pws) { - auto const& __restrict__ ws = *pws; - printf("nt,nv %d %d,%d\n", ws.ntrks, pdata.nvFinal(), ws.nvIntermediate); +__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) { + auto const& __restrict__ ws = pws; + printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate()); } int main() { @@ -122,14 +122,13 @@ int main() { cms::cudatest::requireDevices(); cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - // auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); ZVertex::ZVertexSoADevice onGPU_d(stream); - auto ws_d = cms::cuda::make_device_unique(1, nullptr); + gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice ws_d(stream); #else stream = nullptr; - // auto onGPU_d = std::make_unique(); + ZVertex::ZVertexSoAHost onGPU_d(stream); - auto ws_d = std::make_unique(); + gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost ws_d(stream); #endif Event ev; @@ -145,23 +144,26 @@ int main() { gen(ev); #ifdef __CUDACC__ - gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get()); + gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); #else - gpuVertexFinder::init(onGPU_d.view(), ws_d.get()); + gpuVertexFinder::init(onGPU_d.view(), ws_d.view()); #endif std::cout << "v,t size " << 
ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else - ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t)); + ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; @@ -177,29 +179,29 @@ int main() { uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, 
{1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.view(), ws_d.get()); - CLUSTERIZE(onGPU_d.view(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.view(), ws_d.get()); - fitVertices(onGPU_d.view(), ws_d.get(), 50.f); + print(onGPU_d.view(), ws_d.view()); + CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); + print(onGPU_d.view(), ws_d.view()); + fitVertices(onGPU_d.view(), ws_d.view(), 50.f); nv = onGPU_d.view().nvFinal(); #endif @@ -253,12 +255,12 @@ int main() { } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 50.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.view(), ws_d.get(), 50.f); + 
fitVertices(onGPU_d.view(), ws_d.view(), 50.f); nv = onGPU_d.view().nvFinal(); memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif @@ -273,24 +275,24 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! - cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.get(), 9.f); - cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f); + cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - splitVertices(onGPU_d.view(), ws_d.get(), 9.f); - nv = ws_d->nvIntermediate; + splitVertices(onGPU_d.view(), ws_d.view(), 9.f); + nv = ws_d.view().nvIntermediate(); #endif std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.get(), 5000.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f); cudaCheck(cudaGetLastError()); - cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.get()); + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.view(), ws_d.get(), 5000.f); - sortByPt2(onGPU_d.view(), ws_d.get()); + fitVertices(onGPU_d.view(), ws_d.view(), 5000.f); + sortByPt2(onGPU_d.view(), ws_d.view()); nv = onGPU_d.view().nvFinal(); memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif From 3085f4944650e92019a05d732dbc1775e1d79d6c Mon Sep 17 00:00:00 2001 From: Breno Orzari Date: Fri, 4 Nov 2022 16:14:38 +0100 Subject: [PATCH 107/110] Fixing namespaces and some dataformats usage --- .../plugins/WorkSpaceSoAHeterogeneousDevice.h | 7 ++-- 
.../plugins/WorkSpaceSoAHeterogeneousHost.h | 7 ++-- .../PixelVertexFinding/plugins/gpuSortByPt2.h | 2 +- .../plugins/gpuVertexFinder.cc | 4 +-- .../plugins/gpuVertexFinder.h | 2 +- .../PixelVertexFinding/test/VertexFinder_t.h | 33 ++++++++++--------- 6 files changed, 29 insertions(+), 26 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h index abe77cf84a777..1c704d1374ca7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h @@ -4,11 +4,12 @@ #include #include "WorkSpaceUtilities.h" #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" -#include "CUDADataFormats/Vertex/interface/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" #include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" template -class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection { +class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: WorkSpaceSoAHeterogeneousDevice() = default; // Constructor which specifies the SoA size and CUDA stream @@ -17,7 +18,7 @@ class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollecti }; namespace gpuVertexFinder { - namespace WorkSpace { + namespace workSpace { using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice; } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h index 5b893718a468d..1051da0bbcee8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h @@ -4,11 +4,12 @@ 
#include #include "WorkSpaceUtilities.h" #include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" -#include "CUDADataFormats/Vertex/interface/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" #include "CUDADataFormats/Common/interface/PortableHostCollection.h" template -class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection { +class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: WorkSpaceSoAHeterogeneousHost() = default; // Constructor which specifies the SoA size and CUDA stream @@ -17,7 +18,7 @@ class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection; } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 38eeac91c5161..ff8cea612de47 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -34,7 +34,7 @@ namespace gpuVertexFinder { // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data[ws[i].itrk(i)].idv() = iv[i]; + data[ws[i].itrk()].idv() = iv[i]; } // can be done asynchronoisly at the end of previous event diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index baefe500d74d7..686f9899d8439 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -116,9 +116,9 @@ namespace gpuVertexFinder { //assert(soa); #ifdef __CUDACC__ - auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice(stream); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoADevice(stream); #else - auto ws_d = gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost(nullptr); + auto ws_d = 
gpuVertexFinder::workSpace::WorkSpaceSoAHost(nullptr); #endif #ifdef __CUDACC__ diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index dfed3772dd2ec..cc8224521680c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -18,7 +18,7 @@ namespace gpuVertexFinder { using TkSoAConstView = pixelTrack::TrackSoAConstView; using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView; - __global__ void init(VtxSoAView pdata, WsSoAview pws) { + __global__ void init(VtxSoAView pdata, WsSoAView pws) { ZVertex::utilities::init(pdata); gpuVertexFinder::workSpace::utilities::init(pws); } diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index ec392a1f4a8d8..211b8c1b4d4c6 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -9,14 +9,15 @@ #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" -#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" -#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h" -#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h" #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" // TODO: included in order to compile Eigen columns first!!! 
#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -40,15 +41,15 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata, pws, 50.); + gpuVertexFinder::fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata, pws, 9.f); + gpuVertexFinder::splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata, pws, 5000.); + gpuVertexFinder::fitVertices(pdata, pws, 5000.); __syncthreads(); - sortByPt2(pdata, pws); + gpuVertexFinder::sortByPt2(pdata, pws); } #endif #endif @@ -112,7 +113,7 @@ struct ClusterGenerator { }; __global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) { - auto const& __restrict__ ws = pws; + auto & __restrict__ ws = pws; printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate()); } @@ -123,12 +124,12 @@ int main() { cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); ZVertex::ZVertexSoADevice onGPU_d(stream); - gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousDevice ws_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoADevice ws_d(stream); #else stream = nullptr; ZVertex::ZVertexSoAHost 
onGPU_d(stream); - gpuVertexing::workSpace::WorkSpaceSoAHeterogeneousHost ws_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoAHost ws_d(stream); #endif Event ev; @@ -201,7 +202,7 @@ int main() { print(onGPU_d.view(), ws_d.view()); CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); print(onGPU_d.view(), ws_d.view()); - fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); nv = onGPU_d.view().nvFinal(); #endif @@ -260,7 +261,7 @@ int main() { cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); nv = onGPU_d.view().nvFinal(); memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif @@ -278,7 +279,7 @@ int main() { cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f); cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - splitVertices(onGPU_d.view(), ws_d.view(), 9.f); + gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f); nv = ws_d.view().nvIntermediate(); #endif std::cout << "after split " << nv << std::endl; @@ -291,8 +292,8 @@ int main() { cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.view(), ws_d.view(), 5000.f); - sortByPt2(onGPU_d.view(), ws_d.view()); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f); + gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view()); nv = onGPU_d.view().nvFinal(); memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif From 80b9c4d06244bd972a05db4cf4ea42a8ae66f0ed Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Fri, 4 Nov 2022 
17:09:47 +0100 Subject: [PATCH 108/110] code-format --- CUDADataFormats/Track/interface/PixelTrackUtilities.h | 4 +++- .../Track/interface/TrackSoAHeterogeneousDevice.h | 1 - .../Track/test/TrackSoAHeterogeneous_test.cpp | 3 ++- .../Vertex/interface/ZVertexSoAHeterogeneousDevice.h | 3 +-- .../PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc | 10 +++++++--- .../plugins/PixelVertexProducerCUDA.cc | 2 +- .../plugins/PixelVertexSoAFromCUDA.cc | 10 +++++++--- .../PixelVertexFinding/test/VertexFinder_t.h | 2 +- .../TkSeedGenerator/plugins/SeedProducerFromSoA.cc | 8 ++++---- 9 files changed, 26 insertions(+), 17 deletions(-) diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h index 4208dfe93f69c..ce3658f126a83 100644 --- a/CUDADataFormats/Track/interface/PixelTrackUtilities.h +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -65,7 +65,9 @@ namespace pixelTrack { __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); } - __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { return tracks[i].nLayers() == 3; } + __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { + return tracks[i].nLayers() == 3; + } template __host__ __device__ inline void copyFromCircle( diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h index fb1c45f331d19..71b8dc48b8b35 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -16,7 +16,6 @@ class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection>(S, stream) {} - }; namespace pixelTrack { diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp index 0647296b9ef40..572a84cdd2d73 
100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -43,7 +43,8 @@ int main() { // copied to from device. pixelTrack::TrackSoAHost tracks_h(stream); //tracks_d.copyToHost(tracks_h.buffer(), stream); - cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaMemcpyAsync( + tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaGetLastError()); // Print results diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h index b1b9779ddf400..ca97e2533b8d1 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -16,13 +16,12 @@ class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection // Constructor which specifies the SoA size explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) : PortableDeviceCollection>(S, stream) {} - }; namespace ZVertex { using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; -} // namespace pixelTrack +} // namespace ZVertex #endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 1dadeb9d0dcc1..191e9009f6d6e 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -57,9 +57,13 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, 
std::move(waitingTaskHolder)}; - auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device - tracks_h = pixelTrack::TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream - cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h = pixelTrack::TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), + tracks_d.const_buffer().get(), + tracks_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 45d1a9d52d99e..db60fb7ebf4bb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -105,7 +105,7 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto &tracks = ctx.get(*hTracks); + auto& tracks = ctx.get(*hTracks); ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index f373c95e02760..7dd714f22de6f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -53,9 +53,13 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::WaitingTaskWithArenaHolder 
waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device - zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream - cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), zvertex_d.const_buffer().get(), zvertex_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); // Copy data from Device to Host + auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device + zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), + zvertex_d.const_buffer().get(), + zvertex_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 211b8c1b4d4c6..c2cea5a9a1f13 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -113,7 +113,7 @@ struct ClusterGenerator { }; __global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) { - auto & __restrict__ ws = pws; + auto& __restrict__ ws = pws; printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate()); } diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index a5cc27c338ebe..4301749e441fe 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -91,7 +91,7 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // std::cout << "beamspot " << bsh.x0() << ' ' << 
bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - auto & tsoa = iEvent.get(tokenTrack_); + auto& tsoa = iEvent.get(tokenTrack_); auto const* quality = pixelTrack::utilities::qualityData(tsoa.view()); //auto const& fit = tsoa.stateAtBS; @@ -100,7 +100,7 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = pixelTrack::utilities::nHits(tsoa.view(),it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... @@ -122,11 +122,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // mind: this values are respect the beamspot! - float phi = pixelTrack::utilities::phi(tsoa.view(),it); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - pixelTrack::utilities::copyToDense(tsoa.view(),ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); From 9b5af467d02c0695e9ee91bb4a83dad6c34f3c8e Mon Sep 17 00:00:00 2001 From: AdrianoDee Date: Wed, 16 Nov 2022 12:01:16 +0100 Subject: [PATCH 109/110] Pixel hits portable (WIP) --- .../interface/TrackingRecHit2DHeterogeneous.h | 1 + .../interface/TrackingRecHitSoADevice.h | 60 ++++++++++++++++ .../interface/TrackingRecHitSoAHost.h | 45 ++++++++++++ .../interface/TrackingRecHitsUtilities.h | 68 +++++++++++++++++++ CUDADataFormats/TrackingRecHit/src/classes.h | 2 + .../TrackingRecHit/src/classes_def.xml | 4 ++ .../TrackingRecHit/test/BuildFile.xml | 9 ++- .../test/TrackingRecHitSoA_test.cpp | 45 ++++++++++++ .../test/TrackingRecHitSoA_test.cu | 66 ++++++++++++++++++ .../plugins/PixelRecHitGPUKernel.cu | 19 +++--- 
.../plugins/PixelRecHitGPUKernel.h | 4 +- .../plugins/SiPixelRecHitCUDA.cc | 6 +- .../plugins/SiPixelRecHitFromCUDA.cc | 10 +-- .../plugins/SiPixelRecHitSoAFromCUDA.cc | 50 +++++++++----- .../plugins/SiPixelRecHitSoAFromLegacy.cc | 33 +++++---- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 36 +++++----- .../python/SiPixelRecHits_cfi.py | 2 +- 17 files changed, 392 insertions(+), 68 deletions(-) create mode 100644 CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h create mode 100644 CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h create mode 100644 CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h create mode 100644 CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp create mode 100644 CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 8ce37f280ac6c..98112285fce13 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -4,6 +4,7 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" template class TrackingRecHit2DHeterogeneous { diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h new file mode 100644 index 0000000000000..fad70322a7c35 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -0,0 +1,60 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection<TrackingRecHitSoALayout<>> { +public: + TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) + : PortableDeviceCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) + { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + phiBinner_ = &(view().phiBinner()); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().hitsModuleStart()), hitsModuleStart, sizeof(uint32_t) * int(nModules_),cudaMemcpyDefault,stream)); // hitsModuleStart may be a host or a device pointer + // cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t),cudaMemcpyHostToDevice,stream)); + + } + + uint32_t nHits() const { return nHits_; } + uint32_t nModules() const { return nModules_; } + + cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream); + size_t rowSize = sizeof(float) * nHits(); // bytes per column; ret is a float*, so destination offsets below are in elements + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize, cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaMemcpyAsync(ret.get() + nHits(), view().yLocal(), rowSize, 
cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaMemcpyAsync(ret.get() + 2 * nHits(), view().xerrLocal(), rowSize, cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaMemcpyAsync(ret.get() + 3 * nHits(), view().yerrLocal(), rowSize, cudaMemcpyDeviceToHost, stream)); + return ret; + } + + cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique<uint32_t[]>(nModules() + 1, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), view().hitsModuleStart().begin(), sizeof(uint32_t) * (nModules() + 1), cudaMemcpyDeviceToHost, stream)); + return ret; + } + + auto phiBinnerStorage() { return phiBinnerStorage_; } + auto phiBinner() { return phiBinner_; } + + private: + uint32_t nHits_{0}; //Needed for the host SoA size + pixelCPEforGPU::ParamsOnGPU const* cpeParams_{nullptr}; //TODO: this is used not that much (only once in BrokenLineFit), would make sens to remove it from this class. + uint32_t nModules_{0}; + trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_{nullptr}; // NOTE: nothing in this class allocates it yet + trackingRecHitSoA::PhiBinner* phiBinner_{nullptr}; +}; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousDevice_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h new file mode 100644 index 0000000000000..cb76d538474da --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -0,0 +1,45 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include <cstdint> + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection<TrackingRecHitSoALayout<>> { +public: + TrackingRecHitSoAHost() = default; + + // This SoA Host is used basically only 
for DQM + // so we just need a slim constructor + explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) + : PortableHostCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) + : PortableHostCollection<TrackingRecHitSoALayout<>>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) + { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + + view().nHits() = nHits; + view().nMaxModules() = nModules_; + std::copy(hitsModuleStart,hitsModuleStart+nModules_,view().hitsModuleStart().begin()); + + view().offsetBPIX2() = offsetBPIX2; + + } + + uint32_t nHits() const { return nHits_; } + uint32_t nModules() const { return nModules_; } + auto phiBinnerStorage() { return phiBinnerStorage_; } + + private: + uint32_t nHits_{0}; //Needed for the host SoA size + pixelCPEforGPU::ParamsOnGPU const* cpeParams_{nullptr}; + uint32_t nModules_{0}; + trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_{nullptr}; // NOTE: nothing in this class allocates it yet +}; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousHost_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h new file mode 100644 index 0000000000000..f9cc022e571e3 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -0,0 +1,68 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h +#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "SiPixelHitStatus.h" + +namespace trackingRecHitSoA{ + + // more information on bit fields : 
https://en.cppreference.com/w/cpp/language/bit_field + struct SiPixelHitStatusAndCharge { + SiPixelHitStatus status; + uint32_t charge : 24; + }; + + struct Test { + int a; + }; + + using hindex_type = uint32_t; // if above is <=2^32 + using PhiBinner = cms::cuda:: + HistoContainer; //28 for phase2 geometry + using PhiBinnerStorageType = PhiBinner::index_type; + + using AverageGeometry = pixelTopology::AverageGeometry; + + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPU; + + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; + +} + + +GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), // local y of the hit (gpuPixelRecHits::getHits fills it from clusParams.ypos) + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + SOA_COLUMN(int16_t, iphi), + SOA_COLUMN(trackingRecHitSoA::SiPixelHitStatusAndCharge, chargeAndStatus), + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY), + SOA_COLUMN(int16_t, detectorIndex), + + SOA_SCALAR(trackingRecHitSoA::ParamsOnGPU, cpeParams), + SOA_SCALAR(trackingRecHitSoA::AverageGeometry, averageGeometry), + SOA_SCALAR(trackingRecHitSoA::PhiBinner, phiBinner), + SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart), + SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart), + SOA_SCALAR(uint32_t, nHits), + SOA_SCALAR(int32_t, offsetBPIX2), + SOA_SCALAR(uint32_t, nMaxModules)) + +namespace trackingRecHitSoA +{ + using HitSoAView = TrackingRecHitSoALayout<>::View; + using HitSoAConstView = TrackingRecHitSoALayout<>::ConstView; + +} +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index abecfb38797de..b43537c915e3d 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -4,6 +4,8 @@ 
#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_SiPixelCluster_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index f633d77c48ef7..fe92a1b6ea31e 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -7,4 +7,8 @@ + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index ce49c46fffba0..77626dbf724ff 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,5 +1,12 @@ + - + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp new file mode 100644 index 0000000000000..7f4308ebf1492 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -0,0 +1,45 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackingRecHit2DNew { + + void run(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} + +int 
main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + + // inner scope to deallocate memory before destroying the stream + { + uint32_t nHits = 2000; + int32_t offset = 100; + uint32_t moduleStart[1856]; + + for (size_t i = 0; i < 1856; i++) { + moduleStart[i] = i*2; + } + + TrackingRecHitSoADevice tkhit(nHits,false,offset,nullptr,&moduleStart[0],stream); + + testTrackingRecHit2DNew::run(tkhit,stream); + + auto test = tkhit.localCoordToHostAsync(stream); + printf("tkhit hits %d \n",tkhit.nHits()); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu new file mode 100644 index 0000000000000..93f4dde061786 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -0,0 +1,66 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +namespace testTrackingRecHit2DNew { + + __global__ void fill(trackingRecHitSoA::HitSoAView soa) { + // assert(soa); + + int i = threadIdx.x; + int j = blockIdx.x; + if(i==0 and j==0) + { + + soa.offsetBPIX2() = 22; + soa[10].xLocal() =1.11; + } + + soa[i].iphi() = i%10; + soa.hitsLayerStart()[j] = j; + //k = soa.test().a; + + } + + __global__ void show(trackingRecHitSoA::HitSoAView soa) { + // assert(soa); + + int i = threadIdx.x; + int j = blockIdx.x; + if(i==0 and j==0) + { + printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("offsetBPIX %d ->%d \n",i,soa.offsetBPIX2()); + printf("nHits %d ->%d \n",i,soa.nHits()); + printf("hitsModuleStart %d ->%d \n",i,soa.hitsModuleStart().at(28)); + } + + if(i%d \n",i,soa[i].iphi()); + + if(j*blockDim.x+i < soa.phiBinner().nbins()) + printf(">bin size %d ->%d \n",j*blockDim.x+i,soa.phiBinner().size(j*blockDim.x+i)); + + } + + + 
+ void run(TrackingRecHitSoADevice& hits, cudaStream_t stream) { + // assert(soa); + printf("RUN!\n"); + int k = 0; + show<<<10, 100, 0, stream>>>(hits.view()); + printf("k = %d\n",k); + + cms::cuda::fillManyFromVector(hits.phiBinner(), + 10, + hits.view().iphi(), + hits.view().hitsLayerStart().data(), + 2000, + 256, + hits.phiBinnerStorage(), + stream); + + show<<<10, 1000, 0, stream>>>(hits.view()); + } + +} // namespace testTrackingRecHit2D diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..6fd9a57a6cc72 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -34,7 +34,7 @@ namespace { namespace pixelgpudetails { - TrackingRecHit2DGPU PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -42,10 +42,11 @@ namespace pixelgpudetails { cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); - assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules - : phase1PixelTopology::numberOfModules); + TrackingRecHitSoADevice hits_d(nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // TrackingRecHit2DGPU hits_d( + // nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // assert(hits_d.nMaxModules() == isPhase2 ? 
phase2PixelTopology::numberOfModules + // : phase1PixelTopology::numberOfModules); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -65,13 +66,13 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; - cms::cuda::fillManyFromVector(hits_d.phiBinner(), + cms::cuda::fillManyFromVector(&(hits_d.view().phiBinner()), nLayers, - hits_d.iphi(), - hits_d.hitsLayerStart(), + hits_d.view().iphi(), + hits_d.view().hitsLayerStart().data(), nHits, 256, hits_d.phiBinnerStorage(), diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 8289c8db7f2f4..5d55e713391e1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -8,7 +8,7 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" namespace pixelgpudetails { @@ -22,7 +22,7 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DGPU makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d, 
SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 8112e9ebd19c8..1c050e037144e 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -37,7 +37,7 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT> tokenHit_; const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; @@ -47,7 +47,7 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig) tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>()) {} + tokenHit_(produces>()) {} void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 7ff2da5552e6d..bc6c3fa370372 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ 
b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -24,6 +24,8 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + class SiPixelRecHitFromCUDA : public edm::stream::EDProducer { public: explicit SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig); @@ -40,7 +42,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; const edm::ESGetToken geomToken_; - const edm::EDGetTokenT> hitsToken_; // CUDA hits + const edm::EDGetTokenT> hitsToken_; // CUDA hits const edm::EDGetTokenT clusterToken_; // legacy clusters const edm::EDPutTokenT rechitsPutToken_; // legacy rechits const edm::EDPutTokenT hostPutToken_; @@ -54,7 +56,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), hitsToken_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), + consumes>(iConfig.getParameter("pixelRecHitSrc"))), clusterToken_(consumes(iConfig.getParameter("src"))), rechitsPutToken_(produces()), hostPutToken_(produces()) {} @@ -69,12 +71,12 @@ void SiPixelRecHitFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); nHits_ = inputData.nHits(); - nMaxModules_ = inputData.nMaxModules(); + nMaxModules_ = inputData.nModules(); LogDebug("SiPixelRecHitFromCUDA") << "converting " << nHits_ << " 
Hits"; if (0 == nHits_) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 7532470ebd3d4..aedaf6955c747 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -24,6 +24,9 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer { public: explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig); @@ -38,22 +41,24 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT hitsPutTokenCPU_; + const edm::EDGetTokenT> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; + TrackingRecHitSoAHost hits_h_; + uint32_t nMaxModules_; - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + // cms::cuda::host::unique_ptr store32_; + // cms::cuda::host::unique_ptr store16_; + // cms::cuda::host::unique_ptr hitsModuleStart_; }; SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) : hitsTokenGPU_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces()), + consumes>(iConfig.getParameter("pixelRecHitSrc"))), + hitsPutTokenCPU_(produces()), hostPutToken_(produces()) {} void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -65,29 +70,42 @@ void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& void 
SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); + cms::cuda::Product<TrackingRecHitSoADevice> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - nHits_ = inputData.nHits(); - LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; + nHits_ = inputData.nHits(); // host-side copy: view() scalars of the device collection live in device memory if (0 == nHits_) return; - nMaxModules_ = inputData.nMaxModules(); - store32_ = inputData.store32ToHostAsync(ctx.stream()); - store16_ = inputData.store16ToHostAsync(ctx.stream()); - hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); + + nMaxModules_ = inputData.nModules(); + + hits_h_ = TrackingRecHitSoAHost(nHits_,ctx.stream()); + cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(), + inputData.const_buffer().get(), + inputData.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); + + + LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; + + // store32_ = inputData.store32ToHostAsync(ctx.stream()); + // store16_ = inputData.store16ToHostAsync(ctx.stream()); + // hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { auto hmsp = std::make_unique<uint32_t[]>(nMaxModules_ + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hits_h_.view().hitsModuleStart().begin(),hits_h_.view().hitsModuleStart().end(),hmsp.get()); + // std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); -
iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_));//store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_); } DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..663674b2a4145 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -3,7 +3,7 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -25,6 +25,8 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" + #include "gpuPixelRecHits.h" class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { @@ -44,7 +46,7 @@ class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT tokenHit_; + const edm::EDPutTokenT tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; const bool isPhase2_; @@ -55,7 +57,7 @@ SiPixelRecHitSoAFromLegacy::SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& 
cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces()}, + tokenHit_{produces()}, tokenModuleStart_{produces()}, convert2Legacy_(iConfig.getParameter("convertToLegacy")), isPhase2_(iConfig.getParameter("isPhase2")) { @@ -156,9 +158,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // output SoA // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1) - auto output = std::make_unique( + auto output = std::make_unique( numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nMaxModules() == uint32_t(nMaxModules)); + assert(output->nModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { iEvent.put(std::move(output)); @@ -239,9 +241,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()->detectorIndex(h)); + assert(gind == output->view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); + assert(gpuClustering::invalidModuleId == output->view()[h].detectorIndex()); if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -250,8 +252,8 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); - LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + LocalPoint lp(output->view()[h].xLocal(), output->view()[h].yLocal()); + LocalError le(output->view()[h].xerrLocal(), 0, 
output->view()[h].yerrLocal()); SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); @@ -264,20 +266,21 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // fill data structure to support CA const auto nLayers = isPhase2_ ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + output->view().hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." << i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] - << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n"; + << " - starts ad cluster: " << output->view()[i].hitsLayerStart() << "\n"; } - cms::cuda::fillManyFromVector(output->phiBinner(), + cms::cuda::fillManyFromVector(&(output->view().phiBinner()), nLayers, - output->iphi(), - output->hitsLayerStart(), - numberOfHits, + output->view().iphi(), + output->view().hitsLayerStart().data(), + output->nHits(), 256, output->phiBinnerStorage()); + LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; iEvent.put(std::move(output)); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 5b862b2cf63b9..db0940f0f50f7 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,11 +7,12 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" namespace gpuPixelRecHits { @@ -20,14 +21,14 @@ namespace gpuPixelRecHits { SiPixelDigisCUDASOAView const digis, int numElements, SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - TrackingRecHit2DSOAView* phits) { + trackingRecHitSoA::HitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - assert(phits); + // assert(phits); assert(cpeParams); - auto& hits = *phits; + // auto& hits = *phits; auto const& clusters = *pclusters; auto isPhase2 = cpeParams->commonParams().isPhase2; @@ -175,18 +176,19 @@ namespace gpuPixelRecHits { pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]); - hits.detectorIndex(h) = me; + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX() = 
clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; // keep it local for computations float xg, yg, zg; @@ -197,12 +199,12 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); } __syncthreads(); } // end loop on batches diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 4af0238682abb..11a69fead8ad3 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -56,7 +56,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) From 8016c995db6364391f8e955d52d7337016084671 Mon Sep 17 00:00:00 2001 From: AdrianoDee Date: Fri, 18 Nov 2022 12:33:50 +0100 Subject: [PATCH 110/110] Pixel hits portable (WIP) - II --- .../interface/TrackingRecHitSoADevice.h | 30 +++++++++-------- .../interface/TrackingRecHitsUtilities.h | 26 +++++++++++---- .../test/TrackingRecHitSoA_test.cpp | 18 +++++++++-- 
.../test/TrackingRecHitSoA_test.cu | 15 ++++++--- .../plugins/PixelRecHitGPUKernel.cu | 9 ++++-- .../plugins/PixelRecHitGPUKernel.h | 1 + .../plugins/SiPixelRecHitCUDA.cc | 3 ++ .../plugins/SiPixelRecHitFromCUDA.cc | 32 +++++++++++++++---- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 1 + 9 files changed, 99 insertions(+), 36 deletions(-) diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h index fad70322a7c35..104eab337af3f 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -14,10 +14,11 @@ class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) + : PortableDeviceCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), hitsModuleStart_(hitsModuleStart) { nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; phiBinner_ = &(view().phiBinner()); + // phiBinner_ = cms::cuda::make_device_unique(stream).get(); cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); cudaCheck(cudaMemcpyAsync(&(view().hitsModuleStart()), hitsModuleStart, sizeof(uint32_t) * int(nModules_),cudaMemcpyHostToDevice,stream)); @@ -26,34 +27,35 @@ class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection localCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(5 * nHits(), stream); + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); size_t rowSize = sizeof(float) * nHits(); - cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().xLocal() , rowSize, 
cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaMemcpyAsync(ret.get() + (rowSize * 2), view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaMemcpyAsync(ret.get() + (rowSize * 3) , view().xLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + printf("%d \n",nModules()); + printf("%d \n",nHits()); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize * 4, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().yLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 2), view().xerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 3) , view().yerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); return ret; - } + } //move to utilities - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nModules() + 1, stream); - cudaCheck(cudaMemcpyAsync(ret.get(), view().hitsModuleStart().begin(), sizeof(uint32_t) * (nModules() + 1), cudaMemcpyDeviceToHost, stream)); - return ret; - } auto phiBinnerStorage() { return phiBinnerStorage_; } + auto hitsModuleStart() const { return hitsModuleStart_; } auto phiBinner() { return phiBinner_; } private: uint32_t nHits_; //Needed for the host SoA size - pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much (only once in BrokenLineFit), would make sens to remove it from this class. + pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. 
+ uint32_t const* hitsModuleStart_; + uint32_t nModules_; trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_; trackingRecHitSoA::PhiBinner* phiBinner_; + }; diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h index f9cc022e571e3..c37636d68a138 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -6,6 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "DataFormats/SoATemplate/interface/SoALayout.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "SiPixelHitStatus.h" namespace trackingRecHitSoA{ @@ -21,16 +22,15 @@ namespace trackingRecHitSoA{ }; using hindex_type = uint32_t; // if above is <=2^32 - using PhiBinner = cms::cuda:: - HistoContainer; //28 for phase2 geometry + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry using PhiBinnerStorageType = PhiBinner::index_type; using AverageGeometry = pixelTopology::AverageGeometry; using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPU; - using HitLayerStartArray = std::array; - using HitModuleStartArray = std::array; + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; } @@ -49,20 +49,34 @@ GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, SOA_COLUMN(int16_t, clusterSizeX), SOA_COLUMN(int16_t, clusterSizeY), SOA_COLUMN(int16_t, detectorIndex), + SOA_COLUMN(trackingRecHitSoA::PhiBinnerStorageType, phiBinnerStorage), + + SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart), + SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart), SOA_SCALAR(trackingRecHitSoA::ParamsOnGPU, cpeParams), SOA_SCALAR(trackingRecHitSoA::AverageGeometry, averageGeometry), SOA_SCALAR(trackingRecHitSoA::PhiBinner, phiBinner), - 
SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart), - SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart), + SOA_SCALAR(uint32_t, nHits), SOA_SCALAR(int32_t, offsetBPIX2), SOA_SCALAR(uint32_t, nMaxModules)) namespace trackingRecHitSoA { + using HitSoAView = TrackingRecHitSoALayout<>::View; using HitSoAConstView = TrackingRecHitSoALayout<>::ConstView; + constexpr size_t columnsSizes = 8 * sizeof(float) + 4 * sizeof(int16_t) + sizeof(trackingRecHitSoA::SiPixelHitStatusAndCharge) + sizeof(trackingRecHitSoA::PhiBinnerStorageType); + + // cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(HitSoAConstView& view, cudaStream_t stream) { + // // printf("%d \n",nModules()); + // auto ret = cms::cuda::make_host_unique(view.nMaxModules() + 1, stream); + // cudaCheck(cudaMemcpyAsync(ret.get(), view.hitsModuleStart().data(), sizeof(uint32_t) * (view.nMaxModules() + 1), cudaMemcpyDeviceToHost, stream)); + // return ret; + // } + + } #endif diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp index 7f4308ebf1492..eda9a97c02859 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -18,7 +18,7 @@ int main() { cms::cudatest::requireDevices(); cudaStream_t stream; - cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault)); // inner scope to deallocate memory before destroying the stream @@ -34,9 +34,23 @@ int main() { TrackingRecHitSoADevice tkhit(nHits,false,offset,nullptr,&moduleStart[0],stream); testTrackingRecHit2DNew::run(tkhit,stream); + printf("tkhit hits %d \n",tkhit.nHits()); auto test = tkhit.localCoordToHostAsync(stream); - printf("tkhit hits %d \n",tkhit.nHits()); + printf("test[9] %.2f\n",test[9]); + + // auto mods = tkhit.hitsModuleStartToHostAsync(stream); + // auto ret = 
cms::cuda::make_host_unique(tkhit.nModules() + 1, stream); + // uint32_t* ret; + // // cudaCheck(cudaMemcpyAsync(ret, &(tkhit.view().hitsModuleStart()), sizeof(uint32_t) * (tkhit.nModules() + 1), cudaMemcpyDeviceToHost, stream)); + // size_t skipSize = int(trackingRecHitSoA::columnsSizes * nHits); + // cudaCheck(cudaMemcpyAsync(ret, + // tkhit.const_buffer().get() + skipSize, + // sizeof(uint32_t) * (1856 + 1), + // cudaMemcpyDeviceToHost, + // ctx.stream())); + + printf("mods[9] %d\n",tkhit.hitsModuleStart()[9]); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu index 93f4dde061786..eb042219d3a33 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -18,7 +18,7 @@ namespace testTrackingRecHit2DNew { soa[i].iphi() = i%10; soa.hitsLayerStart()[j] = j; //k = soa.test().a; - + __syncthreads(); } __global__ void show(trackingRecHitSoA::HitSoAView soa) { @@ -26,9 +26,11 @@ namespace testTrackingRecHit2DNew { int i = threadIdx.x; int j = blockIdx.x; + if(i==0 and j==0) { printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("mMaxModules = %d \n", soa.nMaxModules()); printf("offsetBPIX %d ->%d \n",i,soa.offsetBPIX2()); printf("nHits %d ->%d \n",i,soa.nHits()); printf("hitsModuleStart %d ->%d \n",i,soa.hitsModuleStart().at(28)); @@ -39,7 +41,7 @@ namespace testTrackingRecHit2DNew { if(j*blockDim.x+i < soa.phiBinner().nbins()) printf(">bin size %d ->%d \n",j*blockDim.x+i,soa.phiBinner().size(j*blockDim.x+i)); - + __syncthreads(); } @@ -48,19 +50,22 @@ namespace testTrackingRecHit2DNew { // assert(soa); printf("RUN!\n"); int k = 0; - show<<<10, 100, 0, stream>>>(hits.view()); + fill<<<10, 100, 0, stream>>>(hits.view()); printf("k = %d\n",k); + cudaCheck(cudaDeviceSynchronize()); + cms::cuda::fillManyFromVector(hits.phiBinner(), 10, hits.view().iphi(), 
hits.view().hitsLayerStart().data(), 2000, 256, - hits.phiBinnerStorage(), + hits.view().phiBinnerStorage(), stream); - + cudaCheck(cudaDeviceSynchronize()); show<<<10, 1000, 0, stream>>>(hits.view()); + cudaCheck(cudaDeviceSynchronize()); } } // namespace testTrackingRecHit2D diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 6fd9a57a6cc72..0fea13b849ef1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -13,6 +13,7 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +#define GPU_DEBUG namespace { __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -69,13 +70,13 @@ namespace pixelgpudetails { setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); auto nLayers = isPhase2 ? 
phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; - cms::cuda::fillManyFromVector(&(hits_d.view().phiBinner()), + cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, hits_d.view().iphi(), hits_d.view().hitsLayerStart().data(), nHits, 256, - hits_d.phiBinnerStorage(), + hits_d.view().phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); @@ -84,6 +85,10 @@ namespace pixelgpudetails { #endif } } + #ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); + std::cout << "DONE" << std::endl; + #endif return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 5d55e713391e1..ada509f57de0a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -10,6 +10,7 @@ #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#define GPU_DEBUG namespace pixelgpudetails { class PixelRecHitGPUKernel { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 1c050e037144e..2f5c5710b1586 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -23,6 +23,8 @@ #include "PixelRecHitGPUKernel.h" +#define GPU_DEBUG + class SiPixelRecHitCUDA : public edm::global::EDProducer<> { public: explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); @@ -82,6 +84,7 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons tokenHit_, gpuAlgo_.makeHitsAsync( digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); + std::cout << __LINE__< uint32_t nHits_; uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; + // uint32_t* hitsModuleStart_; 
cms::cuda::host::unique_ptr hitsModuleStart_; }; @@ -74,15 +75,32 @@ void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - + std::cout << __LINE__< hclusters = iEvent.getHandle(clusterToken_); auto const& input = *hclusters; constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - + std::cout << __LINE__ << std::endl; int numberOfDetUnits = 0; int numberOfClusters = 0; for (auto const& dsv : input) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index db0940f0f50f7..9dbab6c900030 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -14,6 +14,7 @@ #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#define GPU_DEBUG namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,