diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h deleted file mode 100644 index 3ee5af80353dd..0000000000000 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h -#define CUDADataFormats_Track_PixelTrackHeterogeneous_h - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" - -using PixelTrackHeterogeneous = HeterogeneousSoA; - -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h \ No newline at end of file diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h new file mode 100644 index 0000000000000..ce3658f126a83 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -0,0 +1,146 @@ +#ifndef CUDADataFormats_Track_PixelTrackUtilities_h +#define CUDADataFormats_Track_PixelTrackUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +namespace pixelTrack { + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string const &name) { + auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; + return static_cast(qp); + } + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using HitContainer = cms::cuda::OneToManyAssoc; + +} // namespace pixelTrack + +// Aliases in order to not confuse the GENERATE_SOA_LAYOUT +// macro with weird colons and angled brackets. +using Vector5f = Eigen::Matrix; +using Vector15f = Eigen::Matrix; +using HitContainer = pixelTrack::HitContainer; + +GENERATE_SOA_LAYOUT(TrackSoAHeterogeneousLayout, + SOA_COLUMN(uint8_t, quality), + SOA_COLUMN(float, chi2), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) + +// Previous TrajectoryStateSoAT class methods. +// They operate on View and ConstView of the TrackSoA. 
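For orientation, a minimal host-side usage sketch of the generated layout together with the free functions defined just below (illustrative only, not part of this patch; the names come from the headers added here, while the cudaStream_t stream and the loop are assumptions):

    pixelTrack::TrackSoAHost tracks(stream);              // PortableHostCollection over TrackSoAHeterogeneousLayout
    auto view = tracks.view();
    for (int i = 0; i < view.metadata().size(); ++i) {
      if (pixelTrack::utilities::nHits(view, i) == 0)
        break;                                             // guard on empty slots, as in the DQM plugins
      float pt  = view[i].pt();                            // direct column access through the view
      float phi = pixelTrack::utilities::phi(view, i);     // state(0) at the beam spot
    }
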
+namespace pixelTrack { + namespace utilities { + using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } + + __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(0); } + + __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(1); } + + __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { return tracks[i].state()(4); } + + __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { + return tracks[i].nLayers() == 3; + } + + template + __host__ __device__ inline void copyFromCircle( + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, V5 const &v, M5 const &cov, int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(const TrackSoAConstView &tracks, V5 &v, M5 &cov, int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + + // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer + __host__ __device__ inline int computeNumberOfLayers(TrackSoAView &tracks, int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = phase1PixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = phase1PixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + __host__ __device__ inline int nHits(const TrackSoAConstView &tracks, int i) { return tracks.detIndices().size(i); } + + // Casts quality SoA data (uint8_t) to pixelTrack::Quality. 
This is required + // to use the data as an enum instead of a plain uint8_t + __host__ __device__ inline const Quality *qualityData(const TrackSoAConstView &tracks) { + return reinterpret_cast(tracks.quality()); + } + __host__ __device__ inline Quality *qualityData(TrackSoAView tracks) { + return reinterpret_cast(tracks.quality()); + } + + } // namespace utilities +} // namespace pixelTrack + +namespace pixelTrack { + // Common types for both Host and Device code + using TrackSoALayout = TrackSoAHeterogeneousLayout<>; + using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; + using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; + +} // namespace pixelTrack + +#endif diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..71b8dc48b8b35 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -0,0 +1,27 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + TrackSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace pixelTrack { + + using TrackSoADevice = TrackSoAHeterogeneousDevice; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..70427f2bfd559 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + TrackSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace pixelTrack { + + using TrackSoAHost = TrackSoAHeterogeneousHost; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h deleted file mode 100644 index 356ea3eddeb7f..0000000000000 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include 
"HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -namespace pixelTrack { - enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; - constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; - const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; - inline Quality qualityByName(std::string const &name) { - auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; - return static_cast(qp); - } -} // namespace pixelTrack - -template -class TrackSoAHeterogeneousT { -public: - static constexpr int32_t stride() { return S; } - - using Quality = pixelTrack::Quality; - using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... -private: - eigenSoA::ScalarSoA quality_; - -public: - constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } - constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - eigenSoA::ScalarSoA nLayers; - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = phase1PixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = phase1PixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoAT stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - -private: - int nTracks_; -}; - -namespace pixelTrack { - -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif - - using TrackSoA = TrackSoAHeterogeneousT; - using TrajectoryState = TrajectoryStateSoAT; - using HitContainer = TrackSoA::HitContainer; - -} // namespace pixelTrack - -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h deleted file mode 100644 index 64fcd573a6991..0000000000000 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" - -template -struct TrajectoryStateSoAT { - using Vector5f = Eigen::Matrix; - using Vector15f = Eigen::Matrix; - - using Vector5d = Eigen::Matrix; - using Matrix5d = Eigen::Matrix; - - static constexpr int32_t stride() { return S; } - - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *= b; - auto cov = covariance(i); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - state(i) = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - covariance(i)(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = state(i).template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = covariance(i)(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = covariance(i)(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc new file mode 100644 index 0000000000000..24792bb6350f8 --- /dev/null +++ b/CUDADataFormats/Track/src/TrackSoAHeterogeneous_t_test.cc @@ -0,0 +1,2 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 97c116f6c88d3..338598e28ebf5 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" 
#include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 9c80ae91baf29..c4337b0b7ee06 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,8 @@ - - - - + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..e91df3fc785f7 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,19 +1,12 @@ - - - - - - + + - - - - - + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp index 9708b689dd05b..b3d62ffa810f6 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include #include diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..572a84cdd2d73 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,72 @@ +/** + Simple test for the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to Host, where + a temporary host-side view (tmp_view) is created using + the same Layout to access the data on host and print it. + */ + +#include +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +namespace testTrackSoAHeterogeneousT { + + void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream); +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate tracks on device. PortableDeviceCollection allocates + // SoA on device automatically. + pixelTrack::TrackSoADevice tracks_d(stream); + testTrackSoAHeterogeneousT::runKernels(tracks_d.view(), stream); + + // Instantate tracks on host. This is where the data will be + // copied to from device. 
+ pixelTrack::TrackSoAHost tracks_h(stream); + //tracks_d.copyToHost(tracks_h.buffer(), stream); + cudaCheck(cudaMemcpyAsync( + tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaGetLastError()); + + // Print results + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; + } + } + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..8273f011ace80 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,52 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace testTrackSoAHeterogeneousT { + + __global__ void fill(pixelTrack::TrackSoAView tracks_view) { + int i = threadIdx.x; + if (i == 0) { + tracks_view.nTracks() = 420; + } + + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + tracks_view[j].quality() = (uint8_t)j % 256; + tracks_view[j].nLayers() = j % 128; + tracks_view.hitIndices().off[j] = j; + } + } + + // TODO: Use TrackSoAConstView when https://github.com/cms-sw/cmssw/pull/39919 is merged + __global__ void verify(pixelTrack::TrackSoAView tracks_view) { + int i = threadIdx.x; + + if (i == 0) { + printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x); + assert(tracks_view.nTracks() == 420); + } + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + assert(tracks_view[j].quality() == j % 256); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == j); + } + } + + void runKernels(pixelTrack::TrackSoAView tracks_view, cudaStream_t stream) { + fill<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + verify<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + } + +} // namespace testTrackSoAHeterogeneousT diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp deleted file mode 100644 index d6ff539a642b0..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu deleted file mode 100644 index d6ff539a642b0..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu +++ /dev/null @@ -1 +0,0 @@ -#include 
"TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h deleted file mode 100644 index 97b88873c2613..0000000000000 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ /dev/null @@ -1,75 +0,0 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" - -using Vector5d = Eigen::Matrix; -using Matrix5d = Eigen::Matrix; - -__host__ __device__ Matrix5d loadCov(Vector5d const& e) { - Matrix5d cov; - for (int i = 0; i < 5; ++i) - cov(i, i) = e(i) * e(i); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < i; ++j) { - double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined - cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v; - cov(j, i) = cov(i, j); - } - } - return cov; -} - -using TS = TrajectoryStateSoAT<128>; - -__global__ void testTSSoA(TS* pts, int n) { - assert(n <= 128); - - Vector5d par0; - par0 << 0.2, 0.1, 3.5, 0.8, 0.1; - Vector5d e0; - e0 << 0.01, 0.01, 0.035, -0.03, -0.01; - auto cov0 = loadCov(e0); - - TS& ts = *pts; - - int first = threadIdx.x + blockIdx.x * blockDim.x; - - for (int i = first; i < n; i += blockDim.x * gridDim.x) { - ts.copyFromDense(par0, cov0, i); - Vector5d par1; - Matrix5d cov1; - ts.copyToDense(par1, cov1, i); - Vector5d delV = par1 - par0; - Matrix5d delM = cov1 - cov0; - for (int j = 0; j < 5; ++j) { - assert(std::abs(delV(j)) < 1.e-5); - for (auto k = j; k < 5; ++k) { - assert(cov0(k, j) == cov0(j, k)); - assert(cov1(k, j) == cov1(j, k)); - assert(std::abs(delM(k, j)) < 1.e-5); - } - } - } -} - -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#endif - -int main() { -#ifdef __CUDACC__ - cms::cudatest::requireDevices(); -#endif - - TS ts; - -#ifdef __CUDACC__ - TS* ts_d; - cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); - testTSSoA<<<1, 64>>>(ts_d, 128); - cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); - cudaCheck(cudaDeviceSynchronize()); -#else - testTSSoA(&ts, 128); -#endif -} diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 8ce37f280ac6c..98112285fce13 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -4,6 +4,7 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" template class TrackingRecHit2DHeterogeneous { diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h new file mode 100644 index 0000000000000..104eab337af3f --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -0,0 +1,62 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackingRecHitSoADevice(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) + : PortableDeviceCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams), hitsModuleStart_(hitsModuleStart) + { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + phiBinner_ = &(view().phiBinner()); + // phiBinner_ = cms::cuda::make_device_unique(stream).get(); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().nMaxModules()), &nModules_, sizeof(uint32_t),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().hitsModuleStart()), hitsModuleStart, sizeof(uint32_t) * int(nModules_),cudaMemcpyHostToDevice,stream)); + // cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(pixelCPEforGPU::ParamsOnGPU)),cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t),cudaMemcpyHostToDevice,stream)); + + } + + uint32_t nHits() const { return nHits_; } //go to size of view + uint32_t nModules() const { return nModules_; } + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + size_t rowSize = sizeof(float) * nHits(); + printf("%d \n",nModules()); + printf("%d \n",nHits()); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal() , rowSize * 4, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().yLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 2), view().xerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 3) , view().yerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + return ret; + } //move to utilities + + + auto phiBinnerStorage() { return phiBinnerStorage_; } + auto hitsModuleStart() const { return hitsModuleStart_; } + auto phiBinner() { return phiBinner_; } + + private: + uint32_t nHits_; //Needed for the host SoA size + pixelCPEforGPU::ParamsOnGPU const* cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. 
+ uint32_t const* hitsModuleStart_; + + uint32_t nModules_; + trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_; + trackingRecHitSoA::PhiBinner* phiBinner_; + +}; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h new file mode 100644 index 0000000000000..cb76d538474da --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -0,0 +1,45 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection> { +public: + TrackingRecHitSoAHost() = default; + + // This SoA Host is used basically only for DQM + // so we just need a slim constructor + explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) + : PortableHostCollection>(nHits, stream) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, bool isPhase2, int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, cudaStream_t stream) + : PortableHostCollection>(nHits, stream), nHits_(nHits), cpeParams_(cpeParams) + { + nModules_ = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + + view().nHits() = nHits; + view().nMaxModules() = nModules_; + std::copy(hitsModuleStart,hitsModuleStart+nModules_,view().hitsModuleStart().begin()); + + view().offsetBPIX2() = offsetBPIX2; + + } + + uint32_t nHits() const { return nHits_; } + uint32_t nModules() const { return nModules_; } + auto phiBinnerStorage() { return phiBinnerStorage_; } + + private: + uint32_t nHits_; //Needed for the host SoA size + pixelCPEforGPU::ParamsOnGPU const* cpeParams_; + uint32_t nModules_; + trackingRecHitSoA::PhiBinnerStorageType* phiBinnerStorage_; +}; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h new file mode 100644 index 0000000000000..c37636d68a138 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -0,0 +1,82 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h +#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "SiPixelHitStatus.h" + +namespace trackingRecHitSoA{ + + // more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field + struct SiPixelHitStatusAndCharge { + SiPixelHitStatus status; + uint32_t charge : 24; + }; + + struct Test { + int a; + }; + + using hindex_type = uint32_t; // if above is <=2^32 + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + using PhiBinnerStorageType = PhiBinner::index_type; + + using AverageGeometry = 
pixelTopology::AverageGeometry; + + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPU; + + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; + +} + + +GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), // this is chi2/ndof as not necessarely all hits are used in the fit + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + SOA_COLUMN(int16_t, iphi), + SOA_COLUMN(trackingRecHitSoA::SiPixelHitStatusAndCharge, chargeAndStatus), + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY), + SOA_COLUMN(int16_t, detectorIndex), + SOA_COLUMN(trackingRecHitSoA::PhiBinnerStorageType, phiBinnerStorage), + + SOA_SCALAR(trackingRecHitSoA::HitModuleStartArray,hitsModuleStart), + SOA_SCALAR(trackingRecHitSoA::HitLayerStartArray,hitsLayerStart), + + SOA_SCALAR(trackingRecHitSoA::ParamsOnGPU, cpeParams), + SOA_SCALAR(trackingRecHitSoA::AverageGeometry, averageGeometry), + SOA_SCALAR(trackingRecHitSoA::PhiBinner, phiBinner), + + SOA_SCALAR(uint32_t, nHits), + SOA_SCALAR(int32_t, offsetBPIX2), + SOA_SCALAR(uint32_t, nMaxModules)) + +namespace trackingRecHitSoA +{ + + using HitSoAView = TrackingRecHitSoALayout<>::View; + using HitSoAConstView = TrackingRecHitSoALayout<>::ConstView; + + constexpr size_t columnsSizes = 8 * sizeof(float) + 4 * sizeof(int16_t) + sizeof(trackingRecHitSoA::SiPixelHitStatusAndCharge) + sizeof(trackingRecHitSoA::PhiBinnerStorageType); + + // cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(HitSoAConstView& view, cudaStream_t stream) { + // // printf("%d \n",nModules()); + // auto ret = cms::cuda::make_host_unique(view.nMaxModules() + 1, stream); + // cudaCheck(cudaMemcpyAsync(ret.get(), view.hitsModuleStart().data(), sizeof(uint32_t) * (view.nMaxModules() + 1), cudaMemcpyDeviceToHost, stream)); + // return ret; + // } + + +} +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index abecfb38797de..b43537c915e3d 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -4,6 +4,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_SiPixelCluster_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index f633d77c48ef7..fe92a1b6ea31e 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -7,4 +7,8 @@ + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index ce49c46fffba0..77626dbf724ff 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,5 +1,12 @@ + - + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp new file mode 100644 index 0000000000000..eda9a97c02859 --- 
/dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -0,0 +1,59 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackingRecHit2DNew { + + void run(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault)); + + + // inner scope to deallocate memory before destroying the stream + { + uint32_t nHits = 2000; + int32_t offset = 100; + uint32_t moduleStart[1856]; + + for (size_t i = 0; i < 1856; i++) { + moduleStart[i] = i*2; + } + + TrackingRecHitSoADevice tkhit(nHits,false,offset,nullptr,&moduleStart[0],stream); + + testTrackingRecHit2DNew::run(tkhit,stream); + printf("tkhit hits %d \n",tkhit.nHits()); + + auto test = tkhit.localCoordToHostAsync(stream); + printf("test[9] %.2f\n",test[9]); + + // auto mods = tkhit.hitsModuleStartToHostAsync(stream); + // auto ret = cms::cuda::make_host_unique(tkhit.nModules() + 1, stream); + // uint32_t* ret; + // // cudaCheck(cudaMemcpyAsync(ret, &(tkhit.view().hitsModuleStart()), sizeof(uint32_t) * (tkhit.nModules() + 1), cudaMemcpyDeviceToHost, stream)); + // size_t skipSize = int(trackingRecHitSoA::columnsSizes * nHits); + // cudaCheck(cudaMemcpyAsync(ret, + // tkhit.const_buffer().get() + skipSize, + // sizeof(uint32_t) * (1856 + 1), + // cudaMemcpyDeviceToHost, + // ctx.stream())); + + printf("mods[9] %d\n",tkhit.hitsModuleStart()[9]); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu new file mode 100644 index 0000000000000..eb042219d3a33 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -0,0 +1,71 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +namespace testTrackingRecHit2DNew { + + __global__ void fill(trackingRecHitSoA::HitSoAView soa) { + // assert(soa); + + int i = threadIdx.x; + int j = blockIdx.x; + if(i==0 and j==0) + { + + soa.offsetBPIX2() = 22; + soa[10].xLocal() =1.11; + } + + soa[i].iphi() = i%10; + soa.hitsLayerStart()[j] = j; + //k = soa.test().a; + __syncthreads(); + } + + __global__ void show(trackingRecHitSoA::HitSoAView soa) { + // assert(soa); + + int i = threadIdx.x; + int j = blockIdx.x; + + if(i==0 and j==0) + { + printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("mMaxModules = %d \n", soa.nMaxModules()); + printf("offsetBPIX %d ->%d \n",i,soa.offsetBPIX2()); + printf("nHits %d ->%d \n",i,soa.nHits()); + printf("hitsModuleStart %d ->%d \n",i,soa.hitsModuleStart().at(28)); + } + + if(i%d \n",i,soa[i].iphi()); + + if(j*blockDim.x+i < soa.phiBinner().nbins()) + printf(">bin size %d ->%d \n",j*blockDim.x+i,soa.phiBinner().size(j*blockDim.x+i)); + __syncthreads(); + } + + + + void run(TrackingRecHitSoADevice& hits, cudaStream_t stream) { + // assert(soa); + printf("RUN!\n"); + int k = 0; + 
fill<<<10, 100, 0, stream>>>(hits.view()); + printf("k = %d\n",k); + + cudaCheck(cudaDeviceSynchronize()); + + cms::cuda::fillManyFromVector(hits.phiBinner(), + 10, + hits.view().iphi(), + hits.view().hitsLayerStart().data(), + 2000, + 256, + hits.view().phiBinnerStorage(), + stream); + cudaCheck(cudaDeviceSynchronize()); + show<<<10, 1000, 0, stream>>>(hits.view()); + cudaCheck(cudaDeviceSynchronize()); + } + +} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h deleted file mode 100644 index 417a960951fb1..0000000000000 --- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H -#define CUDADataFormatsVertexZVertexHeterogeneous_H - -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" - -using ZVertexHeterogeneous = HeterogeneousSoA; -#ifndef __CUDACC__ -#include "CUDADataFormats/Common/interface/Product.h" -using ZVertexCUDAProduct = cms::cuda::Product; -#endif - -#endif diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h deleted file mode 100644 index e31b87f30fa11..0000000000000 --- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef CUDADataFormats_Vertex_ZVertexSoA_h -#define CUDADataFormats_Vertex_ZVertexSoA_h - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -// SOA for vertices -// These vertices are clusterized and fitted only along the beam line (z) -// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) -struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 32 * 1024; - static constexpr uint32_t MAXVTX = 1024; - - int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) - float zv[MAXVTX]; // output z-posistion of found vertices - float wv[MAXVTX]; // output weight (1/error^2) on the above - float chi2[MAXVTX]; // vertices chi2 - float ptv2[MAXVTX]; // vertices pt^2 - int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) - uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending - uint32_t nvFinal; // the number of vertices - - __host__ __device__ void init() { nvFinal = 0; } -}; - -#endif // CUDADataFormats_Vertex_ZVertexSoA_h diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..ca97e2533b8d1 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -0,0 +1,27 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + ZVertexSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) 
+ : PortableDeviceCollection>(S, stream) {} +}; + +namespace ZVertex { + + using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; + +} // namespace ZVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..4c07bb3ffedb4 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -0,0 +1,25 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + ZVertexSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace ZVertex { + + using ZVertexSoAHost = ZVertexSoAHeterogeneousHost; + +} // namespace ZVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h new file mode 100644 index 0000000000000..d0614abee91c9 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h +#define CUDADataFormats_Vertex_ZVertexUtilities_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + +// Previous ZVertexSoA class methods. +// They operate on View and ConstView of the ZVertexSoA. 
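As for the track SoA above, a minimal sketch of the intended use of the vertex SoA (illustrative only, not part of this patch; it assumes an existing cudaStream_t stream):

    ZVertex::ZVertexSoAHost vertices(stream);      // PortableHostCollection over ZVertexSoAHeterogeneousLayout
    ZVertex::utilities::init(vertices.view());     // resets nvFinal() to 0, replacing the old ZVertexSoA::init()
    uint32_t nv = vertices.view().nvFinal();       // number of reconstructed vertices
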
+namespace ZVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + + } // namespace utilities +} // namespace ZVertex + +#endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 7931beaa8f4bd..0340affffa06c 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,8 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index ea633080af9af..58616cbb534fa 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,6 +1,8 @@ - - - - + + + + + + diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index 7b12f694d4e8c..36c045582c942 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareTrackSoA // Class: SiPixelPhase1CompareTrackSoA // -/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc +/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -20,7 +20,9 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" // for string manipulations #include @@ -71,8 +73,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackCPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; const pixelTrack::Quality minQuality_; @@ -111,10 +113,12 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // // constructors // +// Note that the GPU TrackSoA is also of type TrackSoAHost, as the data have +// been copied from Device to Host SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) - : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + : 
tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), + tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), @@ -138,12 +142,12 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoaCPU = *tsoaHandleCPU->get(); - auto const& tsoaGPU = *tsoaHandleGPU->get(); - auto maxTracksCPU = tsoaCPU.stride(); //this should be same for both? - auto maxTracksGPU = tsoaGPU.stride(); //this should be same for both? - auto const* qualityCPU = tsoaCPU.qualityData(); - auto const* qualityGPU = tsoaGPU.qualityData(); + auto& tsoaCPU = *tsoaHandleCPU.product(); + auto& tsoaGPU = *tsoaHandleGPU.product(); + auto maxTracksCPU = tsoaCPU.view().metadata().size(); //this should be same for both? + auto maxTracksGPU = tsoaGPU.view().metadata().size(); //this should be same for both? + auto const* qualityCPU = pixelTrack::utilities::qualityData(tsoaCPU.view()); + auto const* qualityGPU = pixelTrack::utilities::qualityData(tsoaGPU.view()); int32_t nTracksCPU = 0; int32_t nTracksGPU = 0; int32_t nLooseAndAboveTracksCPU = 0; @@ -153,9 +157,9 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? std::vector looseTrkidxGPU; for (int32_t jt = 0; jt < maxTracksGPU; ++jt) { - if (tsoaGPU.nHits(jt) == 0) + if (pixelTrack::utilities::nHits(tsoaGPU.view(), jt) == 0) break; // this is a guard - if (!(tsoaGPU.pt(jt) > 0.)) + if (!(tsoaGPU.view()[jt].pt() > 0.)) continue; nTracksGPU++; if (useQualityCut_ && qualityGPU[jt] < minQuality_) @@ -166,9 +170,18 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Now loop over CPU tracks//nested loop for loose gPU tracks for (int32_t it = 0; it < maxTracksCPU; ++it) { - if (tsoaCPU.nHits(it) == 0) + float chi2CPU = tsoaCPU.view()[it].chi2(); + int nHitsCPU = pixelTrack::utilities::nHits(tsoaCPU.view(), it); + int8_t nLayersCPU = tsoaCPU.view()[it].nLayers(); + float ptCPU = tsoaCPU.view()[it].pt(); + float etaCPU = tsoaCPU.view()[it].eta(); + float phiCPU = pixelTrack::utilities::phi(tsoaCPU.view(), it); + float zipCPU = pixelTrack::utilities::zip(tsoaCPU.view(), it); + float tipCPU = pixelTrack::utilities::tip(tsoaCPU.view(), it); + + if (nHitsCPU == 0) break; // this is a guard - if (!(tsoaCPU.pt(it) > 0.)) + if (!(ptCPU > 0.)) continue; nTracksCPU++; if (useQualityCut_ && qualityCPU[it] < minQuality_) @@ -178,12 +191,10 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: const int32_t notFound = -1; int32_t closestTkidx = notFound; float mindr2 = dr2cut_; - float etacpu = tsoaCPU.eta(it); - float phicpu = tsoaCPU.phi(it); for (auto gid : looseTrkidxGPU) { - float etagpu = tsoaGPU.eta(gid); - float phigpu = tsoaGPU.phi(gid); - float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu); + float etaGPU = tsoaGPU.view()[gid].eta(); + float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), gid); + float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU); if (dr2 > dr2cut_) continue; // this is arbitrary if (mindr2 > dr2) { @@ -192,27 +203,36 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: } } - hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it)); //all CPU tk - 
hphi_z_tkAllCPU_->Fill(phicpu, tsoaCPU.zip(it)); + hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU); //all CPU tk + hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU); if (closestTkidx == notFound) continue; nLooseAndAboveTracksCPU_matchedGPU++; - hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx)); - hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx)); - hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx)); - hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx)); - hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx)); - hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx)); - htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx)); - hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx)); - hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx)); - hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx))); - hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx)); - hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it)); //matched to gpu - hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it)); + float chi2GPU = tsoaGPU.view()[closestTkidx].chi2(); + int nHitsGPU = pixelTrack::utilities::nHits(tsoaGPU.view(), closestTkidx); + int8_t nLayersGPU = tsoaGPU.view()[closestTkidx].nLayers(); + float ptGPU = tsoaGPU.view()[closestTkidx].pt(); + float etaGPU = tsoaGPU.view()[closestTkidx].eta(); + float phiGPU = pixelTrack::utilities::phi(tsoaGPU.view(), closestTkidx); + float zipGPU = pixelTrack::utilities::zip(tsoaGPU.view(), closestTkidx); + float tipGPU = pixelTrack::utilities::tip(tsoaGPU.view(), closestTkidx); + + hchi2_->Fill(chi2CPU, chi2GPU); + hnHits_->Fill(nHitsCPU, nHitsGPU); + hnLayers_->Fill(nLayersCPU, nLayersGPU); + hpt_->Fill(ptCPU, ptCPU); + hptLogLog_->Fill(ptCPU, ptGPU); + heta_->Fill(etaCPU, etaGPU); + hphi_->Fill(phiCPU, phiGPU); + hz_->Fill(zipCPU, zipGPU); + htip_->Fill(tipCPU, tipGPU); + hptdiffMatched_->Fill(ptCPU - ptGPU); + hetadiffMatched_->Fill(etaCPU - etaGPU); + hphidiffMatched_->Fill(reco::deltaPhi(phiCPU, phiGPU)); + hzdiffMatched_->Fill(zipCPU - zipGPU); + hpt_eta_tkAllCPUMatched_->Fill(etaCPU, ptCPU); //matched to gpu + hphi_z_tkAllCPUMatched_->Fill(phiCPU, zipCPU); } hnTracks_->Fill(nTracksCPU, nTracksGPU); hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc index 0113ea50973d8..9172824631da2 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc @@ -18,7 +18,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { @@ -31,8 +31,9 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexCPU_; + // Note that this has been copied from device to host, 
hence is a HostCollection + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string topFolderName_; const float dzCut_; @@ -54,8 +55,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} @@ -78,10 +79,10 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoaCPU = *vsoaHandleCPU->get(); - int nVerticesCPU = vsoaCPU.nvFinal; - auto const& vsoaGPU = *vsoaHandleGPU->get(); - int nVerticesGPU = vsoaGPU.nvFinal; + auto& vsoaCPU = *vsoaHandleCPU; + int nVerticesCPU = vsoaCPU.view().nvFinal(); + auto& vsoaGPU = *vsoaHandleGPU; + int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; @@ -97,22 +98,22 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: } for (int ivc = 0; ivc < nVerticesCPU; ivc++) { - auto sic = vsoaCPU.sortInd[ivc]; - auto zc = vsoaCPU.zv[sic]; + auto sic = vsoaCPU.view()[ivc].sortInd(); + auto zc = vsoaCPU.view()[sic].zv(); auto xc = x0 + dxdz * zc; auto yc = y0 + dydz * zc; zc += z0; - auto ndofCPU = vsoaCPU.ndof[sic]; - auto chi2CPU = vsoaCPU.chi2[sic]; + auto ndofCPU = vsoaCPU.view()[sic].ndof(); + auto chi2CPU = vsoaCPU.view()[sic].chi2(); const int32_t notFound = -1; int32_t closestVtxidx = notFound; float mindz = dzCut_; for (int ivg = 0; ivg < nVerticesGPU; ivg++) { - auto sig = vsoaGPU.sortInd[ivg]; - auto zgc = vsoaGPU.zv[sig] + z0; + auto sig = vsoaGPU.view()[ivg].sortInd(); + auto zgc = vsoaGPU.view()[sig].zv() + z0; auto zDist = std::abs(zc - zgc); //insert some matching condition if (zDist > dzCut_) @@ -125,12 +126,12 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: if (closestVtxidx == notFound) continue; - auto zg = vsoaGPU.zv[closestVtxidx]; + auto zg = vsoaGPU.view()[closestVtxidx].zv(); auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofGPU = vsoaGPU.ndof[closestVtxidx]; - auto chi2GPU = vsoaGPU.chi2[closestVtxidx]; + auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof(); + auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); hy_->Fill(yc - y0, yg - y0); @@ -140,7 +141,7 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: hzdiff_->Fill(zc - zg); hchi2_->Fill(chi2CPU, chi2GPU); hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU); - hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]); + hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2()); hntrks_->Fill(ndofCPU + 1, ndofGPU + 1); } hnVertex_->Fill(nVerticesCPU, nVerticesGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc index 622895ba07bcc..b4c996afc7055 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc +++ 
b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc @@ -21,7 +21,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" // for string manipulations #include @@ -34,7 +35,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; std::string topFolderName_; bool useQualityCut_; pixelTrack::Quality minQuality_; @@ -62,7 +63,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); //"SiPixelHeterogeneous/PixelTrackSoA"; useQualityCut_ = iConfig.getParameter("useQualityCut"); minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality")); @@ -78,23 +79,24 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoa = *((tsoaHandle.product())->get()); - auto maxTracks = tsoa.stride(); - auto const* quality = tsoa.qualityData(); + auto& tsoa = *tsoaHandle.product(); + auto maxTracks = tsoa.view().metadata().size(); + auto const* quality = pixelTrack::utilities::qualityData(tsoa.const_view()); int32_t nTracks = 0; int32_t nLooseAndAboveTracks = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - auto nLayers = tsoa.nLayers(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.const_view(), it); + auto nLayers = tsoa.view()[it].nLayers(); if (nHits == 0) break; // this is a guard - float pt = tsoa.pt(it); + float pt = tsoa.view()[it].pt(); if (!(pt > 0.)) continue; // fill the quality for all tracks - pixelTrack::Quality qual = tsoa.quality(it); + // pixelTrack::Quality qual = tsoa.view()[it].quality(); + pixelTrack::Quality qual = quality[it]; hquality->Fill(int(qual)); nTracks++; @@ -102,11 +104,11 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: continue; // fill parameters only for quality >= loose - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); - float zip = tsoa.zip(it); - float eta = tsoa.eta(it); - float tip = tsoa.tip(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa.const_view(), it); + float zip = pixelTrack::utilities::zip(tsoa.const_view(), it); + float eta = tsoa.view()[it].eta(); + float tip = pixelTrack::utilities::tip(tsoa.const_view(), it); hchi2->Fill(chi2); hChi2VsPhi->Fill(phi, chi2); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index af6c240a69172..27e0df36a17a4 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include 
"CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { @@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } @@ -67,8 +67,8 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoa = *((vsoaHandle.product())->get()); - int nVertices = vsoa.nvFinal; + auto& vsoa = *vsoaHandle.product(); + int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -82,18 +82,18 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: dydz = bs.dydz(); } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.sortInd[iv]; - auto z = vsoa.zv[si]; + auto si = vsoa.view()[iv].sortInd(); + auto z = vsoa.view()[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.ndof[si]; - hchi2->Fill(vsoa.chi2[si]); - hchi2oNdof->Fill(vsoa.chi2[si] / ndof); - hptv2->Fill(vsoa.ptv2[si]); + auto ndof = vsoa.view()[si].ndof(); + hchi2->Fill(vsoa.view()[si].chi2()); + hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); + hptv2->Fill(vsoa.view()[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..0fea13b849ef1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -13,6 +13,7 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +#define GPU_DEBUG namespace { __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -34,7 +35,7 @@ namespace { namespace pixelgpudetails { - TrackingRecHit2DGPU PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -42,10 +43,11 @@ namespace pixelgpudetails { cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); - assert(hits_d.nMaxModules() == isPhase2 ? 
phase2PixelTopology::numberOfModules - : phase1PixelTopology::numberOfModules); + TrackingRecHitSoADevice hits_d(nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // TrackingRecHit2DGPU hits_d( + // nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules + // : phase1PixelTopology::numberOfModules); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -65,16 +67,16 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); auto nLayers = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, - hits_d.iphi(), - hits_d.hitsLayerStart(), + hits_d.view().iphi(), + hits_d.view().hitsLayerStart().data(), nHits, 256, - hits_d.phiBinnerStorage(), + hits_d.view().phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); @@ -83,6 +85,10 @@ namespace pixelgpudetails { #endif } } + #ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); + std::cout << "DONE" << std::endl; + #endif return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 8289c8db7f2f4..ada509f57de0a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -8,8 +8,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#define GPU_DEBUG namespace pixelgpudetails { class PixelRecHitGPUKernel { @@ -22,7 +23,7 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DGPU makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, pixelCPEforGPU::ParamsOnGPU const* cpeParams, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 8112e9ebd19c8..2f5c5710b1586 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ 
-23,6 +23,8 @@ #include "PixelRecHitGPUKernel.h" +#define GPU_DEBUG + class SiPixelRecHitCUDA : public edm::global::EDProducer<> { public: explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); @@ -37,7 +39,7 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT> tokenHit_; const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; @@ -47,7 +49,7 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig) tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>()) {} + tokenHit_(produces>()) {} void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -82,6 +84,7 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons tokenHit_, gpuAlgo_.makeHitsAsync( digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); + std::cout << __LINE__< { public: explicit SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig); @@ -40,7 +42,7 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; const edm::ESGetToken geomToken_; - const edm::EDGetTokenT> hitsToken_; // CUDA hits + const edm::EDGetTokenT> hitsToken_; // CUDA hits const edm::EDGetTokenT clusterToken_; // legacy clusters const edm::EDPutTokenT rechitsPutToken_; // legacy rechits const edm::EDPutTokenT hostPutToken_; @@ -48,13 +50,14 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer uint32_t nHits_; uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; + // uint32_t* hitsModuleStart_; cms::cuda::host::unique_ptr hitsModuleStart_; }; SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), hitsToken_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), + consumes>(iConfig.getParameter("pixelRecHitSrc"))), clusterToken_(consumes(iConfig.getParameter("src"))), rechitsPutToken_(produces()), hostPutToken_(produces()) {} @@ -69,18 +72,35 @@ void SiPixelRecHitFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - + std::cout << __LINE__< hclusters = iEvent.getHandle(clusterToken_); auto const& input = *hclusters; constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - + std::cout << __LINE__ << std::endl; int numberOfDetUnits = 0; int numberOfClusters = 0; for (auto const& dsv : input) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 7532470ebd3d4..aedaf6955c747 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -24,6 +24,9 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include 
"RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer { public: explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig); @@ -38,22 +41,24 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT hitsPutTokenCPU_; + const edm::EDGetTokenT> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; + TrackingRecHitSoAHost hits_h_; + uint32_t nMaxModules_; - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + // cms::cuda::host::unique_ptr store32_; + // cms::cuda::host::unique_ptr store16_; + // cms::cuda::host::unique_ptr hitsModuleStart_; }; SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) : hitsTokenGPU_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces()), + consumes>(iConfig.getParameter("pixelRecHitSrc"))), + hitsPutTokenCPU_(produces()), hostPutToken_(produces()) {} void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -65,29 +70,42 @@ void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - nHits_ = inputData.nHits(); - LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; + nHits_ = inputData.view().nHits(); if (0 == nHits_) return; - nMaxModules_ = inputData.nMaxModules(); - store32_ = inputData.store32ToHostAsync(ctx.stream()); - store16_ = inputData.store16ToHostAsync(ctx.stream()); - hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); + + nMaxModules_ = inputData.view().nMaxModules(); + + hits_h_ = TrackingRecHitSoAHost(nHits_,ctx.stream()); + cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(), + inputData.const_buffer().get(), + inputData.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); + + + LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; + + // store32_ = inputData.store32ToHostAsync(ctx.stream()); + // store16_ = inputData.store16ToHostAsync(ctx.stream()); + // hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { auto hmsp = std::make_unique(nMaxModules_ + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hits_h_.view().hitsModuleStart().begin(),hits_h_.view().hitsModuleStart().end(),hmsp.get()); + // std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), 
hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_));//store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_); } DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..663674b2a4145 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -3,7 +3,7 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -25,6 +25,8 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" + #include "gpuPixelRecHits.h" class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { @@ -44,7 +46,7 @@ class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT tokenHit_; + const edm::EDPutTokenT tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; const bool isPhase2_; @@ -55,7 +57,7 @@ SiPixelRecHitSoAFromLegacy::SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces()}, + tokenHit_{produces()}, tokenModuleStart_{produces()}, convert2Legacy_(iConfig.getParameter("convertToLegacy")), isPhase2_(iConfig.getParameter("isPhase2")) { @@ -156,9 +158,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // output SoA // element 96 is the start of BPIX2 (i.e. 
the number of clusters in BPIX1) - auto output = std::make_unique( + auto output = std::make_unique( numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nMaxModules() == uint32_t(nMaxModules)); + assert(output->nModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { iEvent.put(std::move(output)); @@ -239,9 +241,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()->detectorIndex(h)); + assert(gind == output->view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); + assert(gpuClustering::invalidModuleId == output->view()[h].detectorIndex()); if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -250,8 +252,8 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); - LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + LocalPoint lp(output->view()[h].xLocal(), output->view()[h].yLocal()); + LocalError le(output->view()[h].xerrLocal(), 0, output->view()[h].yerrLocal()); SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); @@ -264,20 +266,21 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // fill data structure to support CA const auto nLayers = isPhase2_ ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + output->view().hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." 
<< i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] - << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n"; + << " - starts ad cluster: " << output->view()[i].hitsLayerStart() << "\n"; } - cms::cuda::fillManyFromVector(output->phiBinner(), + cms::cuda::fillManyFromVector(&(output->view().phiBinner()), nLayers, - output->iphi(), - output->hitsLayerStart(), - numberOfHits, + output->view().iphi(), + output->view().hitsLayerStart().data(), + output->nHits(), 256, output->phiBinnerStorage()); + LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; iEvent.put(std::move(output)); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 5b862b2cf63b9..9dbab6c900030 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,12 +7,14 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#define GPU_DEBUG namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, @@ -20,14 +22,14 @@ namespace gpuPixelRecHits { SiPixelDigisCUDASOAView const digis, int numElements, SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - TrackingRecHit2DSOAView* phits) { + trackingRecHitSoA::HitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - assert(phits); + // assert(phits); assert(cpeParams); - auto& hits = *phits; + // auto& hits = *phits; auto const& clusters = *pclusters; auto isPhase2 = cpeParams->commonParams().isPhase2; @@ -175,18 +177,19 @@ namespace gpuPixelRecHits { pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]); - hits.detectorIndex(h) = me; + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX() = clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits.yerrLocal(h) = 
clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; // keep it local for computations float xg, yg, zg; @@ -197,12 +200,12 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); } __syncthreads(); } // end loop on batches diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 4af0238682abb..11a69fead8ad3 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -56,7 +56,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index f3d6022e21654..6bf47b7302da1 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,9 +1,11 @@ #include #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/Event.h" @@ -30,22 +32,25 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; - edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoAVertex_; + // GPU + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + + // CPU + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; }; PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); + consumes>(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + consumes>(iConfig.getParameter("pixelVertexSrc")); } else { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoAVertex_ = 
consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -66,19 +71,20 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - auto const* tsoa = tracks.get(); + auto const* tsoa = &tracks; assert(tsoa); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); - auto const* vsoa = vertices.get(); + //auto const* vsoa = vertices.get(); + auto const* vsoa = &vertices; assert(vsoa); } else { - auto const* tsoa = iEvent.get(tokenSoATrack_).get(); - assert(tsoa); + auto const& tsoa = iEvent.get(tokenSoATrack_); + assert(tsoa.buffer()); - auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); - assert(vsoa); + auto const& vsoa = iEvent.get(tokenSoAVertex_); + assert(vsoa.buffer()); } } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 59ba877e9e626..36d3dd8c3dcc7 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,7 +27,9 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "storeTracks.h" @@ -35,7 +37,7 @@ /** * This class creates "leagcy" reco::Track - * objects from the output of SoA CA. + * objects from the output of SoA CA. 
*/ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { public: @@ -54,7 +56,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -67,7 +69,7 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -104,7 +106,6 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const { - // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity }; reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality, reco::TrackBase::undefQuality, reco::TrackBase::discarded, @@ -152,12 +153,10 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); - - auto const *quality = tsoa.qualityData(); - auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.hitIndices; - auto nTracks = tsoa.nTracks(); + auto &tsoa = iEvent.get(tokenTrack_); + auto const *quality = pixelTrack::utilities::qualityData(tsoa.view()); + auto const hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -166,13 +165,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, //sort index by pt std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); - std::sort( - sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); }); + std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - auto nHits = tsoa.nHits(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); + assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) @@ -189,12 +190,13 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); @@ -237,7 +239,6 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, // filter??? 
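// --- Editor's aside (not part of the patch): PixelTrackProducerFromSoA above orders tracks by pt
// without moving any SoA columns, by sorting an index array with std::iota/std::sort and then
// walking the SoA through those indices. A minimal, self-contained sketch of that pattern; a plain
// std::vector stands in for the SoA pt column and the function name is illustrative only:
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int32_t> sortIndicesByPtDescending(const std::vector<float> &pt) {
  std::vector<int32_t> idx(pt.size());
  std::iota(idx.begin(), idx.end(), 0);  // 0, 1, 2, ... one slot per track
  std::sort(idx.begin(), idx.end(),
            [&](int32_t i1, int32_t i2) { return pt[i1] > pt[i2]; });  // highest pt first
  return idx;  // idx[k] is the index of the k-th hardest track in the untouched SoA
}
// --- end aside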
tracks.emplace_back(track.release(), hits); } - // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; // store tracks storeTracks(iEvent, tracks, httopo); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 5cf4aac491901..191e9009f6d6e 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -2,7 +2,9 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -15,6 +17,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE @@ -32,15 +35,15 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + pixelTrack::TrackSoAHost tracks_h; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -52,43 +55,45 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - soa_ = inputData.toHostAsync(ctx.stream()); + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h = pixelTrack::TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(), + tracks_d.const_buffer().get(), + tracks_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa_; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto maxTracks = tracks_h.view().metadata().size(); + auto nTracks = 
tracks_h.view().nTracks(); assert(nTracks < maxTracks); + if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 << " candidates"; } #ifdef PIXEL_DEBUG_PRODUCE - std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; - std::cout << "found " << nTracks << " tracks in cpu SoA at " << &tsoa << std::endl; + std::cout << " stride " << maxTracks << std::endl; + std::cout << "found " << nTracks << std::endl; int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = pixelTrack::utilities::nHits(tracks_h.view(), it); + assert(nHits == int(tracks_h.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; } assert(nTracks == nt); #endif - // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(soa_)); - - assert(!soa_); + iEvent.emplace(tokenSOA_, std::move(tracks_h)); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelVertexProducerCUDA.cc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 6ec6afb83cba1..2b2d93cf7415a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -18,7 +18,7 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; +using OutputSoAView = pixelTrack::TrackSoAView; using tindex_type = caConstants::tindex_type; constexpr auto invalidTkId = std::numeric_limits::max(); @@ -169,12 +169,12 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, template __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, + OutputSoAView results_view, tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - assert(results); + // assert(results_view); // TODO Find equivalent assertion for View assert(pfast_fit); // same as above... 
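// --- Editor's aside (not part of the patch): acquire() above moves the whole device-side track SoA
// to a host SoA of identical layout with a single cudaMemcpyAsync on the framework stream; the data
// are usable on the host only after the stream has completed (in the plugin the ExternalWork
// callback guarantees that before produce() runs). A stripped-down sketch of the same idea with
// hypothetical names and an explicit synchronization instead of the callback:
#include <cuda_runtime.h>
#include <cstddef>

void copyDeviceSoAToHost(void *hostBuf, const void *devBuf, std::size_t bytes, cudaStream_t stream) {
  // enqueue the device-to-host copy; it runs after the kernels already queued on `stream`
  // (for a truly asynchronous copy the host buffer should be pinned, as the framework allocators provide)
  cudaMemcpyAsync(hostBuf, devBuf, bytes, cudaMemcpyDeviceToHost, stream);
  // here we wait explicitly; the plugin instead defers to the WaitingTaskWithArenaHolder
  cudaStreamSynchronize(stream);
}
// --- end aside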
@@ -203,10 +203,11 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ brokenline::lineFit(hits_ge, fast_fit, bField, data, line); brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); - results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + pixelTrack::utilities::copyFromCircle( + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = asinhf(line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 72c482c6189db..2e48865b682bf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -20,7 +20,8 @@ #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::global::EDProducer<> { @@ -39,10 +40,15 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool onGPU_; edm::ESGetToken tokenField_; + // GPU + // Produces a view on GPU, which is used by PixelTrackSoAFromCUDA edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; + + // CPU + // Produces a view on CPU, which is used by PixelTrackProducerFromSoA edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; @@ -52,10 +58,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) if (onGPU_) { tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..65a3f3a8dff4c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -78,15 +78,11 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr } template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); - - assert(tuples_d && quality_d); - +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // zero tuples - 
cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); auto nhits = hh.nHits(); @@ -119,24 +115,22 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tuples_d, + tracks_view, device_hitTuple_apc_, - quality_d, params_.minHitsPerNtuplet_); if (params_.doStats_) kernel_mark_used(device_theCells_.get(), device_nCells_); - cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(device_hitTuple_apc_, &tracks_view.hitIndices()); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_view, hh.view()); + kernel_fillNLayers(tracks_view, device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); + kernel_countMultiplicity(tracks_view, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_view, device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -144,14 +138,14 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * } template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { int32_t nhits = hh.nHits(); - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); - + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); + kernel_classifyTracks(tracks_view, quality_d, params_.cuts_); if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks @@ -159,38 +153,34 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_view, device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_view, device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { kernel_rejectDuplicate( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - 
params_.dupPassThrough_, - device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + + kernel_sharedHitCleaner( + hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } } if (params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, + kernel_checkOverflows(tracks_view, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), device_hitTuple_apc_, @@ -208,7 +198,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // counters (add flag???) std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); + kernel_doStatsForTracks(tracks_view, quality_d, counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -217,7 +207,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh.view(), tracks_view, device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..9cbdcae1a13d8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -2,14 +2,14 @@ #include template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // these are pointer on GPU! 
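// --- Editor's aside (not part of the patch): the signature changes in these kernels replace a
// pointer to a device-resident SoA (TkSoA*, which every kernel had to assert and dereference) with
// a small view object passed by value; the view only carries pointers and sizes, so copying it into
// kernel parameter space is cheap and removes one indirection. A generic sketch of the idea with a
// toy view struct (the real view type is generated by GENERATE_SOA_LAYOUT and is much richer):
#include <cuda_runtime.h>
#include <cstdint>

struct ToyTrackView {  // illustrative only
  float *pt;
  int32_t nTracks;
};

__global__ void scalePt(ToyTrackView view, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < view.nTracks)
    view.pt[i] *= factor;  // no pointer-to-struct dereference: the view arrived by value
}
// --- end aside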
- auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream); int32_t nhits = hh.nHits(); @@ -70,9 +70,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), - tuples_d, + tracks_view, device_hitTuple_apc_, - quality_d, params_.minHitsPerNtuplet_); cudaCheck(cudaGetLastError()); @@ -87,26 +86,25 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, + &tracks_view.hitIndices()); - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + kernel_fillHitDetIndices<<>>(tracks_view, hh.view()); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d, device_hitTuple_apc_); + kernel_fillNLayers<<>>(tracks_view, device_hitTuple_apc_); cudaCheck(cudaGetLastError()); // remove duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); blockSize = 128; numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countMultiplicity<<>>(tracks_view, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_fillMultiplicity<<>>(tracks_view, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -222,10 +220,11 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr } template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, + TkSoAView tracks_view, + cudaStream_t cudaStream) { // these are pointer on GPU! 
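// --- Editor's aside (not part of the patch): the numberOfBlocks expressions above all use the usual
// ceiling-division launch configuration, so every element gets a thread even when the work size is
// not a multiple of the block size. Sketch (helper name is illustrative only):
#include <cstdint>

constexpr uint32_t ceilDiv(uint32_t n, uint32_t blockSize) { return (n + blockSize - 1) / blockSize; }
// e.g. kernel<<<ceilDiv(nItems, 128), 128, 0, stream>>>(...);  // launches just enough blocks
// --- end aside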
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + auto *quality_d = pixelTrack::utilities::qualityData(tracks_view); int32_t nhits = hh.nHits(); @@ -233,7 +232,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // classify tracks based on kinematics auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); + kernel_classifyTracks<<>>(tracks_view, quality_d, params_.cuts_); + cudaCheck(cudaGetLastError()); if (params_.lateFishbone_) { @@ -247,7 +247,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_view, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -257,14 +257,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks<<>>(tracks_view, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -276,21 +275,17 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_rejectDuplicate<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - kernel_sharedHitCleaner<<>>(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_sharedHitCleaner<<>>( + hh.view(), tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { kernel_tripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + tracks_view, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -300,7 +295,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, + kernel_checkOverflows<<>>(tracks_view, device_tupleMultiplicity_.get(), 
device_hitToTuple_.get(), device_hitTuple_apc_, @@ -321,7 +316,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + kernel_doStatsForTracks<<>>(tracks_view, quality_d, counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -337,11 +332,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA ++iev; for (int k = 0; k < 20000; k += 500) { kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev); + hh.view(), tracks_view, device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev); + hh.view(), tracks_view, device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..529f7de99ea98 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,7 +3,8 @@ // #define GPU_DEBUG -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + #include "GPUCACell.h" // #define DUMP_GPU_TK_TUPLES @@ -34,7 +35,8 @@ namespace cAHitNtupletGenerator { using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; struct QualityCuts { @@ -173,7 +175,8 @@ class CAHitNtupletGeneratorKernels { using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; + using TkSoAView = pixelTrack::TrackSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; CAHitNtupletGeneratorKernels(Params const& params) @@ -182,9 +185,9 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void launchKernels(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoAView tracks_view, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index bbe5df891a735..75f52305ab39b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -14,6 +14,7 @@ #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CAConstants.h" #include "CAHitNtupletGeneratorKernels.h" @@ -28,7 +29,8 @@ using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; -using TkSoA = pixelTrack::TrackSoA; +using TkSoAView = pixelTrack::TrackSoAView; +using TkSoAConstView = pixelTrack::TrackSoAConstView; using HitContainer = pixelTrack::HitContainer; namespace { @@ -39,7 +41,7 @@ namespace { } // namespace -__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, +__global__ void kernel_checkOverflows(TkSoAView tracks_view, caConstants::TupleMultiplicity const *tupleMultiplicity, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -72,16 +74,16 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, nHits, hitToTuple->totOnes()); if (apc->get().m < caConstants::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + assert(tracks_view.hitIndices().size(apc->get().m) == 0); + assert(tracks_view.hitIndices().size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 7) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) > 7) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + assert(tracks_view.hitIndices().size(idx) <= caConstants::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) assert(int(*ih) < nHits); } #endif @@ -139,13 +141,10 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_ // It does not seem to affect efficiency in any way! 
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, uint32_t const *__restrict__ nCells, - TkSoA const *__restrict__ ptracks, - Quality *quality, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose - - auto const &tracks = *ptracks; + constexpr auto reject = (uint8_t)pixelTrack::Quality::edup; /// cannot be loose assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -159,7 +158,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks.nLayers(it); + auto nl = tracks_view[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -168,8 +167,8 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks.nLayers(it) < maxNl) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } } @@ -177,11 +176,11 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // assume the above (so, short tracks already removed) __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; + constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; assert(nCells); @@ -196,42 +195,42 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return tracks->nLayers(it) < 4 ? - std::abs(tracks->tip(it)) : // tip for triplets - tracks->chi2(it); //chi2 for quads + return tracks_view[it].nLayers() < 4 ? 
+ std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets + tracks_view[it].chi2(it); //chi2 for quads }; */ - auto score = [&](auto it) { return std::abs(tracks->tip(it)); }; + auto score = [&](auto it) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; // full crazy combinatorics int ntr = thisCell.tracks().size(); for (int i = 0; i < ntr - 1; ++i) { auto it = thisCell.tracks()[i]; - auto qi = tracks->quality(it); + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks->stateAtBS.state(it)(2); - auto e2opi = tracks->stateAtBS.covariance(it)(9); - auto cti = tracks->stateAtBS.state(it)(3); - auto e2cti = tracks->stateAtBS.covariance(it)(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; - auto qj = tracks->quality(jt); + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks->stateAtBS.state(jt)(2); - auto ctj = tracks->stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) < score(jt))) - tracks->quality(jt) = reject; + tracks_view[jt].quality() = reject; else { - tracks->quality(it) = reject; + tracks_view[it].quality() = reject; break; } } @@ -240,8 +239,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // find maxQual auto maxQual = reject; // no duplicate! 
for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > maxQual) - maxQual = tracks->quality(it); + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); } if (maxQual <= loose) @@ -249,7 +248,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // find min score for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { mc = score(it); im = it; } @@ -260,8 +259,8 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // mark all other duplicates (not yet, keep it loose) for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } } @@ -337,9 +336,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, - HitContainer *foundNtuplets, + TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) { // recursive: not obvious to widen auto const &hh = *hhp; @@ -357,8 +355,15 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, if (doit) { GPUCACell::TmpTuple stack; stack.reset(); - thisCell.find_ntuplets<6>( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + thisCell.find_ntuplets<6>(hh, + cells, + *cellTracks, + tracks_view.hitIndices(), + *apc, + pixelTrack::utilities::qualityData(tracks_view), + stack, + minHitsPerNtuplet, + pid < 3); assert(stack.empty()); // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } @@ -374,17 +379,16 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const * } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +// TODO: change arg type to TkSoAConstview +__global__ void kernel_countMultiplicity(TkSoAView tracks_view, caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad); if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -392,17 +396,16 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +// TODO: change arg type to TkSoAConstview +__global__ void kernel_fillMultiplicity(TkSoAView tracks_view, 
caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == (uint8_t)pixelTrack::Quality::bad); if (nhits > 7) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); @@ -410,13 +413,17 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt } } -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, - Quality *__restrict__ quality) { +/* + Takes the TkSoAView directly; the separate Quality pointer aliases the view's + quality column (stored as uint8_t), obtained via pixelTrack::utilities::qualityData(). + */ +__global__ void kernel_classifyTracks(TkSoAView tracks_view, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); + + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard @@ -433,11 +440,11 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(tracks_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif continue; } @@ -468,16 +475,16 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, }; // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float pt = std::min(tracks_view[it].pt(), cuts.chi2MaxPt); float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { + if (tracks_view[it].chi2() >= chi2Cut) { #ifdef NTUPLE_FIT_DEBUG printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - tracks->chi2(it)); + tracks_view.hitIndices().size(it), + tracks_view[it].pt(), + tracks_view[it].eta(), + tracks_view[it].chi2()); #endif continue; } @@ -490,20 +497,22 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm // (see CAHitNtupletGeneratorGPU.cc) auto const &region = (nhits > 3) ?
cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); + bool isOk = (std::abs(pixelTrack::utilities::tip(tracks_view, it)) < region.maxTip) and + (tracks_view[it].pt() > region.minPt) and + (std::abs(pixelTrack::utilities::zip(tracks_view, it)) < region.maxZip); - if (isOk) + if (isOk) { quality[it] = pixelTrack::Quality::highPurity; + } } } -__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, +__global__ void kernel_doStatsForTracks(TkSoAView tracks_view, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; //guard if (quality[idx] < pixelTrack::Quality::loose) continue; @@ -514,58 +523,56 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, +__global__ void kernel_countHitInTracks(TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->count(*h); } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, +__global__ void kernel_fillHitInTracks(TkSoAView tracks_view, // TODO: Make ConstView CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->fill(*h, idx); } } -__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { +__global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, TrackingRecHit2DSOAView const *__restrict__ hhp) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices auto const &hh = *hhp; auto nhits = 
hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tracks_view.hitIndices().content[idx] < nhits); + tracks_view.detIndices().content[idx] = hh.detectorIndex(tracks_view.hitIndices().content[idx]); } } -__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; +/* + Operates on the TkSoAView only; nHits() and computeNumberOfLayers() are free functions in + pixelTrack::utilities, and the SoA capacity comes from metadata().size() (formerly stride()). + */ +__global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { auto first = blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + auto ntracks = std::min(apc->get().m, tracks_view.metadata().size() - 1); if (0 == first) - tracks.setNTracks(ntracks); + tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + auto nHits = pixelTrack::utilities::nHits(tracks_view, idx); assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = pixelTrack::utilities::computeNumberOfLayers(tracks_view, idx); } } @@ -622,10 +629,8 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, HitContainer const *__restrict__ tuples, Quality *__restrict__ quality, bool dupPassThrough) { - // constexpr auto bad = pixelTrack::Quality::bad; constexpr auto dup = pixelTrack::Quality::dup; constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = pixelTrack::Quality::strict; // quality to mark rejected auto const reject = dupPassThrough ? loose : dup; @@ -642,16 +647,14 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, } // mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, +__global__ void kernel_rejectDuplicate(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -660,41 +663,41 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, /* chi2 is bad for large pt auto score = [&](auto it, auto nl) { - return nl < 4 ? std::abs(tracks.tip(it)) : // tip for triplets - tracks.chi2(it); //chi2 + return nl < 4 ?
std::abs(pixelTrack::utilities::tip(tracks_view, it)) : // tip for triplets + pixelTrack::utilities::chi2(tracks_view, it); //chi2 }; */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(pixelTrack::utilities::tip(tracks_view, it)); }; // full combinatorics for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { auto const it = *ip; - auto qi = quality[it]; + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; - auto qj = quality[jt]; + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks.nLayers(jt); + auto nlj = tracks_view[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; + tracks_view[jt].quality() = reject; else { - quality[it] = reject; + tracks_view[it].quality() = reject; break; } } @@ -703,18 +706,16 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, } __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + TkSoAView tracks_view, int nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? (uint8_t)pixelTrack::Quality::loose : (uint8_t)pixelTrack::Quality::dup; // quality of longest track - auto const longTqual = pixelTrack::Quality::highPurity; + auto const longTqual = (uint8_t)pixelTrack::Quality::highPurity; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; auto const &hh = *hhp; int l1end = hh.hitsLayerStart()[1]; @@ -728,10 +729,10 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // find maxNl for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) + if (tracks_view[*it].quality() < longTqual) continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -743,30 +744,28 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric // kill all tracks shorter than maxHl (only triplets??? 
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + auto nl = tracks_view[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } } -__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, +__global__ void kernel_tripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = (uint8_t)pixelTrack::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::strict; + auto const good = (uint8_t)pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -779,9 +778,9 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + if (tracks_view[*it].quality() <= good) continue; - onlyTriplets &= tracks.isTriplet(*it); + onlyTriplets &= pixelTrack::utilities::isTriplet(tracks_view, *it); if (!onlyTriplets) break; } @@ -793,8 +792,8 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // for triplets choose best tip! (should we first find best quality???) for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) { + mc = std::abs(pixelTrack::utilities::tip(tracks_view, it)); im = it; } } @@ -805,26 +804,24 @@ __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } __global__ void kernel_simpleTripletCleaner( - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = (uint8_t)pixelTrack::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::loose; + auto const good = (uint8_t)pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -837,8 +834,8 @@ __global__ void kernel_simpleTripletCleaner( // choose best tip! (should we first find best quality???) 
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(pixelTrack::utilities::tip(tracks_view, it)) < mc) { + mc = std::abs(pixelTrack::utilities::tip(tracks_view, it)); im = it; } } @@ -849,52 +846,50 @@ __global__ void kernel_simpleTripletCleaner( // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && pixelTrack::utilities::isTriplet(tracks_view, it) && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, + TkSoAView tracks_view, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, int iev) { - constexpr auto loose = pixelTrack::Quality::loose; + constexpr auto loose = (uint8_t)pixelTrack::Quality::loose; auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; + // auto const &foundNtuplets = *ptuples; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); + for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; i += blockDim.x * gridDim.x) { + auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; - if (quality[i] < loose) + if (tracks_view[i].quality() < loose) continue; printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", 10000 * iev + i, - int(quality[i]), + int(tracks_view[i].quality()), nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), + tracks_view[i].nLayers(), + pixelTrack::utilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + pixelTrack::utilities::phi(tracks_view, i), + pixelTrack::utilities::tip(tracks_view, i), + pixelTrack::utilities::zip(tracks_view, i), // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + tracks_view[i].chi2(), + hh.zGlobal(*tracks_view.hitIndices().begin(i)), + hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 1)), + hh.zGlobal(*(tracks_view.hitIndices().begin(i) + 2)), + nh > 3 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 3))) : 0, + nh > 4 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 4))) : 0, + nh > 5 ? hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + 5))) : 0, + nh > 6 ? 
hh.zGlobal(int(*(tracks_view.hitIndices().begin(i) + nh - 1))) : 0); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f650ca8ab2a08..4893ebdcc828f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -19,6 +19,8 @@ #include "FWCore/Utilities/interface/isFinite.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -184,29 +186,27 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); - - auto* soa = tracks.get(); - assert(soa); +pixelTrack::TrackSoADevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const { + pixelTrack::TrackSoADevice tracks(stream); + auto* soa = &tracks; CAHitNtupletGeneratorKernelsGPU kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), stream); kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); + kernels.launchKernels(hits_d, soa->view(), stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), soa->view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d, soa->view(), stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -217,25 +217,22 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH return tracks; } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); +pixelTrack::TrackSoAHost CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { + pixelTrack::TrackSoAHost tracks(nullptr); CAHitNtupletGeneratorKernelsCPU kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), nullptr); kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); + kernels.launchKernels(hits_d, tracks.view(), nullptr); if (0 == hits_d.nHits()) return tracks; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); @@ -243,16 +240,16 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), 
hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, soa, nullptr); + kernels.classifyTuples(hits_d, tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; #endif // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + + auto maxTracks = tracks.view().metadata().size(); + auto nTracks = tracks.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ae4576d883530..323ad0d071f0c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -3,7 +3,8 @@ #include #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -28,7 +29,8 @@ class CAHitNtupletGeneratorOnGPU { using hindex_type = TrackingRecHit2DSOAView::hindex_type; using Quality = pixelTrack::Quality; - using OutputSoA = pixelTrack::TrackSoA; + using OutputSoAHost = pixelTrack::TrackSoAHost; + using OutputSoADevice = pixelTrack::TrackSoADevice; using HitContainer = pixelTrack::HitContainer; using Tuple = HitContainer; @@ -47,9 +49,13 @@ class CAHitNtupletGeneratorOnGPU { void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + // On GPU + pixelTrack::TrackSoADevice makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, + float bfield, + cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + // On CPU + pixelTrack::TrackSoAHost makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2e3747a2b6841..c430a86129173 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -14,7 +14,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + #include "CAConstants.h" class GPUCACell { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 880bdb47dfb5c..f757f574f6142 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,16 +1,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void 
HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoA *helix_fit_results) { - tuples_ = tuples; +void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView helix_fit_results) { + tuples_ = &helix_fit_results.hitIndices(); tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; assert(tuples_); assert(tupleMultiplicity_); - assert(outputSoa_); + // assert(outputSoa_); // TODO find equivalent assertion for View } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 9a9c85970af33..9fd2112476c9d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,7 +1,7 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" @@ -36,7 +36,7 @@ class HelixFitOnGPU { using HitsView = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; - using OutputSoA = pixelTrack::TrackSoA; + using OutputSoAView = pixelTrack::TrackSoAView; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -50,7 +50,7 @@ class HelixFitOnGPU { void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView outputSoA); void deallocateOnGPU(); private: @@ -59,7 +59,7 @@ class HelixFitOnGPU { // fowarded Tuples const *tuples_ = nullptr; TupleMultiplicity const *tupleMultiplicity_ = nullptr; - OutputSoA *outputSoa_; + OutputSoAView outputSoa_; float bField_; const bool fitNas4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 926002d674b83..e511680bf76b7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -16,7 +16,7 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; +using OutputSoAView = pixelTrack::TrackSoAView; template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, @@ -128,13 +128,13 @@ template __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoAView results_view, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, riemannFit::CircleFit *__restrict__ circle_fit, uint32_t offset) { - assert(results); + // assert(results); // TODO find equivalent for View assert(circle_fit); assert(N <= nHits); @@ -149,7 +149,7 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ break; // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + 
tuple_idx); + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); @@ -159,11 +159,16 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ riemannFit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); - results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + pixelTrack::utilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index d480d7408b9e2..522b186f3351b 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -26,4 +26,5 @@ + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 34b0ed9e29fc1..db60fb7ebf4bb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -16,6 +16,10 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "gpuVertexFinder.h" @@ -35,10 +39,10 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; - edm::EDPutTokenT tokenCPUVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; const gpuVertexFinder::Producer gpuAlgo_; @@ -62,11 +66,11 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = produces(); + consumes>(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces>(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); - tokenCPUVertex_ = produces(); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUVertex_ = produces(); } } @@ -97,40 +101,35 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + 
edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); + auto& tracks = ctx.get(*hTracks); - assert(tracks); - - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); - assert(tracks); + auto& tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE - auto const& tsoa = *tracks; - auto maxTracks = tsoa.stride(); - std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + auto maxTracks = tracks.view().metadata().size(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = pixelTrack::utilities::nHits(tracks.view(), it); + assert(nHits == int(tracks.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; } - std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); } void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 029c619b42e58..61ec3f9a6a5be 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/OrphanHandle.h" #include "DataFormats/TrackReco/interface/Track.h" @@ -35,14 +36,14 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenVertex_; edm::EDGetTokenT tokenBeamSpot_; edm::EDGetTokenT tokenTracks_; edm::EDGetTokenT tokenIndToEdm_; }; PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) - : tokenVertex_(consumes(conf.getParameter("src"))), + : tokenVertex_(consumes(conf.getParameter("src"))), tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), tokenTracks_(consumes(conf.getParameter("TrackCollection"))), tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { @@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv dydz = bs.dydz(); } - auto const &soa = *(iEvent.get(tokenVertex_).get()); + auto const &soa = iEvent.get(tokenVertex_); - int nv = soa.nvFinal; + int nv = soa.view().nvFinal(); #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "converting " << nv << " vertices " @@ -92,20 +93,20 
@@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv std::set uind; // for verifing index consistency for (int j = nv - 1; j >= 0; --j) { - auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... assert(i < nv); uind.insert(i); assert(itrk.empty()); - auto z = soa.zv[i]; + auto z = soa.view()[i].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; reco::Vertex::Error err; - err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) = 1.f / soa.view()[i].wv(); err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) for (auto k = 0U; k < indToEdm.size(); ++k) { - if (soa.idv[k] == int16_t(i)) + if (soa.view()[k].idv() == int16_t(i)) itrk.push_back(k); } auto nt = itrk.size(); @@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv itrk.clear(); continue; } // remove outliers - (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt); + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); auto &v = (*vertexes).back(); v.reserve(itrk.size()); for (auto it : itrk) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index dc125878b1058..7dd714f22de6f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -2,7 +2,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + ZVertex::ZVertexSoAHost zvertex_h; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -50,16 +51,21 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - m_soa = inputData.toHostAsync(ctx.stream()); + auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device + 
zvertex_h = ZVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), + zvertex_d.const_buffer().get(), + zvertex_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // No copies.... - iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(zvertex_h)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..1c704d1374ca7 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h @@ -0,0 +1,25 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h + +#include +#include "WorkSpaceUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +template +class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + WorkSpaceSoAHeterogeneousDevice() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..1051da0bbcee8 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h @@ -0,0 +1,25 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h + +#include +#include "WorkSpaceUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + WorkSpaceSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoAHost = WorkSpaceSoAHeterogeneousHost; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h new file mode 100644 index 0000000000000..a86ade097ec7c --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h @@ -0,0 +1,36 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpace_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpace_h + +#include +#include 
+#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +GENERATE_SOA_LAYOUT(WorkSpaceSoAHeterogeneousLayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + +// Methods that operate on View and ConstView of the WorkSpaceSoALayout. +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoALayout = WorkSpaceSoAHeterogeneousLayout<>; + using WorkSpaceSoAView = WorkSpaceSoAHeterogeneousLayout<>::View; + using WorkSpaceSoAConstView = WorkSpaceSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + __host__ __device__ inline void init(WorkSpaceSoAView &workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + } // namespace utilities + } // namespace workSpace +} // namespace gpuVertexFinder + +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..4124f80e017eb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -17,8 +17,8 @@ namespace gpuVertexFinder { // // based on Rodrighez&Laio algo // - __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -32,20 +32,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -63,7 +64,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); @@ -197,7 +198,7 @@ namespace gpuVertexFinder { } __syncthreads(); - 
assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -219,8 +220,8 @@ namespace gpuVertexFinder { printf("found %d proto vertices\n", foundClusters); } - __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..43e420a4c0cbc 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... // enough for <10K tracks we have - __global__ void clusterTracksDBSCAN(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksDBSCAN(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,20 +28,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -59,7 +60,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -214,7 +215,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..1b172cabf9318 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have - __global__ void clusterTracksIterative(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksIterative(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,20 +28,21 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); using Hist = cms::cuda::HistoContainer; @@ -59,7 +60,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < ZVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -185,7 +186,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < ZVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..7b926023b4e19 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -12,27 +12,28 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void fitVertices(ZVertices* pdata, - WorkSpace* pws, + __device__ __forceinline__ void fitVertices(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; - - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); assert(nvFinal <= nvIntermediate); @@ -101,8 +102,8 @@ namespace gpuVertexFinder { printf("and %d noise\n", noise); } - __global__ void fitVerticesKernel(ZVertices* pdata, - WorkSpace* pws, + __global__ void fitVerticesKernel(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { fitVertices(pdata, pws, chi2Max); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..ff8cea612de47 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,16 +15,16 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ ptt2 = ws.ptt2; - uint32_t const& nvFinal = data.nvFinal; + __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WsSoAView pws) { + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); - int32_t const* __restrict__ iv = ws.iv; - float* __restrict__ ptv2 = data.ptv2; - uint16_t* __restrict__ sortInd = data.sortInd; + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); // if (threadIdx.x == 0) // printf("sorting %d vertices\n",nvFinal); @@ -34,7 +34,7 @@ namespace gpuVertexFinder { // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data.idv[ws.itrk[i]] = iv[i]; + data[ws[i].itrk()].idv() = iv[i]; } // can be done asynchronoisly at the end of previous event @@ -66,7 +66,7 @@ 
namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..f90978811b839 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,23 +12,24 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WsSoAView pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float const* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - - int32_t const* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + + int32_t const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + //TODO: check if there is a way to assert this + //assert(pdata); assert(zt); // one vertex per block @@ -120,7 +121,7 @@ namespace gpuVertexFinder { // get a new global vertex __shared__ uint32_t igv; if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); + igv = atomicAdd(&ws.nvIntermediate(), 1); __syncthreads(); for (auto k = threadIdx.x; k < nq; k += blockDim.x) { if (1 == newV[k]) @@ -130,7 +131,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) { splitVertices(pdata, pws, maxChi2); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..686f9899d8439 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,5 +1,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" + #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" #include "gpuClusterTracksIterative.h" @@ -18,27 +21,25 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); + __global__ void 
loadTracks(TkSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) { + //TODO: check if there is a way to assert this + //assert(soa); + auto const* quality = pixelTrack::utilities::qualityData(tracks_view); auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = pixelTrack::utilities::nHits(tracks_view, idx); assert(nHits >= 3); // initialize soa... - soa->idv[idx] = -1; + soa[idx].idv() = -1; - if (tracks.isTriplet(idx)) + if (pixelTrack::utilities::isTriplet(tracks_view, idx)) continue; // no triplets if (quality[idx] < pixelTrack::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = tracks_view[idx].pt(); if (pt < ptMin) continue; @@ -46,19 +47,19 @@ namespace gpuVertexFinder { // clamp pt pt = std::min(pt, ptMax); - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; + auto& data = pws; + auto it = atomicAdd(&data.ntrks(), 1); + data[it].itrk() = idx; + data[it].zt() = pixelTrack::utilities::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; } } // #define THREE_KERNELS #ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderOneKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -75,8 +76,8 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderKernel1(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -87,7 +88,7 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); @@ -95,37 +96,40 @@ namespace gpuVertexFinder { #endif #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, + TkSoAConstView tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + ZVertex::ZVertexSoADevice vertices(stream); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertex::ZVertexSoAHost Producer::make(TkSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertex::ZVertexSoAHost vertices(nullptr); #endif - 
assert(tksoa); - auto* soa = vertices.get(); - assert(soa); + auto soa = vertices.view(); + //TODO: check if there is a way to assert this + //assert(soa); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoADevice(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoAHost(nullptr); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, stream>>>(soa, ws_d.view()); auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; + loadTracks<<>>(tracks_view, soa, ws_d.view(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + init(soa, ws_d.view()); + loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -137,50 +141,51 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>( + soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... 
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max); } #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + fitVertices(soa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex! - splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); + splitVertices(soa, ws_d.view(), maxChi2ForSplit); + fitVertices(soa, ws_d.view(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.view()); #endif return vertices; diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 2b6a8107d927f..cc8224521680c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,46 +4,27 @@ #include #include -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "WorkSpaceUtilities.h" +#include "WorkSpaceSoAHeterogeneousHost.h" +#include "WorkSpaceSoAHeterogeneousDevice.h" namespace gpuVertexFinder { - using ZVertices = ZVertexSoA; - using TkSoA = pixelTrack::TrackSoA; + using VtxSoAView = ZVertex::ZVertexSoAView; + using TkSoAConstView = pixelTrack::TrackSoAConstView; + using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView; - // workspace used in the vertex reco algos - struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; - - uint32_t ntrks; // number of "selected tracks" - uint16_t itrk[MAXTRACKS]; // index of original track - float zt[MAXTRACKS]; // input track z at bs - float ezt2[MAXTRACKS]; // input error^2 on the above - float ptt2[MAXTRACKS]; // input pt^2 on the above - uint8_t izt[MAXTRACKS]; // interized z-position of input tracks - int32_t iv[MAXTRACKS]; // vertex index for each associated track - - uint32_t nvIntermediate; // the number of vertices after splitting 
pruning etc. - - __host__ __device__ void init() { - ntrks = 0; - nvIntermediate = 0; - } - }; - - __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { - pdata->init(); - pws->init(); + __global__ void init(VtxSoAView pdata, WsSoAView pws) { + ZVertex::utilities::init(pdata); + gpuVertexFinder::workSpace::utilities::init(pws); } class Producer { public: - using ZVertices = ZVertexSoA; - using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoA; - Producer(bool oneKernel, bool useDensity, bool useDBSCAN, @@ -64,8 +45,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const; + ZVertex::ZVertexSoADevice makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const; + ZVertex::ZVertexSoAHost make(TkSoAConstView tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..c2cea5a9a1f13 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" // TODO: included in order to compile Eigen columns first!!! 
+#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -23,22 +34,22 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ -__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, +__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, + gpuVertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata, pws, 50.); + gpuVertexFinder::fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata, pws, 9.f); + gpuVertexFinder::splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata, pws, 5000.); + gpuVertexFinder::fitVertices(pdata, pws, 5000.); __syncthreads(); - sortByPt2(pdata, pws); + gpuVertexFinder::sortByPt2(pdata, pws); } #endif #endif @@ -101,25 +112,24 @@ struct ClusterGenerator { std::exponential_distribution ptGen; }; -// a macro SORRY -#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) -#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) - -__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { - auto const& __restrict__ data = *pdata; - auto const& __restrict__ ws = *pws; - printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); +__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) { + auto& __restrict__ ws = pws; + printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate()); } int main() { + cudaStream_t stream; #ifdef __CUDACC__ cms::cudatest::requireDevices(); + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = cms::cuda::make_device_unique(1, nullptr); + ZVertex::ZVertexSoADevice onGPU_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoADevice ws_d(stream); #else - auto onGPU_d = std::make_unique(); - auto ws_d = std::make_unique(); + stream = nullptr; + + ZVertex::ZVertexSoAHost onGPU_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoAHost ws_d(stream); #endif Event ev; @@ -135,24 +145,26 @@ int main() { gen(ev); #ifdef __CUDACC__ - init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); #else - onGPU_d->init(); - ws_d->init(); + gpuVertexFinder::init(onGPU_d.view(), ws_d.view()); #endif std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), 
ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else - ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t)); + ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; @@ -168,30 +180,30 @@ int main() { uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.get(), ws_d.get()); - CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.get(), ws_d.get()); - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; + print(onGPU_d.view(), ws_d.view()); + CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); + print(onGPU_d.view(), ws_d.view()); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + nv = onGPU_d.view().nvFinal(); #endif if (nv == 0) { @@ -221,18 +233,18 @@ int main() { nn = hnn; ind = hind; #else - zv = onGPU_d->zv; - wv = onGPU_d->wv; - ptv2 = onGPU_d->ptv2; - nn = onGPU_d->ndof; - ind = onGPU_d->sortInd; + zv = onGPU_d.view().zv(); + wv = onGPU_d.view().wv(); + ptv2 = onGPU_d.view().ptv2(); + nn = 
onGPU_d.view().ndof(); + ind = onGPU_d.view().sortInd(); #endif #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -244,14 +256,14 @@ int main() { } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -264,26 +276,26 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! 
- cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f); + cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - splitVertices(onGPU_d.get(), ws_d.get(), 9.f); - nv = ws_d->nvIntermediate; + gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f); + nv = ws_d.view().nvIntermediate(); #endif std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f); cudaCheck(cudaGetLastError()); - cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); - sortByPt2(onGPU_d.get(), ws_d.get()); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f); + gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view()); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif if (nv == 0) { @@ -292,12 +304,12 @@ int main() { } #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index 34e04b0f7aedb..aa8565e9aed1f 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -45,12 +45,14 @@ #include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include 
"MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" namespace L2TauTagNNv1 { constexpr int nCellEta = 5; @@ -179,16 +181,16 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, + const ZVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); - void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + void selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); std::pair impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -207,8 +209,8 @@ class L2TauNNProducer : public edm::stream::EDProducer eeToken_; const edm::ESGetToken geometryToken_; const edm::ESGetToken bFieldToken_; - const edm::EDGetTokenT pataVerticesToken_; - const edm::EDGetTokenT pataTracksToken_; + const edm::EDGetTokenT pataVerticesToken_; + const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; const float fractionSumPt2_; @@ -292,8 +294,8 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd eeToken_(consumes(cfg.getParameter("eeInput"))), geometryToken_(esConsumes()), bFieldToken_(esConsumes()), - pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), - pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), + pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), + pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), fractionSumPt2_(cfg.getParameter("fractionSumPt2")), @@ -569,32 +571,32 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } } -void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, +void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertex::ZVertexSoAHost& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { - const auto maxTracks = patatracks_tsoa.stride(); - const int nv = patavtx_soa.nvFinal; + const auto maxTracks = patatracks_tsoa.view().metadata().size(); + const int nv = patavtx_soa.view().nvFinal(); trkGood.clear(); trkGood.reserve(maxTracks); vtxGood.clear(); vtxGood.reserve(nv); - auto const* quality = patatracks_tsoa.qualityData(); + auto const* quality = pixelTrack::utilities::qualityData(patatracks_tsoa.view()); // No need to sort either 
as the algorithms is just using the max (not even the location, just the max value of pt2sum). std::vector pTSquaredSum(nv, 0); std::vector nTrkAssociated(nv, 0); for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) { - auto nHits = patatracks_tsoa.nHits(trk_idx); + auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.view(), trk_idx); if (nHits == 0) { break; } - int vtx_ass_to_track = patavtx_soa.idv[trk_idx]; + int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { - auto patatrackPt = patatracks_tsoa.pt[trk_idx]; + auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; - if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) { + if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) { patatrackPt = std::min(patatrackPt, trackPtMax_); pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt; } @@ -606,7 +608,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, if (nv > 0) { const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_; for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) { - auto vtx_idx = patavtx_soa.sortInd[j]; + auto vtx_idx = patavtx_soa.view()[j].sortInd(); assert(vtx_idx < nv); if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac && pTSquaredSum[vtx_idx] > minSumPt2_) { @@ -617,15 +619,15 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, } std::pair L2TauNNProducer::impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi) { - auto const& fit = patatracks_tsoa.stateAtBS; + // auto const& fit = patatracks_tsoa.stateAtBS; /* dxy and dz */ riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); float sp = std::sin(patatrackPhi); @@ -650,8 +652,8 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const pixelTrack::TrackSoAHost& patatracks_tsoa, + const ZVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { using NNInputs = L2TauTagNNv1::NNInputs; @@ -675,19 +677,19 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const float tauPhi = allTaus[tau_idx]->phi(); for (const auto it : trkGood) { - const float patatrackPt = patatracks_tsoa.pt[it]; + const float patatrackPt = patatracks_tsoa.const_view()[it].pt(); if (patatrackPt <= 0) continue; - const float patatrackPhi = patatracks_tsoa.phi(it); - const float patatrackEta = patatracks_tsoa.eta(it); - const float patatrackCharge = patatracks_tsoa.charge(it); - const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it); - const auto nHits = patatracks_tsoa.nHits(it); + const float patatrackPhi = pixelTrack::utilities::phi(patatracks_tsoa.const_view(), it); + const float patatrackEta = patatracks_tsoa.const_view()[it].eta(); + const float 
patatrackCharge = pixelTrack::utilities::charge(patatracks_tsoa.const_view(), it); + const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2(); + const auto nHits = pixelTrack::utilities::nHits(patatracks_tsoa.const_view(), it); if (nHits <= 0) continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; - const int vtx_idx_assTrk = patavtx_soa.idv[it]; + const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) { std::tie(deta, dphi, eta_idx, phi_idx) = getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4()); @@ -763,8 +765,8 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset const auto eeCal = event.getHandle(eeToken_); const auto hbhe = event.getHandle(hbheToken_); const auto ho = event.getHandle(hoToken_); - const auto& patatracks_SoA = *event.get(pataTracksToken_); - const auto& vertices_SoA = *event.get(pataVerticesToken_); + auto& patatracks_SoA = event.get(pataTracksToken_); + auto& vertices_SoA = event.get(pataVerticesToken_); const auto bsHandle = event.getHandle(beamSpotToken_); auto const fieldESH = eventsetup.getHandle(bFieldToken_); diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 0e5823fc46c46..4301749e441fe 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -1,4 +1,6 @@ -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +//#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -45,7 +47,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { // Event data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; // Event setup tokens const edm::ESGetToken idealMagneticFieldToken_; const edm::ESGetToken trackerDigiGeometryToken_; @@ -55,7 +57,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("src"))), + tokenTrack_(consumes(iConfig.getParameter("src"))), idealMagneticFieldToken_(esConsumes()), trackerDigiGeometryToken_(esConsumes()), trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))), @@ -89,16 +91,16 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - const auto& tsoa = *(iEvent.get(tokenTrack_)); + auto& tsoa = iEvent.get(tokenTrack_); - auto const* quality = tsoa.qualityData(); - auto const& fit = tsoa.stateAtBS; - auto const& detIndices = tsoa.detIndices; - auto maxTracks = tsoa.stride(); + auto const* quality = pixelTrack::utilities::qualityData(tsoa.view()); + //auto const& fit = tsoa.stateAtBS; + auto const& detIndices = tsoa.view().detIndices(); + auto maxTracks = tsoa.view().metadata().size(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; 
++it) { - auto nHits = tsoa.nHits(it); + auto nHits = pixelTrack::utilities::nHits(tsoa.view(), it); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... @@ -120,11 +122,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co // mind: this values are respect the beamspot! - float phi = tsoa.phi(it); + float phi = pixelTrack::utilities::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + pixelTrack::utilities::copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
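As a side note to the migration above: the changes all follow one pattern. The hand-written ZVertexSoA / WorkSpace structs are replaced by layouts generated with GENERATE_SOA_LAYOUT, allocated through cms::cuda::PortableHostCollection / PortableDeviceCollection, and every field access goes through the generated View accessors: scalars as view.field(), per-element columns as view[i].field(), and whole columns as the pointer returned by view.field(). Below is a minimal host-side sketch of that access pattern only; it is not part of the patch, the ExampleLayout name, the fill() helper and the 32-element size are invented for illustration, and the headers are assumed to be the same ones used by the new WorkSpace files above.

#include <cstdint>
#include <cuda_runtime.h>

#include "DataFormats/SoATemplate/interface/SoALayout.h"
#include "CUDADataFormats/Common/interface/PortableHostCollection.h"

// Toy layout mirroring the shape of WorkSpaceSoAHeterogeneousLayout:
// one per-element column plus one scalar counter.
GENERATE_SOA_LAYOUT(ExampleLayout,
                    SOA_COLUMN(float, zt),        // per-element value
                    SOA_SCALAR(uint32_t, ntrks))  // single counter shared by the whole SoA

using ExampleSoAHost = cms::cuda::PortableHostCollection<ExampleLayout<>>;

void fill(cudaStream_t stream) {
  // Allocate pinned host memory for 32 elements, associated with the given stream.
  ExampleSoAHost collection(32, stream);
  auto view = collection.view();

  view.ntrks() = 0;  // scalar: accessed without an index
  for (uint32_t i = 0; i < 3; ++i) {
    view[i].zt() = 0.1f * i;  // column: accessed element by element...
    ++view.ntrks();
  }
  float* zt = view.zt();  // ...or as a raw pointer to the whole column
  (void)zt;
}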