From 4ee4c37d8c22842239482449a4f43f81ec2955f9 Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Tue, 13 Feb 2024 16:43:48 +0100 Subject: [PATCH] Splits ZVertexSoA into 2 layouts and wraps those in a multi layout collection. --- .../plugins/SiPixelCompareVertexSoAAlpaka.cc | 4 +- .../plugins/SiPixelMonitorVertexSoAAlpaka.cc | 16 ++-- .../VertexSoA/interface/ZVertexDevice.h | 12 +-- DataFormats/VertexSoA/interface/ZVertexHost.h | 12 +-- DataFormats/VertexSoA/interface/ZVertexSoA.h | 13 ++- DataFormats/VertexSoA/src/classes_def.xml | 4 +- .../VertexSoA/test/alpaka/ZVertexSoA_test.cc | 14 ++-- .../test/alpaka/ZVertexSoA_test.dev.cc | 26 +++--- .../src/L2TauTagNNProducerAlpaka.cc | 4 +- .../plugins/PixelTrackDumpAlpaka.cc | 4 +- .../PixelVertexProducerFromSoAAlpaka.cc | 5 +- .../plugins/alpaka/clusterTracksByDensity.h | 60 +++++++------- .../plugins/alpaka/clusterTracksDBSCAN.h | 59 +++++++------- .../plugins/alpaka/clusterTracksIterative.h | 4 +- .../plugins/alpaka/fitVertices.h | 7 +- .../plugins/alpaka/sortByPt2.h | 11 +-- .../plugins/alpaka/splitVertices.h | 11 ++- .../plugins/alpaka/vertexFinder.dev.cc | 50 ++++++++---- .../plugins/alpaka/vertexFinder.h | 1 + .../test/alpaka/VertexFinder_t.dev.cc | 80 ++++++++++++++----- 20 files changed, 245 insertions(+), 152 deletions(-) diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc index 2eea6a980d9c5..4bcdd2a1bb1cb 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc @@ -104,7 +104,7 @@ void SiPixelCompareVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: auto yc = y0 + dydz * zc; zc += z0; - auto ndofHost = vsoaHost.view()[sic].ndof(); + auto ndofHost = vsoaHost.view()[sic].ndof(); auto chi2Host = vsoaHost.view()[sic].chi2(); const int32_t notFound = -1; @@ -130,7 +130,7 @@ void SiPixelCompareVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofDevice = vsoaDevice.view()[closestVtxidx].ndof(); + auto ndofDevice = vsoaDevice.view()[closestVtxidx].ndof(); auto chi2Device = vsoaDevice.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc index d3121f77bccb8..fc57393975101 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc @@ -67,7 +67,9 @@ void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: } auto const& vsoa = *vsoaHandle; - int nVertices = vsoa.view().nvFinal(); + auto vtx_view = vsoa.view(); + auto trx_view = vsoa.view(); + int nVertices = vtx_view.nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -82,8 +84,8 @@ void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.view()[iv].sortInd(); - auto z = vsoa.view()[si].zv(); + auto si = vtx_view[iv].sortInd(); + auto z = vtx_view[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; @@ -91,10 +93,10 @@ void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.view()[si].ndof(); - hchi2->Fill(vsoa.view()[si].chi2()); - hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); - hptv2->Fill(vsoa.view()[si].ptv2()); + auto ndof = trx_view[si].ndof(); + hchi2->Fill(vtx_view[si].chi2()); + hchi2oNdof->Fill(vtx_view[si].chi2() / ndof); + hptv2->Fill(vtx_view[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/DataFormats/VertexSoA/interface/ZVertexDevice.h b/DataFormats/VertexSoA/interface/ZVertexDevice.h index 8d120ae190f3c..fb61450cf980e 100644 --- a/DataFormats/VertexSoA/interface/ZVertexDevice.h +++ b/DataFormats/VertexSoA/interface/ZVertexDevice.h @@ -9,18 +9,18 @@ #include "DataFormats/VertexSoA/interface/ZVertexHost.h" #include "DataFormats/Portable/interface/PortableDeviceCollection.h" -template -class ZVertexDeviceSoA : public PortableDeviceCollection, TDev> { +template +class ZVertexDeviceSoA : public PortableDeviceMultiCollection { public: ZVertexDeviceSoA() = default; // necessary for ROOT dictionaries - // Constructor which specifies the SoA size + // Constructor which specifies the queue template - explicit ZVertexDeviceSoA(TQueue queue) : PortableDeviceCollection, TDev>(S, queue) {} + explicit ZVertexDeviceSoA(TQueue queue) + : PortableDeviceMultiCollection({{NVTX, NTRX}}, queue) {} }; -using namespace ::zVertex; template -using ZVertexDevice = ZVertexDeviceSoA; +using ZVertexDevice = ZVertexDeviceSoA; #endif // DataFormats_VertexSoA_interface_ZVertexDevice_h diff --git a/DataFormats/VertexSoA/interface/ZVertexHost.h b/DataFormats/VertexSoA/interface/ZVertexHost.h index 2d72b83bfe385..1d1a1fd1a9483 100644 --- a/DataFormats/VertexSoA/interface/ZVertexHost.h +++ b/DataFormats/VertexSoA/interface/ZVertexHost.h @@ -10,20 +10,22 @@ #include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" #include "DataFormats/Portable/interface/PortableHostCollection.h" -template -class ZVertexHostSoA : public PortableHostCollection { +template +class ZVertexHostSoA : public PortableHostCollection2 { public: ZVertexHostSoA() = default; // Constructor which specifies the queue template - explicit ZVertexHostSoA(TQueue queue) : PortableHostCollection(S, queue) {} + explicit ZVertexHostSoA(TQueue queue) + : PortableHostCollection2({{NVTX, NTRX}}, queue) {} // Constructor which specifies the DevHost - explicit ZVertexHostSoA(alpaka_common::DevHost const& host) : PortableHostCollection(S, host) {} + explicit ZVertexHostSoA(alpaka_common::DevHost const& host) + : PortableHostCollection2({{NVTX, NTRX}}, host) {} }; //using namespace ::zVertex; -using ZVertexHost = ZVertexHostSoA; +using ZVertexHost = ZVertexHostSoA; #endif // DataFormats_VertexSoA_ZVertexHost_H diff --git a/DataFormats/VertexSoA/interface/ZVertexSoA.h b/DataFormats/VertexSoA/interface/ZVertexSoA.h index 940818a61caf8..8c2ff9c5c1db1 100644 --- a/DataFormats/VertexSoA/interface/ZVertexSoA.h +++ b/DataFormats/VertexSoA/interface/ZVertexSoA.h @@ -10,20 +10,29 @@ namespace reco { GENERATE_SOA_LAYOUT(ZVertexLayout, - SOA_COLUMN(int16_t, idv), // vertex index for each associated (original) track (-1 == not associate SOA_COLUMN(float, zv), // output z-posistion of found vertices SOA_COLUMN(float, wv), // output weight (1/error^2) on the above SOA_COLUMN(float, chi2), // vertices chi2 SOA_COLUMN(float, ptv2), // vertices pt^2 - SOA_COLUMN(int32_t, ndof), // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) SOA_COLUMN(uint16_t, sortInd), // sorted index (by pt2) ascending SOA_SCALAR(uint32_t, nvFinal)) // the number of vertices + GENERATE_SOA_LAYOUT( + ZVertexTracksLayout, + SOA_COLUMN(int16_t, idv), // vertex index for each associated (original) track (-1 == not associate + SOA_COLUMN(int32_t, + ndof)) // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) + // Common types for both Host and Device code using ZVertexSoA = ZVertexLayout<>; using ZVertexSoAView = ZVertexSoA::View; using ZVertexSoAConstView = ZVertexSoA::ConstView; + // Common types for both Host and Device code + using ZVertexTracksSoA = ZVertexTracksLayout<>; + using ZVertexTracksSoAView = ZVertexTracksSoA::View; + using ZVertexTracksSoAConstView = ZVertexTracksSoA::ConstView; + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } } // namespace reco diff --git a/DataFormats/VertexSoA/src/classes_def.xml b/DataFormats/VertexSoA/src/classes_def.xml index 820d28ecc3493..a1fb3f6ac8593 100644 --- a/DataFormats/VertexSoA/src/classes_def.xml +++ b/DataFormats/VertexSoA/src/classes_def.xml @@ -1,8 +1,8 @@ - - + + diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc index 0c0c8e8591df9..43b55ee5a0582 100644 --- a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc @@ -30,7 +30,7 @@ using namespace reco; namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace testZVertexSoAT { - void runKernels(ZVertexSoAView zvertex_view, Queue& queue); + void runKernels(ZVertexSoAView zvertex_view, ZVertexTracksSoAView zvertextracks_view, Queue& queue); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE @@ -44,7 +44,7 @@ int main() { // Instantiate vertices on device. PortableCollection allocates // SoA on device automatically. ZVertexSoACollection zvertex_d(queue); - testZVertexSoAT::runKernels(zvertex_d.view(), queue); + testZVertexSoAT::runKernels(zvertex_d.view(), zvertex_d.view(), queue); // Instantate vertices on host. This is where the data will be // copied to from device. @@ -70,11 +70,13 @@ int main() { << "\t" << "nvFinal" << std::endl; + auto vtx_v = zvertex_h.view(); + auto trx_v = zvertex_h.view(); for (int i = 0; i < 10; ++i) { - std::cout << (int)zvertex_h.view()[i].idv() << "\t" << zvertex_h.view()[i].zv() << "\t" - << zvertex_h.view()[i].wv() << "\t" << zvertex_h.view()[i].chi2() << "\t" << zvertex_h.view()[i].ptv2() - << "\t" << (int)zvertex_h.view()[i].ndof() << "\t" << (int)zvertex_h.view()[i].sortInd() << "\t" - << (int)zvertex_h.view().nvFinal() << std::endl; + auto vi = vtx_v[i]; + auto ti = trx_v[i]; + std::cout << (int)ti.idv() << "\t" << vi.zv() << "\t" << vi.wv() << "\t" << vi.chi2() << "\t" << vi.ptv2() << "\t" + << (int)ti.ndof() << "\t" << vi.sortInd() << "\t" << (int)vtx_v.nvFinal() << std::endl; } } diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc index 749073d1f916f..45802461aae9d 100644 --- a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc @@ -12,49 +12,57 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class TestFillKernel { public: template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, + reco::ZVertexSoAView zvertex_view, + reco::ZVertexTracksSoAView ztracks_view) const { if (cms::alpakatools::once_per_grid(acc)) { zvertex_view.nvFinal() = 420; } for (int32_t j : elements_with_stride(acc, zvertex_view.metadata().size())) { - zvertex_view[j].idv() = (int16_t)j; zvertex_view[j].zv() = (float)j; zvertex_view[j].wv() = (float)j; zvertex_view[j].chi2() = (float)j; zvertex_view[j].ptv2() = (float)j; - zvertex_view[j].ndof() = (int32_t)j; zvertex_view[j].sortInd() = (uint16_t)j; } + for (int32_t j : elements_with_stride(acc, ztracks_view.metadata().size())) { + ztracks_view[j].idv() = (int16_t)j; + ztracks_view[j].ndof() = (int32_t)j; + } } }; class TestVerifyKernel { public: template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, + reco::ZVertexSoAView zvertex_view, + reco::ZVertexTracksSoAView ztracks_view) const { if (cms::alpakatools::once_per_grid(acc)) { ALPAKA_ASSERT_ACC(zvertex_view.nvFinal() == 420); } for (int32_t j : elements_with_stride(acc, zvertex_view.nvFinal())) { - assert(zvertex_view[j].idv() == j); assert(zvertex_view[j].zv() - (float)j < 0.0001); assert(zvertex_view[j].wv() - (float)j < 0.0001); assert(zvertex_view[j].chi2() - (float)j < 0.0001); assert(zvertex_view[j].ptv2() - (float)j < 0.0001); - assert(zvertex_view[j].ndof() == j); assert(zvertex_view[j].sortInd() == uint32_t(j)); } + for (int32_t j : elements_with_stride(acc, ztracks_view.metadata().size())) { + assert(ztracks_view[j].idv() == j); + assert(ztracks_view[j].ndof() == j); + } } }; - void runKernels(reco::ZVertexSoAView zvertex_view, Queue& queue) { + void runKernels(reco::ZVertexSoAView zvertex_view, reco::ZVertexTracksSoAView ztracks_view, Queue& queue) { uint32_t items = 64; uint32_t groups = divide_up_by(zvertex_view.metadata().size(), items); auto workDiv = make_workdiv(groups, items); - alpaka::exec(queue, workDiv, TestFillKernel{}, zvertex_view); - alpaka::exec(queue, workDiv, TestVerifyKernel{}, zvertex_view); + alpaka::exec(queue, workDiv, TestFillKernel{}, zvertex_view, ztracks_view); + alpaka::exec(queue, workDiv, TestVerifyKernel{}, zvertex_view, ztracks_view); } } // namespace testZVertexSoAT diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc index 9772366c6b22e..8bc685a311832 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc @@ -595,7 +595,7 @@ void L2TauNNProducerAlpaka::selectGoodTracksAndVertices(const ZVertexHost& patav if (nHits == 0) { break; } - int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); + int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; @@ -692,7 +692,7 @@ void L2TauNNProducerAlpaka::fillPatatracks(tensorflow::Tensor& cellGridMatrix, continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; - const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); + const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) { std::tie(deta, dphi, eta_idx, phi_idx) = getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4()); diff --git a/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc index c4f0b97dba8a9..6ccb7789fc098 100644 --- a/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc +++ b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc @@ -59,12 +59,12 @@ void PixelTrackDumpAlpakaT::analyze(edm::StreamID streamID, assert(tracks.view().nTracks()); auto const& vertices = iEvent.get(tokenSoAVertex_); - assert(vertices.view().idv()); + assert(vertices.view().idv()); assert(vertices.view().zv()); assert(vertices.view().wv()); assert(vertices.view().chi2()); assert(vertices.view().ptv2()); - assert(vertices.view().ndof()); + assert(vertices.view().ndof()); assert(vertices.view().sortInd()); assert(vertices.view().nvFinal()); } diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc b/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc index 6e542f7870c2e..86561db386303 100644 --- a/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc +++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc @@ -103,7 +103,7 @@ void PixelVertexProducerFromSoAAlpaka::produce(edm::StreamID streamID, err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) for (auto k = 0U; k < indToEdm.size(); ++k) { - if (soa.view()[k].idv() == int16_t(i)) + if (soa.view()[k].idv() == int16_t(i)) itrk.push_back(k); } auto nt = itrk.size(); @@ -117,7 +117,8 @@ void PixelVertexProducerFromSoAAlpaka::produce(edm::StreamID streamID, itrk.clear(); continue; } // remove outliers - (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); + (*vertexes).emplace_back( + reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); auto &v = (*vertexes).back(); v.reserve(itrk.size()); for (auto it : itrk) { diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h index 122457a7d05d2..cddc1561606cd 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h @@ -16,27 +16,29 @@ #include "vertexFinder.h" namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - // this algo does not really scale as it works in a single block... - // enough for <10K tracks we have - // - // based on Rodrighez&Laio algo - // - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) - clusterTracksByDensity(const TAcc& acc, - VtxSoAView& pdata, - WsSoAView& pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster - ) { - using namespace vertexFinder; - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + + using VtxSoAView = ::reco::ZVertexSoAView; + using TrxSoAView = ::reco::ZVertexTracksSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + // + // based on Rodrighez&Laio algo + // + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) + clusterTracksByDensity(const TAcc& acc, + VtxSoAView& pdata, + TrxSoAView ptrxdata, + WsSoAView& pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + using namespace vertexFinder; + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); if constexpr (verbose) { if (cms::alpakatools::once_per_block(acc)) @@ -44,17 +46,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { } auto er2mx = errmax * errmax; - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); + auto& __restrict__ data = pdata; + auto& __restrict__ trxdata = ptrxdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate(); uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ nn = trxdata.ndof(); int32_t* __restrict__ iv = ws.iv(); ALPAKA_ASSERT_ACC(zt); @@ -238,13 +241,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { template ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, + TrxSoAView ptrxdata, WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster ) const { - clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + clusterTracksByDensity(acc, pdata, ptrxdata, pws, minT, eps, errmax, chi2max); } }; diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h index 7090599dcfdb0..6a7fdf9de7c45 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h @@ -17,40 +17,43 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - // this algo does not really scale as it works in a single block... - // enough for <10K tracks we have - class ClusterTracksDBSCAN { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, - int minT, // min number of neighbours to be "core" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster - ) const { - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); - } - auto er2mx = errmax * errmax; + using VtxSoAView = ::reco::ZVertexSoAView; + using TrxSoAView = ::reco::ZVertexTracksSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + class ClusterTracksDBSCAN { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + TrxSoAView ptrxdata, + WsSoAView pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } + auto er2mx = errmax * errmax; - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); + auto& __restrict__ data = pdata; + auto& __restrict__ trxdata = ptrxdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate(); uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ nn = trxdata.ndof(); int32_t* __restrict__ iv = ws.iv(); ALPAKA_ASSERT_ACC(zt); diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h index 38e8429c0d28f..4527f59e6f7eb 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h @@ -25,6 +25,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, + TrxSoAView ptrxdata, WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster @@ -40,6 +41,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto er2mx = errmax * errmax; auto& __restrict__ data = pdata; + auto& __restrict__ trxdata = ptrxdata; auto& __restrict__ ws = pws; auto nt = ws.ntrks(); float const* __restrict__ zt = ws.zt(); @@ -49,7 +51,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t& nvIntermediate = ws.nvIntermediate(); uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ nn = trxdata.ndof(); int32_t* __restrict__ iv = ws.iv(); ALPAKA_ASSERT_ACC(zt); diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h index a8c428e2f5a00..9eb1d851e49ba 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h @@ -18,12 +18,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { template ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices(const TAcc& acc, VtxSoAView& pdata, + TrxSoAView& ptrxdata, WsSoAView& pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false auto& __restrict__ data = pdata; + auto& __restrict__ trxdata = ptrxdata; auto& __restrict__ ws = pws; auto nt = ws.ntrks(); float const* __restrict__ zt = ws.zt(); @@ -34,7 +36,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate(); - int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ nn = trxdata.ndof(); int32_t* __restrict__ iv = ws.iv(); ALPAKA_ASSERT_ACC(nvFinal <= nvIntermediate); @@ -114,10 +116,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { template ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, + TrxSoAView ptrxdata, WsSoAView pws, float chi2Max // for outlier rejection ) const { - fitVertices(acc, pdata, pws, chi2Max); + fitVertices(acc, pdata, ptrxdata, pws, chi2Max); } }; diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h index ff8fab8ab635f..ee89bd298f565 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h @@ -20,10 +20,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { using VtxSoAView = ::reco::ZVertexSoAView; + using TrxSoAView = ::reco::ZVertexTracksSoAView; using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, WsSoAView& ws) { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, TrxSoAView& trxdata, WsSoAView& ws) { auto nt = ws.ntrks(); float const* __restrict__ ptt2 = ws.ptt2(); uint32_t const& nvFinal = data.nvFinal(); @@ -37,7 +38,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { // fill indexing for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - data.idv()[ws.itrk()[i]] = iv[i]; + trxdata.idv()[ws.itrk()[i]] = iv[i]; }; // can be done asynchronously at the end of previous event @@ -74,8 +75,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { class SortByPt2Kernel { public: template - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const { - sortByPt2(acc, pdata, pws); + ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, TrxSoAView ptrxdata, WsSoAView pws) const { + sortByPt2(acc, pdata, ptrxdata, pws); } }; diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h index e2ba0b46b8be4..46585f79ccdc1 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h @@ -16,16 +16,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { using VtxSoAView = ::reco::ZVertexSoAView; + using TrxSoAView = ::reco::ZVertexTracksSoAView; using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; template ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc, VtxSoAView& pdata, + TrxSoAView ptrxdata, WsSoAView& pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); auto& __restrict__ data = pdata; + auto& __restrict__ trxdata = ptrxdata; auto& __restrict__ ws = pws; auto nt = ws.ntrks(); float const* __restrict__ zt = ws.zt(); @@ -35,7 +38,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { float const* __restrict__ chi2 = data.chi2(); uint32_t& nvFinal = data.nvFinal(); - int32_t const* __restrict__ nn = data.ndof(); + int32_t const* __restrict__ nn = trxdata.ndof(); int32_t* __restrict__ iv = ws.iv(); ALPAKA_ASSERT_ACC(zt); @@ -78,7 +81,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { ww[old] = 1.f / ezt2[k]; it[old] = k; } - } + } // the new vertices auto& znew = alpaka::declareSharedVar(acc); @@ -156,8 +159,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { class SplitVerticesKernel { public: template - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const { - splitVertices(acc, pdata, pws, maxChi2); + ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, TrxSoAView ptrxdata, WsSoAView pws, float maxChi2) const { + splitVertices(acc, pdata, ptrxdata, pws, maxChi2); } }; diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc index b41e07aff56d5..d838d95a57c03 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc @@ -33,6 +33,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_FN_ACC void operator()(const TAcc& acc, reco::TrackSoAConstView tracks_view, VtxSoAView soa, + TrxSoAView trxsoa, WsSoAView pws, float ptMin, float ptMax) const { @@ -44,7 +45,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_ACC(nHits >= 3); // initialize soa... - soa[idx].idv() = -1; + trxsoa[idx].idv() = -1; if (reco::isTriplet(tracks_view, idx)) continue; // no triplets @@ -75,6 +76,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template >> ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, + TrxSoAView ptrxdata, WsSoAView pws, bool doSplit, int minT, // min number of neighbours to be "seed" @@ -82,17 +84,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) const { - clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + clusterTracksByDensity(acc, pdata, ptrxdata, pws, minT, eps, errmax, chi2max); alpaka::syncBlockThreads(acc); - fitVertices(acc, pdata, pws, maxChi2ForFirstFit); + fitVertices(acc, pdata, ptrxdata, pws, maxChi2ForFirstFit); alpaka::syncBlockThreads(acc); if (doSplit) { - splitVertices(acc, pdata, pws, maxChi2ForSplit); + splitVertices(acc, pdata, ptrxdata, pws, maxChi2ForSplit); alpaka::syncBlockThreads(acc); - fitVertices(acc, pdata, pws, maxChi2ForFinalFit); + fitVertices(acc, pdata, ptrxdata, pws, maxChi2ForFinalFit); alpaka::syncBlockThreads(acc); } - sortByPt2(acc, pdata, pws); + sortByPt2(acc, pdata, ptrxdata, pws); } }; #else @@ -134,6 +136,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ZVertexSoACollection vertices(queue); auto soa = vertices.view(); + auto trxsoa = vertices.view(); auto ws_d = PixelVertexWorkSpaceSoADevice(::zVertex::MAXTRACKS, queue); @@ -147,7 +150,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::divide_up_by(tracks_view.metadata().size() + blockSize - 1, blockSize); const auto loadTracksWorkDiv = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); alpaka::exec( - queue, loadTracksWorkDiv, LoadTracks{}, tracks_view, soa, ws_d.view(), ptMin, ptMax); + queue, loadTracksWorkDiv, LoadTracks{}, tracks_view, soa, trxsoa, ws_d.view(), ptMin, ptMax); // Running too many thread lead to problems when printf is enabled. const auto finderSorterWorkDiv = cms::alpakatools::make_workdiv(1, 1024 - 128); @@ -160,6 +163,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { finderSorterWorkDiv, VertexFinderOneKernel{}, soa, + trxsoa, ws_d.view(), doSplitting_, minT, @@ -168,34 +172,46 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { chi2max); #else alpaka::exec( - queue, finderSorterWorkDiv, VertexFinderOneKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, VertexFinderOneKernel{}, soa, trxsoa, ws_d.view(), minT, eps, errmax, chi2max); // one block per vertex... if (doSplitting_) - alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit); + alpaka::exec( + queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, trxsoa, ws_d.view(), maxChi2ForSplit); alpaka::exec(queue, finderSorterWorkDiv{}, soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - alpaka::exec( - queue, finderSorterWorkDiv, ClusterTracksByDensityKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + alpaka::exec(queue, + finderSorterWorkDiv, + ClusterTracksByDensityKernel{}, + soa, + trxsoa, + ws_d.view(), + minT, + eps, + errmax, + chi2max); } else if (useDBSCAN_) { alpaka::exec( - queue, finderSorterWorkDiv, ClusterTracksDBSCAN{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, ClusterTracksDBSCAN{}, soa, trxsoa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { alpaka::exec( - queue, finderSorterWorkDiv, ClusterTracksIterative{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, ClusterTracksIterative{}, soa, trxsoa, ws_d.view(), minT, eps, errmax, chi2max); } - alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFirstFit); + alpaka::exec( + queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, trxsoa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex... if (doSplitting_) { - alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit); + alpaka::exec( + queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, trxsoa, ws_d.view(), maxChi2ForSplit); - alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFinalFit); + alpaka::exec( + queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, trxsoa, ws_d.view(), maxChi2ForFinalFit); } - alpaka::exec(queue, finderSorterWorkDiv, SortByPt2Kernel{}, soa, ws_d.view()); + alpaka::exec(queue, finderSorterWorkDiv, SortByPt2Kernel{}, soa, trxsoa, ws_d.view()); } return vertices; diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h index 92890b89bb9c4..5ffd328ce295f 100644 --- a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h @@ -19,6 +19,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { using namespace cms::alpakatools; using VtxSoAView = ::reco::ZVertexSoAView; + using TrxSoAView = ::reco::ZVertexTracksSoAView; using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; class Init { diff --git a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc index b632eb50ce158..c860b5b4f327c 100644 --- a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc +++ b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc @@ -85,21 +85,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template ALPAKA_FN_ACC void operator()(const TAcc& acc, vertexFinder::VtxSoAView pdata, + vertexFinder::TrxSoAView ptrxdata, vertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) const { - vertexFinder::clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + vertexFinder::clusterTracksByDensity(acc, pdata, ptrxdata, pws, minT, eps, errmax, chi2max); alpaka::syncBlockThreads(acc); - vertexFinder::fitVertices(acc, pdata, pws, 50.); + vertexFinder::fitVertices(acc, pdata, ptrxdata, pws, 50.); alpaka::syncBlockThreads(acc); - vertexFinder::splitVertices(acc, pdata, pws, 9.f); + vertexFinder::splitVertices(acc, pdata, ptrxdata, pws, 9.f); alpaka::syncBlockThreads(acc); - vertexFinder::fitVertices(acc, pdata, pws, 5000.); + vertexFinder::fitVertices(acc, pdata, ptrxdata, pws, 5000.); alpaka::syncBlockThreads(acc); - vertexFinder::sortByPt2(acc, pdata, pws); + vertexFinder::sortByPt2(acc, pdata, ptrxdata, pws); alpaka::syncBlockThreads(acc); } }; @@ -110,6 +111,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template ALPAKA_FN_ACC void operator()(const TAcc& acc, vertexFinder::VtxSoAView pdata, + // vertexFinder::TrxSoAView ptrxdata, vertexFinder::WsSoAView pws) const { printf("nt,nv %d %d,%d\n", pws.ntrks(), pdata.nvFinal(), pws.nvIntermediate()); } @@ -155,14 +157,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { workDivClusterizer, VertexFinderOneKernel{}, vertices_d.view(), + vertices_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #else - alpaka::exec( - queue, workDivClusterizer, CLUSTERIZE{}, vertices_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); + alpaka::exec(queue, + workDivClusterizer, + CLUSTERIZE{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + kk, + par[0], + par[1], + par[2]); #endif alpaka::wait(queue); alpaka::exec(queue, workDiv1D, Kernel_print{}, vertices_d.view(), ws_d.view()); @@ -170,8 +181,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto workDivFitter = make_workdiv(1, 1024 - 256); - alpaka::exec( - queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f); + alpaka::exec(queue, + workDivFitter, + vertexFinder::FitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 50.f); alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); alpaka::wait(queue); @@ -182,8 +198,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) - if (vertices_h.view().ndof()[j] > 0) - vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); { auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); @@ -191,14 +207,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { << *mx.second << std::endl; } - alpaka::exec( - queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f); + alpaka::exec(queue, + workDivFitter, + vertexFinder::FitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 50.f); alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); alpaka::wait(queue); for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) - if (vertices_h.view().ndof()[j] > 0) - vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); { auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); @@ -209,17 +230,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto workDivSplitter = make_workdiv(1024, 64); // one vertex per block!!! - alpaka::exec( - queue, workDivSplitter, vertexFinder::SplitVerticesKernel{}, vertices_d.view(), ws_d.view(), 9.f); + alpaka::exec(queue, + workDivSplitter, + vertexFinder::SplitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 9.f); alpaka::memcpy(queue, ws_h.buffer(), ws_d.buffer()); alpaka::wait(queue); std::cout << "after split " << ws_h.view().nvIntermediate() << std::endl; - alpaka::exec( - queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 5000.f); + alpaka::exec(queue, + workDivFitter, + vertexFinder::FitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 5000.f); auto workDivSorter = make_workdiv(1, 256); - alpaka::exec(queue, workDivSorter, vertexFinder::SortByPt2Kernel{}, vertices_d.view(), ws_d.view()); + alpaka::exec(queue, + workDivSorter, + vertexFinder::SortByPt2Kernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view()); alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); alpaka::wait(queue); @@ -229,8 +265,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) - if (vertices_h.view().ndof()[j] > 0) - vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); { auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal());