From 88da3bc1c3f19de3c87ee87667d77d6ac7c35e85 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 6 May 2022 09:25:55 +0200 Subject: [PATCH] test CPU as well --- .../interface/TrackingRecHit2DHeterogeneous.h | 188 +---------------- .../interface/TrackingRecHit2DSOAView.h | 1 - .../src/TrackingRecHit2DHeterogeneous.cc | 190 +++++++++++++++++- .../test/TrackingRecHit2DCUDA_t.cpp | 24 ++- 4 files changed, 211 insertions(+), 192 deletions(-) diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 701ded4666414..7ee5f7b12e915 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -5,9 +5,7 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" -template class TrackingRecHit2DHeterogeneous { public: enum class Storage32 { @@ -44,11 +42,12 @@ class TrackingRecHit2DHeterogeneous { int32_t offsetBPIX2, pixelCPEforGPU::ParamsOnGPU const* cpeParams, uint32_t const* hitsModuleStart, + memoryPool::Where where, cudaStream_t stream, - TrackingRecHit2DHeterogeneous const* input = nullptr); + TrackingRecHit2DHeterogeneous const* input = nullptr); explicit TrackingRecHit2DHeterogeneous( - float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr); + float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream = nullptr); ~TrackingRecHit2DHeterogeneous() = default; TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete; @@ -77,7 +76,7 @@ class TrackingRecHit2DHeterogeneous { buffer store32ToHostAsync(cudaStream_t stream) const; // needs specialization for Host - void copyFromGPU(TrackingRecHit2DHeterogeneous const* input, cudaStream_t stream); + void copyFromGPU(TrackingRecHit2DHeterogeneous const* input, cudaStream_t stream); private: @@ -106,182 +105,9 @@ class TrackingRecHit2DHeterogeneous { int16_t* m_iphi; }; -using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; -template -TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( - uint32_t nHits, - bool isPhase2, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneous const* input) - : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { - - using namespace memoryPool::cuda; - - memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared(stream,where)); - assert(deleter.pool()); - auto view = make_buffer(1,stream, memoryPool::onCPU==where ? memoryPool::onCPU : memoryPool::onHost); - assert(view.get()); - m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - assert(view.get()); - view->m_nHits = nHits; - view->m_nMaxModules = m_nMaxModules; - m_view = make_buffer(1, deleter); // stream, where); // deleter); // leave it on host and pass it by value? 
- assert(m_view.get()); - m_AverageGeometryStore = make_buffer(1,deleter); - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - if constexpr (memoryPool::onDevice == where) { - cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); - } else { - memoryPool::cuda::swapBuffer(m_view,view); - } - return; - } - - // the single arrays are not 128 bit alligned... - // the hits are actually accessed in order only in building - // if ordering is relevant they may have to be stored phi-ordered by layer or so - // this will break 1to1 correspondence with cluster and module locality - // so unless proven VERY inefficient we keep it ordered as generated - - // host copy is "reduced" (to be reviewed at some point) - if constexpr (memoryPool::onHost == where) { - // it has to compile for ALL cases - copyFromGPU(input, stream); - } else { - assert(input == nullptr); - - auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; - - m_store16 = make_buffer(nHits * n16, deleter); - m_store32 = make_buffer(nHits * n32 + nL + 1, deleter); - m_PhiBinnerStore = make_buffer(1,deleter); - } - - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - - if constexpr (memoryPool::onHost != where) { - assert(input == nullptr); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - view->m_detInd = get16(Storage16::kDetId); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); - } - - // transfer view - if constexpr (memoryPool::onDevice == where) { - cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(),sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream)); -// cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); - } else { - memoryPool::cuda::swapBuffer(m_view,view); - } -} - -//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases -template -TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( - float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream) - : m_nHits(nHits), m_hitsModuleStart(modules) { - - using namespace memoryPool::cuda; - auto view = make_buffer(1,stream,memoryPool::onCPU==where? 
memoryPool::onCPU : memoryPool::onHost); - - m_view = make_buffer(1,stream,where); - - view->m_nHits = nHits; - - if (0 == nHits) { - if constexpr (memoryPool::onDevice == where) { - cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); - } else { - m_view = std::move(view); - } - return; - } - - m_store16 = make_buffer(nHits * n16, stream,where); - m_store32 = make_buffer(nHits * n32, stream,where); - m_PhiBinnerStore = make_buffer(1,stream,where); - m_AverageGeometryStore = make_buffer(1,stream,where); - - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_hitsModuleStart = m_hitsModuleStart; - - //store transfer - if constexpr (memoryPool::onDevice == where) { - cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32, cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, nHits * n16, cudaMemcpyHostToDevice,stream)); - } else { - std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it - std::copy(store16, store16 + nHits * n16, m_store16.get()); - } - - //getters - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - - //Store 32 - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - //Store 16 - view->m_detInd = get16(Storage16::kDetId); - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - - // transfer view - if constexpr (memoryPool::onDevice == where) { - cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); - } else { - m_view = std::move(view); - } -} - #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h index 85afb9e8dcbdb..748b8ec6ad593 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -26,7 +26,6 @@ class TrackingRecHit2DSOAView { using AverageGeometry = pixelTopology::AverageGeometry; - template friend class TrackingRecHit2DHeterogeneous; friend class TrackingRecHit2DReduced; diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc index 96e79e08ed6a7..2b73267b05fc3 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc @@ -1,10 +1,188 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + +TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( + uint32_t nHits, + bool isPhase2, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + memoryPool::Where where, + cudaStream_t stream, + TrackingRecHit2DHeterogeneous const* input) + : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { + + using namespace memoryPool::cuda; + + memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared(stream,where)); + assert(deleter.pool()); + auto view = make_buffer(1,stream, memoryPool::onCPU==where ? memoryPool::onCPU : memoryPool::onHost); + assert(view.get()); + m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + assert(view.get()); + view->m_nHits = nHits; + view->m_nMaxModules = m_nMaxModules; + m_view = make_buffer(1, deleter); // stream, where); // deleter); // leave it on host and pass it by value? + assert(m_view.get()); + m_AverageGeometryStore = make_buffer(1,deleter); + view->m_averageGeometry = m_AverageGeometryStore.get(); + view->m_cpeParams = cpeParams; + view->m_hitsModuleStart = hitsModuleStart; + + // if empy do not bother + if (0 == nHits) { + if (memoryPool::onDevice == where) { + cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); + } else { + memoryPool::cuda::swapBuffer(m_view,view); + } + return; + } + + // the single arrays are not 128 bit alligned... + // the hits are actually accessed in order only in building + // if ordering is relevant they may have to be stored phi-ordered by layer or so + // this will break 1to1 correspondence with cluster and module locality + // so unless proven VERY inefficient we keep it ordered as generated + + // host copy is "reduced" (to be reviewed at some point) + if (memoryPool::onHost == where) { + // it has to compile for ALL cases + copyFromGPU(input, stream); + } else { + assert(input == nullptr); + + auto nL = isPhase2 ? 
phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + + m_store16 = make_buffer(nHits * n16, deleter); + m_store32 = make_buffer(nHits * n32 + nL + 1, deleter); + m_PhiBinnerStore = make_buffer(1,deleter); + } + + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); + + auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; + + // copy all the pointers + m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(Storage32::kPhiStorage)); + + view->m_xl = get32(Storage32::kXLocal); + view->m_yl = get32(Storage32::kYLocal); + view->m_xerr = get32(Storage32::kXerror); + view->m_yerr = get32(Storage32::kYerror); + view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); + + if (memoryPool::onHost != where) { + assert(input == nullptr); + view->m_xg = get32(Storage32::kXGlobal); + view->m_yg = get32(Storage32::kYGlobal); + view->m_zg = get32(Storage32::kZGlobal); + view->m_rg = get32(Storage32::kRGlobal); + + auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; + m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); + + view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); + view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); + view->m_detInd = get16(Storage16::kDetId); + + m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); + } + + // transfer view + if (memoryPool::onDevice == where) { + cudaCheck(cudaMemcpyAsync(m_view.get(), view.get(),sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream)); +// cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); + } else { + memoryPool::cuda::swapBuffer(m_view,view); + } +} + +//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases +TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( + float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream) + : m_nHits(nHits), m_hitsModuleStart(modules) { + + using namespace memoryPool::cuda; + auto view = make_buffer(1,stream,memoryPool::onCPU==where? 
memoryPool::onCPU : memoryPool::onHost); + + m_view = make_buffer(1,stream,where); + + view->m_nHits = nHits; + + if (0 == nHits) { + if (memoryPool::onDevice == where) { + cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); + } else { + m_view = std::move(view); + } + return; + } + + m_store16 = make_buffer(nHits * n16, stream,where); + m_store32 = make_buffer(nHits * n32, stream,where); + m_PhiBinnerStore = make_buffer(1,stream,where); + m_AverageGeometryStore = make_buffer(1,stream,where); + + view->m_averageGeometry = m_AverageGeometryStore.get(); + view->m_hitsModuleStart = m_hitsModuleStart; + + //store transfer + if (memoryPool::onDevice == where) { + cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32, cudaMemcpyHostToDevice,stream)); + cudaCheck(cudaMemcpyAsync(m_store16.get(), store16, nHits * n16, cudaMemcpyHostToDevice,stream)); + } else { + std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it + std::copy(store16, store16 + nHits * n16, m_store16.get()); + } + + //getters + auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; + auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; + + //Store 32 + view->m_xl = get32(Storage32::kXLocal); + view->m_yl = get32(Storage32::kYLocal); + view->m_xerr = get32(Storage32::kXerror); + view->m_yerr = get32(Storage32::kYerror); + view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); + view->m_xg = get32(Storage32::kXGlobal); + view->m_yg = get32(Storage32::kYGlobal); + view->m_zg = get32(Storage32::kZGlobal); + view->m_rg = get32(Storage32::kRGlobal); + + m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(Storage32::kPhiStorage)); + + //Store 16 + view->m_detInd = get16(Storage16::kDetId); + m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); + view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); + view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); + + // transfer view + if (memoryPool::onDevice == where) { + cudaCheck(memoryPool::cuda::copy(m_view, view, sizeof(TrackingRecHit2DSOAView), stream)); + } else { + m_view = std::move(view); + } +} + + + using namespace memoryPool::cuda; -template <> memoryPool::buffer TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const { auto ret = make_buffer(5 * nHits(), stream,memoryPool::onHost); cudaCheck( @@ -12,7 +190,7 @@ memoryPool::buffer TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_ return ret; } -template <> + memoryPool::buffer TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const { auto ret = make_buffer(static_cast(n32) * nHits(), stream,memoryPool::onHost); cudaCheck( @@ -20,7 +198,7 @@ memoryPool::buffer TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t s return ret; } -template <> + memoryPool::buffer TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const { auto ret = make_buffer(static_cast(n16) * nHits(), stream,memoryPool::onHost); cudaCheck( @@ -28,7 +206,6 @@ memoryPool::buffer TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_ return ret; } -template <> memoryPool::buffer TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const { auto ret = make_buffer(nMaxModules() + 1, stream,memoryPool::onHost); if (m_hitsModuleStart) cudaCheck( @@ -36,9 +213,10 @@ memoryPool::buffer TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cud return ret; } -// the only 
specialization needed -template <> void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) { assert(input); m_store32 = input->localCoordToHostAsync(stream); } + + + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp index e13feba47ae65..143b20803d867 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp @@ -4,6 +4,7 @@ #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" namespace testTrackingRecHit2D { @@ -20,15 +21,15 @@ int main() { auto nHits = 200; // inner scope to deallocate memory before destroying the stream { - TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream); + TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, memoryPool::onDevice, stream); testTrackingRecHit2D::runKernels(tkhit.view()); - TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream); + TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, memoryPool::onDevice, stream); testTrackingRecHit2D::runKernels(tkhitPhase2.view()); memoryPool::cuda::dumpStat(); - TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit); + TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, memoryPool::onHost, stream, &tkhit); cudaStreamSynchronize(stream); memoryPool::cuda::dumpStat(); @@ -38,7 +39,7 @@ int main() { assert(tkhitH.view()->nHits() == unsigned(nHits)); assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules); - TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhitPhase2); + TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, memoryPool::onHost, stream, &tkhitPhase2); cudaStreamSynchronize(stream); assert(tkhitHPhase2.view()); assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); @@ -46,6 +47,8 @@ int main() { memoryPool::cuda::dumpStat(); + + } cudaCheck(cudaStreamSynchronize(stream)); @@ -58,5 +61,18 @@ int main() { memoryPool::cuda::dumpStat(); + { + TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, memoryPool::onCPU, nullptr); + assert(tkhit.view()); + assert(tkhit.view()->nHits() == unsigned(nHits)); + assert(tkhit.view()->nMaxModules() == phase1PixelTopology::numberOfModules); + std::cout << "on CPU" << std::endl; + ((SimplePoolAllocatorImpl*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat(); + } + std::cout << "on CPU" << std::endl; + ((SimplePoolAllocatorImpl*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat(); + + + return 0; }
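
The change above replaces the compile-time template parameter of TrackingRecHit2DHeterogeneous with a run-time memoryPool::Where argument, so the same class now backs the device, host, and CPU flavours and the test can exercise the CPU path in the same binary. Below is a minimal usage sketch mirroring the constructor calls added in the test; it assumes the CMSSW headers shown in this diff are on the include path, and the function name allocateExamples is purely illustrative.

    #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

    void allocateExamples(cudaStream_t stream) {
      uint32_t nHits = 200;

      // Device-backed SoA: the storage location is now passed as
      // memoryPool::onDevice at run time instead of being encoded in the type.
      TrackingRecHit2DGPU onDevice(nHits, false /*isPhase2*/, 0 /*offsetBPIX2*/,
                                   nullptr /*cpeParams*/, nullptr /*hitsModuleStart*/,
                                   memoryPool::onDevice, stream);

      // Reduced host-side copy of an existing device collection.
      TrackingRecHit2DHost onHost(nHits, false, 0, nullptr, nullptr,
                                  memoryPool::onHost, stream, &onDevice);

      // Plain CPU allocation with no CUDA stream: this is the path the new
      // "test CPU as well" block at the end of the test exercises.
      TrackingRecHit2DGPU onCpu(nHits, false, 0, nullptr, nullptr,
                                memoryPool::onCPU, nullptr);
    }

Because the location is a constructor argument rather than a template parameter, the three aliases (TrackingRecHit2DGPU, TrackingRecHit2DCPU, TrackingRecHit2DHost) are interchangeable type-wise; which memory pool actually serves the allocation is decided by the where argument and, for device allocations, by the stream.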