Skip to content

Commit

Permalink
Implement changes from the CUDA framework review (#429)
Browse files Browse the repository at this point in the history
Rename the cudautils namespace to cms::cuda or cms::cudatest, and drop the CUDA prefix from the symbols defined there.

Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor.

Add comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub.

Various other updates and clean up:
  - enable CUDA for compute capability 3.5.
  - clean up CUDAService, CUDA tests and plugins.
  - add CUDA existence protections to BuildFiles.
  - mark thread-safe static variables with CMS_THREAD_SAFE.
  • Loading branch information
cmsbuild authored and fwyzard committed Jan 13, 2021
1 parent 66543fc commit f700b40
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 22 deletions.
4 changes: 2 additions & 2 deletions CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
#ifndef __CUDACC__
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
using ZVertexCUDAProduct = CUDAProduct<ZVertexHeterogeneous>;
#include "CUDADataFormats/Common/interface/Product.h"
using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
#endif

#endif
2 changes: 1 addition & 1 deletion CUDADataFormats/Vertex/src/classes.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CUDADataFormats__src_classes_h

#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
2 changes: 1 addition & 1 deletion CUDADataFormats/Vertex/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<lcgdict>
<class name="CUDAProduct<ZVertexHeterogeneous>" persistent="false"/>
<class name="cms::cuda::Product<ZVertexHeterogeneous>" persistent="false"/>
<class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
<class name="ZVertexHeterogeneous" persistent="false"/>
<class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
Expand All @@ -18,7 +18,7 @@
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "FWCore/Utilities/interface/InputTag.h"
#include "FWCore/Utilities/interface/RunningAverage.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"

class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
Expand All @@ -31,8 +31,8 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
private:
void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
const bool m_onGPU;
edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenGPUTrack_;
edm::EDGetTokenT<CUDAProduct<ZVertexHeterogeneous>> tokenGPUVertex_;
edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
};
Expand All @@ -41,9 +41,9 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
: m_onGPU(iConfig.getParameter<bool>("onGPU")) {
if (m_onGPU) {
tokenGPUTrack_ =
consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
tokenGPUVertex_ =
consumes<CUDAProduct<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
} else {
tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
Expand All @@ -64,7 +64,7 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
const edm::EventSetup& iSetup) const {
if (m_onGPU) {
auto const& hTracks = iEvent.get(tokenGPUTrack_);
CUDAScopedContextProduce ctx{hTracks};
cms::cuda::ScopedContextProduce ctx{hTracks};

auto const& tracks = ctx.get(hTracks);
auto const* tsoa = tracks.get();
Expand Down
22 changes: 11 additions & 11 deletions RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <vector>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
#ifdef USE_DBSCAN
#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
Expand Down Expand Up @@ -114,10 +114,10 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) {

int main() {
#ifdef __CUDACC__
requireCUDADevices();
cms::cudatest::requireDevices();

auto onGPU_d = cudautils::make_device_unique<ZVertices[]>(1, nullptr);
auto ws_d = cudautils::make_device_unique<WorkSpace[]>(1, nullptr);
auto onGPU_d = cms::cuda::make_device_unique<ZVertices[]>(1, nullptr);
auto ws_d = cms::cuda::make_device_unique<WorkSpace[]>(1, nullptr);
#else
auto onGPU_d = std::make_unique<ZVertices>();
auto ws_d = std::make_unique<WorkSpace>();
Expand Down Expand Up @@ -174,16 +174,16 @@ int main() {
cudaDeviceSynchronize();

#ifdef ONE_KERNEL
cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
#else
cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
#endif
print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());

cudaCheck(cudaGetLastError());
cudaDeviceSynchronize();

cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));

Expand Down Expand Up @@ -245,7 +245,7 @@ int main() {
}

#ifdef __CUDACC__
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
Expand All @@ -265,7 +265,7 @@ int main() {

#ifdef __CUDACC__
// one vertex per block!!!
cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
gridDim.x = 1;
Expand All @@ -277,10 +277,10 @@ int main() {
std::cout << "after split " << nv << std::endl;

#ifdef __CUDACC__
cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
cudaCheck(cudaGetLastError());

cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
#else
Expand Down

0 comments on commit f700b40

Please sign in to comment.