Skip to content

Commit

Permalink
Merge branch 'new-detector4' of github.com:pkurash/AliceO2 into new-d…
Browse files Browse the repository at this point in the history
…etector4
  • Loading branch information
pkurash committed Oct 6, 2024
2 parents 266c9a4 + 5d36321 commit 752d06a
Show file tree
Hide file tree
Showing 134 changed files with 7,985 additions and 831 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/clean-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ name: Clean PR checks
description: build/O2/o2-dataflow-cs8
type: boolean
default: true
'check_build/O2/o2/aarch64':
description: build/O2/o2/aarch64
type: boolean
default: true
'check_build/O2/o2_slc9':
description: build/O2/o2_slc9
type: boolean
default: true


permissions: {}

Expand Down
18 changes: 18 additions & 0 deletions CCDB/include/CCDB/BasicCCDBManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ class CCDBManagerInstance
template <typename T>
T* getForTimeStamp(std::string const& path, long timestamp);

/// retrieve an object of type T from CCDB as stored under path and using the timestamp in the middle of the run
template <typename T>
T* getForRun(std::string const& path, int runNumber, bool setRunMetadata = false);

/// retrieve an object of type T from CCDB as stored under path, timestamp and metaData
template <typename T>
T* getSpecific(std::string const& path, long timestamp = -1, MD metaData = MD())
Expand Down Expand Up @@ -311,6 +315,20 @@ T* CCDBManagerInstance::getForTimeStamp(std::string const& path, long timestamp)
return ptr;
}

template <typename T>
T* CCDBManagerInstance::getForRun(std::string const& path, int runNumber, bool setRunMetadata)
{
auto [start, stop] = getRunDuration(runNumber);
if (start < 0 || stop < 0) {
if (mFatalWhenNull) {
reportFatal(std::string("Failed to get run duration for run ") + std::to_string(runNumber));
}
return nullptr;
}
mMetaData = setRunMetadata ? MD{{"runNumber", std::to_string(runNumber)}} : MD{};
return getForTimeStamp<T>(path, start / 2 + stop / 2);
}

class BasicCCDBManager : public CCDBManagerInstance
{
public:
Expand Down
71 changes: 71 additions & 0 deletions Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones.
/// \author [email protected]

#ifndef DCAFITTER_GPU_INTERFACE
#define DCAFITTER_GPU_INTERFACE

#include <thread>
#include <vector>
#include <atomic>

namespace o2
{
namespace vertexing
{
namespace device
{

#if !defined(__HIPCC__) && !defined(__CUDACC__)
typedef struct _dummyStream {
} Stream;
#else
#ifdef __HIPCC__
typedef hipStream_t Stream;
#else
typedef cudaStream_t Stream;
#endif
#endif

class GPUInterface
{
public:
GPUInterface(GPUInterface& other) = delete;
void operator=(const GPUInterface&) = delete;

static GPUInterface* Instance();

// APIs
void registerBuffer(void*, size_t);
void unregisterBuffer(void* addr);
void allocDevice(void**, size_t);
void freeDevice(void*);
Stream& getStream(unsigned short N = 0);
Stream& getNextStream();

protected:
GPUInterface(size_t N = 1);
~GPUInterface();

void resize(size_t);

std::atomic<unsigned short> mLastUsedStream{0};
static GPUInterface* sGPUInterface;
std::vector<std::thread> mPool{};
std::vector<Stream> mStreams{};
};

} // namespace device
} // namespace vertexing
} // namespace o2
#endif
3 changes: 3 additions & 0 deletions Common/DCAFitter/GPU/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@
o2_add_library(DCAFitterCUDA
TARGETVARNAME targetName
SOURCES DCAFitterN.cu
GPUInterface.cu
PUBLIC_INCLUDE_DIRECTORIES ../../include
PUBLIC_INCLUDE_DIRECTORIES ../
PUBLIC_LINK_LIBRARIES O2::MathUtils
O2::ReconstructionDataFormats
O2::DetectorsBase
PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
# add_compile_options(-lineinfo)

o2_add_test(DCAFitterNCUDA
SOURCES test/testDCAFitterNGPU.cxx
Expand Down
193 changes: 177 additions & 16 deletions Common/DCAFitter/GPU/cuda/DCAFitterN.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
#include <cuda.h>
#endif

#include <numeric>

#include "GPUCommonDef.h"
#include "DCAFitter/DCAFitterN.h"
// #include "MathUtils/SMatrixGPU.h"
#include "DeviceInterface/GPUInterface.h"

#define gpuCheckError(x) \
{ \
Expand All @@ -36,34 +38,59 @@ namespace o2::vertexing::device
{
namespace kernel
{
GPUg() void warmUpGpuKernel()
{
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
float ia, ib;
ia = ib = 0.0f;
ib += ia + tid;
}

template <typename Fitter>
GPUg() void printKernel(Fitter* ft)
GPUg() void printKernel(Fitter* fitter)
{
if (threadIdx.x == 0) {
printf(" =============== GPU DCA Fitter %d prongs ================\n", Fitter::getNProngs());
ft->print();
printf(" =============== GPU DCA Fitter %d prongs =================\n", Fitter::getNProngs());
fitter->print();
printf(" =========================================================\n");
}
}

template <typename Fitter>
GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
{
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
fitters[iThread + off] = fitters[off];
}
}

template <typename Fitter, typename... Tr>
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
{
*res = fitter->process(*tracks...);
}

template <typename Fitter, typename... Tr>
GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
{
*res = ft->process(*tracks...);
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
}
}

} // namespace kernel

/// CPU handlers
template <typename Fitter>
void print(const int nBlocks,
const int nThreads,
Fitter& ft)
Fitter& fitter)
{
Fitter* ft_device;
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
Fitter* fitter_device;
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));

kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
kernel::printKernel<<<nBlocks, nThreads>>>(fitter_device);

gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());
Expand All @@ -75,11 +102,11 @@ int process(const int nBlocks,
Fitter& fitter,
Tr&... args)
{
Fitter* ft_device;
Fitter* fitter_device;
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
int result, *result_device;

gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));

int iArg{0};
Expand All @@ -90,15 +117,15 @@ int process(const int nBlocks,
}(),
...);

gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));

std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(fitter_device, result_device, args...); }, tracks_device);

gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());

gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
gpuCheckError(cudaMemcpy(&fitter, fitter_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
iArg = 0;
([&] {
gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
Expand All @@ -107,11 +134,145 @@ int process(const int nBlocks,
}(),
...);

gpuCheckError(cudaFree(fitter_device));
gpuCheckError(cudaFree(result_device));

return result;
}

template <typename Fitter, class... Tr>
void processBulk(const int nBlocks,
const int nThreads,
const int nBatches,
std::vector<Fitter>& fitters,
std::vector<int>& results,
std::vector<Tr>&... args)
{
auto* gpuInterface = GPUInterface::Instance();
kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();
// Benchmarking events
std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
gpuCheckError(cudaEventCreate(&startKer[iBatch]));
gpuCheckError(cudaEventCreate(&endKer[iBatch]));
}
// Tracks
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
int iArg{0};
([&] {
gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size());
gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
++iArg;
}(),
...);
// Fitters
gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * fitters.size());
Fitter* fitters_device;
gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * fitters.size());
// Results
gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * fitters.size());
int* results_device;
gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());
// R.R. Computation
int totalSize = fitters.size();
int batchSize = totalSize / nBatches;
int remainder = totalSize % nBatches;
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
auto& stream = gpuInterface->getNextStream();
auto offset = iBatch * batchSize + std::min(iBatch, remainder);
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
iArg = 0;
([&] {
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
++iArg;
}(),
...);
gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
gpuCheckError(cudaPeekAtLastError());
iArg = 0;
gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
([&] {
gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
++iArg;
}(),
...);
gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
}
([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
for (auto* tracksD : tracks_device) {
gpuInterface->freeDevice(tracksD);
}
gpuInterface->freeDevice(fitters_device);
gpuInterface->freeDevice(results_device);
gpuInterface->unregisterBuffer(fitters.data());
gpuInterface->unregisterBuffer(results.data());
// Do benchmarks
gpuCheckError(cudaDeviceSynchronize());
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
}
float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches);
LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches);
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
gpuCheckError(cudaEventDestroy(startKer[iBatch]));
gpuCheckError(cudaEventDestroy(endKer[iBatch]));
}
}
template void processBulk(const int,
const int,
const int,
std::vector<o2::vertexing::DCAFitterN<2>>&,
std::vector<int>&,
std::vector<o2::track::TrackParCov>&,
std::vector<o2::track::TrackParCov>&);
template void processBulk(const int,
const int,
const int,
std::vector<o2::vertexing::DCAFitterN<3>>&,
std::vector<int>&,
std::vector<o2::track::TrackParCov>&,
std::vector<o2::track::TrackParCov>&,
std::vector<o2::track::TrackParCov>&);
template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);
Expand Down
Loading

0 comments on commit 752d06a

Please sign in to comment.