Merge branch 'new-detector4' of github.com:pkurash/AliceO2 into new-d…

…etector4
pkurash · Oct 6, 2024 · 752d06a · 752d06a
2 parents 266c9a4 + 5d36321
commit 752d06a
Show file tree

Hide file tree

Showing 134 changed files with 7,985 additions and 831 deletions.
diff --git a/.github/workflows/clean-test.yml b/.github/workflows/clean-test.yml
@@ -43,6 +43,15 @@ name: Clean PR checks
         description: build/O2/o2-dataflow-cs8
         type: boolean
         default: true
+      'check_build/O2/o2/aarch64':
+        description: build/O2/o2/aarch64
+        type: boolean
+        default: true
+      'check_build/O2/o2_slc9':
+        description: build/O2/o2_slc9
+        type: boolean
+        default: true
+
 
 permissions: {}
 

diff --git a/CCDB/include/CCDB/BasicCCDBManager.h b/CCDB/include/CCDB/BasicCCDBManager.h
@@ -102,6 +102,10 @@ class CCDBManagerInstance
   template <typename T>
   T* getForTimeStamp(std::string const& path, long timestamp);
 
+  /// retrieve an object of type T from CCDB as stored under path and using the timestamp in the middle of the run
+  template <typename T>
+  T* getForRun(std::string const& path, int runNumber, bool setRunMetadata = false);
+
   /// retrieve an object of type T from CCDB as stored under path, timestamp and metaData
   template <typename T>
   T* getSpecific(std::string const& path, long timestamp = -1, MD metaData = MD())
@@ -311,6 +315,20 @@ T* CCDBManagerInstance::getForTimeStamp(std::string const& path, long timestamp)
   return ptr;
 }
 
+template <typename T>
+T* CCDBManagerInstance::getForRun(std::string const& path, int runNumber, bool setRunMetadata)
+{
+  auto [start, stop] = getRunDuration(runNumber);
+  if (start < 0 || stop < 0) {
+    if (mFatalWhenNull) {
+      reportFatal(std::string("Failed to get run duration for run ") + std::to_string(runNumber));
+    }
+    return nullptr;
+  }
+  mMetaData = setRunMetadata ? MD{{"runNumber", std::to_string(runNumber)}} : MD{};
+  return getForTimeStamp<T>(path, start / 2 + stop / 2);
+}
+
 class BasicCCDBManager : public CCDBManagerInstance
 {
  public:

diff --git a/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h b/Common/DCAFitter/GPU/DeviceInterface/GPUInterface.h
@@ -0,0 +1,71 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones.
+/// \author [email protected]
+
+#ifndef DCAFITTER_GPU_INTERFACE
+#define DCAFITTER_GPU_INTERFACE
+
+#include <thread>
+#include <vector>
+#include <atomic>
+
+namespace o2
+{
+namespace vertexing
+{
+namespace device
+{
+
+#if !defined(__HIPCC__) && !defined(__CUDACC__)
+typedef struct _dummyStream {
+} Stream;
+#else
+#ifdef __HIPCC__
+typedef hipStream_t Stream;
+#else
+typedef cudaStream_t Stream;
+#endif
+#endif
+
+class GPUInterface
+{
+ public:
+  GPUInterface(GPUInterface& other) = delete;
+  void operator=(const GPUInterface&) = delete;
+
+  static GPUInterface* Instance();
+
+  // APIs
+  void registerBuffer(void*, size_t);
+  void unregisterBuffer(void* addr);
+  void allocDevice(void**, size_t);
+  void freeDevice(void*);
+  Stream& getStream(unsigned short N = 0);
+  Stream& getNextStream();
+
+ protected:
+  GPUInterface(size_t N = 1);
+  ~GPUInterface();
+
+  void resize(size_t);
+
+  std::atomic<unsigned short> mLastUsedStream{0};
+  static GPUInterface* sGPUInterface;
+  std::vector<std::thread> mPool{};
+  std::vector<Stream> mStreams{};
+};
+
+} // namespace device
+} // namespace vertexing
+} // namespace o2
+#endif
diff --git a/Common/DCAFitter/GPU/cuda/CMakeLists.txt b/Common/DCAFitter/GPU/cuda/CMakeLists.txt
@@ -12,12 +12,15 @@
 o2_add_library(DCAFitterCUDA
                TARGETVARNAME targetName
                SOURCES DCAFitterN.cu
+                       GPUInterface.cu
                PUBLIC_INCLUDE_DIRECTORIES ../../include
+               PUBLIC_INCLUDE_DIRECTORIES ../
                PUBLIC_LINK_LIBRARIES O2::MathUtils
                                      O2::ReconstructionDataFormats
                                      O2::DetectorsBase
                PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
 set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+# add_compile_options(-lineinfo)
 
 o2_add_test(DCAFitterNCUDA
             SOURCES test/testDCAFitterNGPU.cxx

diff --git a/Common/DCAFitter/GPU/cuda/DCAFitterN.cu b/Common/DCAFitter/GPU/cuda/DCAFitterN.cu
@@ -15,9 +15,11 @@
 #include <cuda.h>
 #endif
 
+#include <numeric>
+
 #include "GPUCommonDef.h"
 #include "DCAFitter/DCAFitterN.h"
-// #include "MathUtils/SMatrixGPU.h"
+#include "DeviceInterface/GPUInterface.h"
 
 #define gpuCheckError(x)                \
   {                                     \
@@ -36,34 +38,59 @@ namespace o2::vertexing::device
 {
 namespace kernel
 {
+GPUg() void warmUpGpuKernel()
+{
+  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  float ia, ib;
+  ia = ib = 0.0f;
+  ib += ia + tid;
+}
+
 template <typename Fitter>
-GPUg() void printKernel(Fitter* ft)
+GPUg() void printKernel(Fitter* fitter)
 {
   if (threadIdx.x == 0) {
-    printf(" =============== GPU DCA Fitter %d prongs ================\n", Fitter::getNProngs());
-    ft->print();
+    printf(" =============== GPU DCA Fitter %d prongs =================\n", Fitter::getNProngs());
+    fitter->print();
     printf(" =========================================================\n");
   }
 }
 
+template <typename Fitter>
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    fitters[iThread + off] = fitters[off];
+  }
+}
+
+template <typename Fitter, typename... Tr>
+GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
+{
+  *res = fitter->process(*tracks...);
+}
+
 template <typename Fitter, typename... Tr>
-GPUg() void processKernel(Fitter* ft, int* res, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
-  *res = ft->process(*tracks...);
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
+  }
 }
+
 } // namespace kernel
 
 /// CPU handlers
 template <typename Fitter>
 void print(const int nBlocks,
            const int nThreads,
-           Fitter& ft)
+           Fitter& fitter)
 {
-  Fitter* ft_device;
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
-  gpuCheckError(cudaMemcpy(ft_device, &ft, sizeof(Fitter), cudaMemcpyHostToDevice));
+  Fitter* fitter_device;
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  kernel::printKernel<<<nBlocks, nThreads>>>(ft_device);
+  kernel::printKernel<<<nBlocks, nThreads>>>(fitter_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
@@ -75,11 +102,11 @@ int process(const int nBlocks,
             Fitter& fitter,
             Tr&... args)
 {
-  Fitter* ft_device;
+  Fitter* fitter_device;
   std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
   int result, *result_device;
 
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&ft_device), sizeof(Fitter)));
+  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitter_device), sizeof(Fitter)));
   gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&result_device), sizeof(int)));
 
   int iArg{0};
@@ -90,15 +117,15 @@ int process(const int nBlocks,
   }(),
    ...);
 
-  gpuCheckError(cudaMemcpy(ft_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
+  gpuCheckError(cudaMemcpy(fitter_device, &fitter, sizeof(Fitter), cudaMemcpyHostToDevice));
 
-  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(ft_device, result_device, args...); }, tracks_device);
+  std::apply([&](auto&&... args) { kernel::processKernel<<<nBlocks, nThreads>>>(fitter_device, result_device, args...); }, tracks_device);
 
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 
   gpuCheckError(cudaMemcpy(&result, result_device, sizeof(int), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(&fitter, ft_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
+  gpuCheckError(cudaMemcpy(&fitter, fitter_device, sizeof(Fitter), cudaMemcpyDeviceToHost));
   iArg = 0;
   ([&] {
     gpuCheckError(cudaMemcpy(&args, tracks_device[iArg], sizeof(o2::track::TrackParCov), cudaMemcpyDeviceToHost));
@@ -107,11 +134,145 @@ int process(const int nBlocks,
   }(),
    ...);
 
+  gpuCheckError(cudaFree(fitter_device));
   gpuCheckError(cudaFree(result_device));
 
   return result;
 }
 
+template <typename Fitter, class... Tr>
+void processBulk(const int nBlocks,
+                 const int nThreads,
+                 const int nBatches,
+                 std::vector<Fitter>& fitters,
+                 std::vector<int>& results,
+                 std::vector<Tr>&... args)
+{
+  auto* gpuInterface = GPUInterface::Instance();
+  kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();
+
+  // Benchmarking events
+  std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
+  std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
+    gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
+    gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
+    gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
+    gpuCheckError(cudaEventCreate(&startKer[iBatch]));
+    gpuCheckError(cudaEventCreate(&endKer[iBatch]));
+  }
+
+  // Tracks
+  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
+  int iArg{0};
+  ([&] {
+    gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size());
+    gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
+    ++iArg;
+  }(),
+   ...);
+
+  // Fitters
+  gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * fitters.size());
+  Fitter* fitters_device;
+  gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * fitters.size());
+
+  // Results
+  gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * fitters.size());
+  int* results_device;
+  gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());
+
+  // R.R. Computation
+  int totalSize = fitters.size();
+  int batchSize = totalSize / nBatches;
+  int remainder = totalSize % nBatches;
+
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    auto& stream = gpuInterface->getNextStream();
+    auto offset = iBatch * batchSize + std::min(iBatch, remainder);
+    auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
+
+    gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
+    iArg = 0;
+    ([&] {
+      gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
+      ++iArg;
+    }(),
+     ...);
+    gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
+
+    gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
+    std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
+    gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
+
+    gpuCheckError(cudaPeekAtLastError());
+    iArg = 0;
+    gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
+    ([&] {
+      gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
+      ++iArg;
+    }(),
+     ...);
+
+    gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
+    gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
+    gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
+  }
+
+  ([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
+
+  for (auto* tracksD : tracks_device) {
+    gpuInterface->freeDevice(tracksD);
+  }
+
+  gpuInterface->freeDevice(fitters_device);
+  gpuInterface->freeDevice(results_device);
+  gpuInterface->unregisterBuffer(fitters.data());
+  gpuInterface->unregisterBuffer(results.data());
+
+  // Do benchmarks
+  gpuCheckError(cudaDeviceSynchronize());
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
+    gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
+    gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
+  }
+
+  float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
+  float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
+  float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
+  LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
+  LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches);
+  LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches);
+
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
+    gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
+    gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
+    gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
+    gpuCheckError(cudaEventDestroy(startKer[iBatch]));
+    gpuCheckError(cudaEventDestroy(endKer[iBatch]));
+  }
+}
+
+template void processBulk(const int,
+                          const int,
+                          const int,
+                          std::vector<o2::vertexing::DCAFitterN<2>>&,
+                          std::vector<int>&,
+                          std::vector<o2::track::TrackParCov>&,
+                          std::vector<o2::track::TrackParCov>&);
+template void processBulk(const int,
+                          const int,
+                          const int,
+                          std::vector<o2::vertexing::DCAFitterN<3>>&,
+                          std::vector<int>&,
+                          std::vector<o2::track::TrackParCov>&,
+                          std::vector<o2::track::TrackParCov>&,
+                          std::vector<o2::track::TrackParCov>&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);