Skip to content

Commit

Permalink
migrate more to new pool
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn committed May 7, 2022
1 parent 1a43ba7 commit 59bcb2b
Show file tree
Hide file tree
Showing 9 changed files with 302 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ namespace {

// free callback
void CUDART_CB freeCallback(void *p) {
// std::cout << "free callaback" << std::endl;
std::cout << "free callaback" << std::endl;
auto payload = (memoryPool::Payload *)(p);
memoryPool::scheduleFree(payload);
}
Expand All @@ -22,7 +22,7 @@ namespace {

struct CudaAlloc {
static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) {
// std::cout << "schedule free" << std::endl;
std::cout << "schedule free" << std::endl;
if (stream)
cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
else
Expand All @@ -36,9 +36,10 @@ struct CudaDeviceAlloc : public CudaAlloc {
static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMalloc(&p, size);
std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFree(ptr); }
static void free(Pointer ptr) { auto err = cudaFree(ptr); std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <<std::endl;}
};

struct CudaHostAlloc : public CudaAlloc {
Expand All @@ -47,6 +48,7 @@ struct CudaHostAlloc : public CudaAlloc {
static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMallocHost(&p, size);
std::cout << "alloc H " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFreeHost(ptr); }
Expand Down
3 changes: 3 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/memoryPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <memory>
#include <new>

#include<iostream>
class SimplePoolAllocator;

namespace memoryPool {
Expand Down Expand Up @@ -32,6 +33,8 @@ namespace memoryPool {
void operator()(void* p) {
if (!me)
throw std::bad_alloc();
if(!p) std::cout << "delete null pointer!!! " << m_bucket << std::endl;
// assert(p == pool()->pointer(m_bucket));
(*me)(m_bucket);
}

Expand Down
10 changes: 8 additions & 2 deletions HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,14 @@
</bin>

<bin file="testPoolUI.cu" name="testPoolUI">
<flags CUDA_FLAGS="-g"/>
<flags CXXFLAGS="-g"/>
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
<flags CXXFLAGS="-g -DGPU_DEBUG"/>
</bin>


<bin file="testPoolUImt.cu" name="testPoolUImt">
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
<flags CXXFLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="testSimplePoolAllocator.cu" name="testSimplePoolAllocatorGPU">
Expand Down
243 changes: 243 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include <cmath>
#include <unistd.h>

#include <random>
#include <limits>

#include <atomic>
#include <thread>
#include <mutex>

typedef std::thread Thread;
typedef std::vector<std::thread> ThreadGroup;
typedef std::mutex Mutex;
typedef std::lock_guard<std::mutex> Lock;

struct Node {
int it = -1;
int i = -1;
void *p = nullptr;
#ifdef __CUDACC__
int c = 0;
#else
std::atomic<int> c = 0;
#endif
};

#ifdef __CUDACC__

// generic callback
template <typename F>
void CUDART_CB myCallback(void *fun) {
(*(F *)(fun))();
}

__global__ void kernel_set(int s, Node ** p, int me) {
int first = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = first; i < s; i += gridDim.x * blockDim.x) {
assert(p[i]);
auto n = p[i];
n->it = me;
n->i = i;
n->p = p[i];
n->c = 1;
}
}

__global__ void kernel_test(int s, Node **p, int me) {
int first = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = first; i < s; i += gridDim.x * blockDim.x) {
assert(p[i]);
auto n = p[i];
atomicSub(&(n->c), 1);
assert(n->it == me);
assert(n->i == i);
assert(n->p == p[i]);
assert(0 == n->c);
}
}
#endif

template <memoryPool::Where where>
void go() {
auto start = std::chrono::high_resolution_clock::now();

const int NUMTHREADS = 24;

#ifdef __CUDACC__
printf("Using CUDA %d\n", CUDART_VERSION);
int cuda_device = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, cuda_device);
printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);

cudaStream_t streams[NUMTHREADS];

for (int i = 0; i < NUMTHREADS; i++) {
cudaStreamCreate(&(streams[i]));
}

#endif


bool stop = false;
bool bin24 = false;
Thread monitor([&] {
int n = 10;
while (n--) {
sleep(5);
memoryPool::cuda::dumpStat();
if (5 == n)
bin24 = true;
}
std::cout << "\nstop\n" << std::endl;
stop = true;
});

int s = 40;
{
std::cout << "try to allocate " << s << std::endl;
auto stream = streams[0];
{
auto pd = memoryPool::cuda::make_buffer<int>(s, stream, where);
assert(pd.get());
memoryPool::cuda::dumpStat();
pd = memoryPool::cuda::make_buffer<int>(s, stream, where);
memoryPool::cuda::dumpStat();
}
cudaStreamSynchronize(stream);
memoryPool::cuda::dumpStat();

}
std::atomic<int> nt = 0;

auto test = [&] {
int const me = nt++;
auto delta = std::chrono::high_resolution_clock::now() - start;

std::mt19937 eng(me + std::chrono::duration_cast<std::chrono::milliseconds>(delta).count());
std::uniform_int_distribution<int> rgen1(1, 100);
std::uniform_int_distribution<int> rgen20(3, 20);
std::uniform_int_distribution<int> rgen24(3, 24);
std::cout << "first RN " << rgen1(eng) << " at "
<< std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " in " << me << std::endl;

#ifdef __CUDACC__
Node **dp = nullptr;
Node **hp = nullptr;
cudaMalloc(&dp, 100 * sizeof(void *));
assert(dp);
cudaMallocHost(&hp, 100 * sizeof(void *));
assert(hp);
#endif

int iter = 0;
while (true) {
if (stop)
break;
iter++;
auto &stream = streams[me];

memoryPool::Deleter devDeleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream,where));
auto n = rgen1(eng);
bool large = 0 == (iter % (128 + me));
for (int k = 0; k < n; ++k) {
int b = bin24 ? rgen24(eng) : rgen20(eng);
// once in while let's allocate 2GB
if (large) {
b = 31;
large = false;
}
uint64_t s = 1LL << b;
assert(s > 0);
auto p0 = memoryPool::cuda::make_buffer<Node>(s/sizeof(Node) + sizeof(Node), devDeleter);
if (!p0.get()) {
std::cout << "\n\n!!!Failed " << me << " at " << iter << std::endl;
memoryPool::cuda::dumpStat();
return;
}
auto p = p0.get();
if (nullptr == p) {
std::cout << "error not detected??? " << b << ' ' << std::endl;
memoryPool::cuda::dumpStat();
}
assert(p);
hp[k] = p;
}
#ifdef __CUDACC__
assert(n <= 100);
// do something???
cudaMemcpyAsync(dp, hp, n * sizeof(void *), cudaMemcpyHostToDevice, stream);
kernel_set<<<1, 128, 0, stream>>>(n, dp, me);
kernel_test<<<1, 128, 0, stream>>>(n, dp, me);

// better sync each "event"
cudaStreamSynchronize(stream);
#else
// do something???
for (int k = 0; k < n; ++k) {
auto p = hp[k];
assert(p);
auto n = p;
n->it = me;
n->i = i;
n->p = p;
n->c = 1;
}
for (int k = 0; k < n; ++k) {
auto p = hp[k];
assert(p);
auto n = p;
n->c--;
assert(n->it == me);
assert(n->i == i);
assert(n->p == p);
assert(0 == n->c);
}
#endif
}
};

ThreadGroup threads;
threads.reserve(NUMTHREADS);

for (int i = 0; i < NUMTHREADS; ++i) {
threads.emplace_back(test);
}

for (auto &t : threads)
t.join();

threads.clear();
monitor.join();
std::cout << "\nfinished\n" << std::endl;
memoryPool::cuda::dumpStat();
}

#ifdef __CUDACC__
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#endif

int main() {
#ifdef __CUDACC__
{
int devices = 0;
auto status = cudaGetDeviceCount(&devices);
if (status != cudaSuccess || 0 == devices)
return 0;
std::cout << "found " << devices << " cuda devices" << std::endl;
}

std::cout << "\ntesting cuda device" << std::endl;
go<memoryPool::onDevice>();
#else
std::cout << "testing posix" << std::endl;
go<memoryPool::onCPU>();
#endif

return 0;
}
15 changes: 7 additions & 8 deletions RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

#include "BrokenLineFitOnGPU.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
uint32_t hitsInFit,
Expand All @@ -11,13 +12,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;

// Fit internals
auto tkidGPU = cms::cuda::make_device_unique<caConstants::tindex_type[]>(maxNumberOfConcurrentFits_, stream);
auto hitsGPU = cms::cuda::make_device_unique<double[]>(
maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream);
auto hits_geGPU = cms::cuda::make_device_unique<float[]>(
maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), stream);
auto fast_fit_resultsGPU = cms::cuda::make_device_unique<double[]>(
maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream);
memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
auto tkidGPU = memoryPool::cuda::make_buffer<caConstants::tindex_type>(maxNumberOfConcurrentFits_,deleter);
auto hitsGPU = memoryPool::cuda::make_buffer<double>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), deleter);
auto hits_geGPU = memoryPool::cuda::make_buffer<float>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), deleter);
auto fast_fit_resultsGPU = memoryPool::cuda::make_buffer<double>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), deleter);

for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
// fit triplets
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,15 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
#endif

// use "nhits" to heuristically dimension the workspace

// no need to use the Traits allocations, since we know this is being compiled for the CPU
//device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(nullptr, memoryPool::onCPU));
device_isOuterHitOfCell_ = memoryPool::cuda::make_buffer<GPUCACell::OuterHitOfCellContainer>(std::max(1U, nhits),deleter);
assert(device_isOuterHitOfCell_.get());
isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};

auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
// no need to use the Traits allocations, since we know this is being compiled for the CPU
//cellStorage_ = Traits::template make_unique<unsigned char[]>(cellStorageSize, stream);
cellStorage_ = std::make_unique<unsigned char[]>(cellStorageSize);
cellStorage_ = memoryPool::cuda::make_buffer<unsigned char>(cellStorageSize,deleter);
device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
sizeof(GPUCACell::CellNeighbors));
Expand All @@ -45,9 +42,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_theCellTracks_.get(),
device_theCellTracksContainer_);

// no need to use the Traits allocations, since we know this is being compiled for the CPU
//device_theCells_ = Traits::template make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);
device_theCells_ = memoryPool::cuda::make_buffer<GPUCACell>(params_.maxNumberOfDoublets_,deleter);
if (0 == nhits)
return; // protect against empty events

Expand Down
Loading

0 comments on commit 59bcb2b

Please sign in to comment.