Fixes to some easy errors
JuanGonzalezCaminero committed Dec 13, 2024
1 parent c60a272 commit da517cc
Showing 2 changed files with 72 additions and 64 deletions.
129 changes: 65 additions & 64 deletions include/AdePT/core/AsyncAdePTTransport.cuh
@@ -225,6 +225,7 @@ __global__ void FreeSlots1(Ts... slotManagers)
{
(slotManagers->FreeMarkedSlotsStage1(), ...);
}
+
template <typename... Ts>
__global__ void FreeSlots2(Ts... slotManagers)
{
@@ -371,7 +372,6 @@ __global__ void InitSlotManagers(SlotManager *mgr, std::size_t N)
}
}

-
/// WIP: Free functions implementing the CUDA parts
namespace async_adept_impl {

@@ -427,11 +427,11 @@ void FlushScoring(AdePTScoring &scoring)
/// If successful, this will initialise the member fGPUState.
/// If memory allocation fails, an exception is thrown. In this case, the caller has to
/// try again after some wait time or with less transport slots.
-void InitializeGPU()
+GPUstate *InitializeGPU(int trackCapacity, int scoringCapacity, int numThreads, TrackBuffer& trackBuffer)
{
-// auto gpuStateTmp = std::make_unique<GPUstate>();
-auto gpuStateTmp = new GPUstate();
-GPUstate &gpuState = *gpuStateTmp;
+// auto gpuState_ptr = std::make_unique<GPUstate>();
+auto gpuState_ptr = new GPUstate();
+GPUstate &gpuState = *gpuState_ptr;

// Allocate structures to manage tracks of an implicit type:
// * memory to hold the actual Track elements,
@@ -459,8 +459,8 @@ void InitializeGPU()
if (emplaceForAutoDelete) gpuState.allCudaPointers.push_back(devPtr);
};

-gpuState.slotManager_host = SlotManager{static_cast<SlotManager::value_type>(fTrackCapacity),
-static_cast<SlotManager::value_type>(fTrackCapacity)};
+gpuState.slotManager_host = SlotManager{static_cast<SlotManager::value_type>(trackCapacity),
+static_cast<SlotManager::value_type>(trackCapacity)};
gpuState.slotManager_dev = nullptr;
gpuMalloc(gpuState.slotManager_dev, gpuState.nSlotManager_dev);
COPCORE_CUDA_CHECK(
@@ -472,7 +472,7 @@ void InitializeGPU()
ParticleType &particleType = gpuState.particles[i];
// Provide 20% more queue slots than track slots, so a large cluster of a specific particle type
// doesn't exhaust the queues.
-const size_t nSlot = fTrackCapacity * ParticleType::relativeQueueSize[i] * 1.2;
+const size_t nSlot = trackCapacity * ParticleType::relativeQueueSize[i] * 1.2;
const size_t sizeOfQueueStorage = adept::MParray::SizeOfInstance(nSlot);
const size_t sizeOfLeakQueue = adept::MParray::SizeOfInstance(nSlot / 10);

@@ -495,7 +495,7 @@ void InitializeGPU()

// init gamma interaction queues
for (unsigned int i = 0; i < GammaInteractions::NInt; ++i) {
-const auto capacity = fTrackCapacity / 6;
+const auto capacity = trackCapacity / 6;
const auto instanceSize = adept::MParrayT<GammaInteractions::Data>::SizeOfInstance(capacity);
void *gpuPtr = nullptr;
gpuMalloc(gpuPtr, instanceSize);
@@ -508,109 +508,91 @@ void InitializeGPU()
COPCORE_CUDA_CHECK(cudaMallocHost(&gpuState.stats, sizeof(Stats)));

// init scoring structures
-gpuMalloc(gpuState.fScoring_dev, fNThread);
+gpuMalloc(gpuState.fScoring_dev, numThreads);

fScoring->clear();
-fScoring->reserve(fNThread);
-for (unsigned int i = 0; i < fNThread; ++i) {
+fScoring->reserve(numThreads);
+for (unsigned int i = 0; i < numThreads; ++i) {
fScoring->emplace_back(gpuState.fScoring_dev + i);
}
-gpuState.fHitScoring.reset(new HitScoring(fScoringCapacity, fNThread));
+gpuState.fHitScoring.reset(new HitScoring(scoringCapacity, numThreads));

-const auto injectQueueSize = adept::MParrayT<QueueIndexPair>::SizeOfInstance(fBuffer->fNumToDevice);
+const auto injectQueueSize = adept::MParrayT<QueueIndexPair>::SizeOfInstance(trackBuffer.fNumToDevice);
void *gpuPtr = nullptr;
gpuMalloc(gpuPtr, injectQueueSize);
gpuState.injectionQueue = static_cast<adept::MParrayT<QueueIndexPair> *>(gpuPtr);
-InitQueue<QueueIndexPair><<<1, 1>>>(gpuState.injectionQueue, fBuffer->fNumToDevice);
+InitQueue<QueueIndexPair><<<1, 1>>>(gpuState.injectionQueue, trackBuffer.fNumToDevice);

// This is the largest allocation. If it does not fit, we need to try again:
Track *trackStorage_dev = nullptr;
-gpuMalloc(trackStorage_dev, fTrackCapacity);
+gpuMalloc(trackStorage_dev, trackCapacity);

for (auto &partType : gpuState.particles) {
partType.tracks = trackStorage_dev;
}

-// fGPUstate = std::move(gpuStateTmp);
-fGPUstate = gpuStateTmp;
+// fGPUstate = std::move(gpuState_ptr);
+// fGPUstate = gpuState_ptr;
+return gpuState_ptr;
}

-void FreeGPU()
+void AdvanceEventStates(EventState oldState, EventState newState)
{
-fGPUstate->runTransport = false;
-fGPUWorker.join();
-
-adeptint::VolAuxData *volAux = nullptr;
-COPCORE_CUDA_CHECK(cudaMemcpyFromSymbol(&volAux, AsyncAdePT::gVolAuxData, sizeof(adeptint::VolAuxData *)));
-COPCORE_CUDA_CHECK(cudaFree(volAux));
-
-// Free resources.
-fGPUstate.reset();
-
-// TODO: GPUstate is no longer a unique_ptr inside AsyncAdePTTransport,
-// check if there's any further cleanup required
-
-// Free G4HepEm data
-FreeG4HepEmData(AsyncAdePTTransport::fg4hepem_state->fData);
+for (auto &eventState : fEventStates) {
+EventState expected = oldState;
+eventState.compare_exchange_strong(expected, newState, std::memory_order_release, std::memory_order_relaxed);
+}
}

-void ReturnTracksToG4()
+void ReturnTracksToG4(TrackBuffer &trackBuffer, GPUstate &gpuState)
{
-std::scoped_lock lock{fBuffer->fromDeviceMutex};
-const auto &fromDevice = fBuffer->fromDevice_host.get();
-TrackDataWithIDs const *const fromDeviceEnd = fromDevice + *fBuffer->nFromDevice_host;
+std::scoped_lock lock{trackBuffer.fromDeviceMutex};
+const auto &fromDevice = trackBuffer.fromDevice_host;
+TrackDataWithIDs const *const fromDeviceEnd = fromDevice + *trackBuffer.nFromDevice_host;

for (TrackDataWithIDs *trackIt = fromDevice; trackIt < fromDeviceEnd; ++trackIt) {
-assert(0 <= trackIt->threadId && trackIt->threadId <= fNThread);
-fBuffer->fromDeviceBuffers[trackIt->threadId].push_back(*trackIt);
+assert(0 <= trackIt->threadId && trackIt->threadId <= numThreads);
+trackBuffer.fromDeviceBuffers[trackIt->threadId].push_back(*trackIt);
}

AdvanceEventStates(EventState::FlushingTracks, EventState::DeviceFlushed);
-fGPUstate->extractState = GPUstate::ExtractState::Idle;
+gpuState.extractState = GPUstate::ExtractState::Idle;

#ifndef NDEBUG
-for (const auto &trackBuffer : fBuffer->fromDeviceBuffers) {
-if (trackBuffer.empty()) continue;
-const auto eventId = trackBuffer.front().eventId;
-assert(std::all_of(trackBuffer.begin(), trackBuffer.end(),
+for (const auto &buffer : trackBuffer.fromDeviceBuffers) {
+if (buffer.empty()) continue;
+const auto eventId = buffer.front().eventId;
+assert(std::all_of(buffer.begin(), buffer.end(),
[eventId](const TrackDataWithIDs &track) { return eventId == track.eventId; }));
}
#endif
}

-void AdvanceEventStates(EventState oldState, EventState newState)
-{
-for (auto &eventState : fEventStates) {
-EventState expected = oldState;
-eventState.compare_exchange_strong(expected, newState, std::memory_order_release, std::memory_order_relaxed);
-}
-}

-void TransportLoop()
+void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, TrackBuffer& trackBuffer, GPUstate *gpuStatePtr)
{
// NVTXTracer tracer{"TransportLoop"};

// Initialise the transport engine:
do {
try {
-InitializeGPU();
+gpuStatePtr = InitializeGPU(trackCapacity, scoringCapacity, numThreads, trackBuffer);
} catch (std::invalid_argument &exc) {
// Clear error state:
auto result = cudaGetLastError();
std::cerr << "\nError: AdePT failed to initialise the device (" << cudaGetErrorName(result) << "):\n"
-<< exc.what() << "\nReducing track capacity: " << fTrackCapacity << " --> " << fTrackCapacity * 0.9
+<< exc.what() << "\nReducing track capacity: " << trackCapacity << " --> " << trackCapacity * 0.9
<< '\n';
-fTrackCapacity *= 0.9;
+trackCapacity *= 0.9;

-if (fTrackCapacity < 10000) throw std::runtime_error{"AdePT is unable to allocate GPU memory."};
+if (trackCapacity < 10000) throw std::runtime_error{"AdePT is unable to allocate GPU memory."};
}
-} while (!fGPUstate);
+} while (!gpuStatePtr);

using InjectState = GPUstate::InjectState;
using ExtractState = GPUstate::ExtractState;
auto &cudaManager = vecgeom::cxx::CudaManager::Instance();
const vecgeom::cuda::VPlacedVolume *world_dev = cudaManager.world_gpu();
-GPUstate &gpuState = *fGPUstate;
+GPUstate &gpuState = *gpuStatePtr;

ParticleType &electrons = gpuState.particles[ParticleType::Electron];
ParticleType &positrons = gpuState.particles[ParticleType::Positron];
@@ -1063,6 +1045,25 @@ std::thread LaunchGPUWorker()
std::thread{&TransportLoop};
}

+void FreeGPU(GPUstate &gpuState, G4HepEmState &g4hepem_state, std::thread &gpuWorker)
+{
+gpuState.runTransport = false;
+gpuWorker.join();
+
+adeptint::VolAuxData *volAux = nullptr;
+COPCORE_CUDA_CHECK(cudaMemcpyFromSymbol(&volAux, AsyncAdePT::gVolAuxData, sizeof(adeptint::VolAuxData *)));
+COPCORE_CUDA_CHECK(cudaFree(volAux));
+
+// Free resources.
+gpuState.reset();
+
+// TODO: GPUstate is no longer a unique_ptr inside AsyncAdePTTransport,
+// check if there's any further cleanup required
+
+// Free G4HepEm data
+FreeG4HepEmData(g4hepem_state.fData);
+}
+
} // namespace async_adept_impl

///////////////////////
@@ -1123,10 +1124,10 @@ TrackBuffer::TrackBuffer(unsigned int numToDevice, unsigned int numFromDevice, u

} // namespace AsyncAdePT

-AsyncAdePTTransport::~AsyncAdePTTransport()
-{
-FreeGPU();
-}
+// AsyncAdePTTransport::~AsyncAdePTTransport()
+// {
+//   FreeGPU();
+// }


#endif // ASYNC_ADEPT_TRANSPORT_CUH
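
Aside on the relocated AdvanceEventStates: it relies on an atomic compare-and-swap so that only events currently in oldState are advanced to newState, while events in any other state are left untouched. A minimal, self-contained sketch of that pattern (using a simplified, hypothetical EventState enum and a fixed-size state array rather than the real AdePT types) is:

#include <array>
#include <atomic>

// Illustrative stand-in for the per-event state machine; the real EventState
// enum in AdePT has more states than shown here.
enum class EventState { Transporting, FlushingTracks, DeviceFlushed };
std::array<std::atomic<EventState>, 8> eventStates;

// Advance every event that is currently in `oldState` to `newState`.
// compare_exchange_strong only stores `newState` when the current value
// equals `expected`, so events in any other state are not modified.
void AdvanceEventStates(EventState oldState, EventState newState)
{
  for (auto &eventState : eventStates) {
    EventState expected = oldState;
    eventState.compare_exchange_strong(expected, newState, std::memory_order_release, std::memory_order_relaxed);
  }
}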
7 changes: 7 additions & 0 deletions include/AdePT/core/AsyncAdePTTransport.icc
@@ -46,6 +46,7 @@ void FlushScoring(AdePTScoring &);
void TransportLoop();
std::shared_ptr<const std::vector<GPUHit>> GetGPUHits(unsigned int);
std::thread LaunchGPUWorker();
+void FreeGPU(AsyncAdePT::GPUstate &, G4HepEmState &, std::thread &);
}

namespace AsyncAdePT {
@@ -129,6 +130,12 @@ AsyncAdePTTransport<IntegrationLayer>::AsyncAdePTTransport(AdePTConfiguration &c
AsyncAdePTTransport::Initialize();
}

+template <typename IntegrationLayer>
+AsyncAdePTTransport<IntegrationLayer>::~AsyncAdePTTransport()
+{
+async_adept_impl::FreeGPU(*fGPUstate, *fg4hepem_state, fGPUWorker);
+}
+
template <typename IntegrationLayer>
void AsyncAdePTTransport<IntegrationLayer>::AddTrack(int pdg, int parentID, double energy, double x, double y, double z, double dirx,
double diry, double dirz, double globalTime, double localTime, double properTime,
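
For orientation, the new out-of-line destructor above only compiles if AsyncAdePTTransport keeps the GPU state, the G4HepEm state and the GPU worker thread around as class state. A hypothetical class fragment showing the pieces the destructor relies on (inferred from the diff, not part of this commit; in the real header some of these may be static or smart pointers) could look like:

// Hypothetical fragment of the AsyncAdePTTransport class declaration
// (assumption for illustration only, not taken from this commit).
template <typename IntegrationLayer>
class AsyncAdePTTransport {
public:
  ~AsyncAdePTTransport(); // defined in AsyncAdePTTransport.icc, delegates to async_adept_impl::FreeGPU

private:
  AsyncAdePT::GPUstate *fGPUstate = nullptr; // raw pointer now that GPUstate is no longer a unique_ptr
  G4HepEmState *fg4hepem_state = nullptr;    // G4HepEm data, freed via FreeG4HepEmData
  std::thread fGPUWorker;                    // runs async_adept_impl::TransportLoop, joined in FreeGPU
};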
