Skip to content

Commit

Permalink
Deallocate resources before normal program termination
Browse files Browse the repository at this point in the history
At normal program termination, thrust vectors might enter their
destructors when the CUDA primary context manager has already
expired, making it impossible to request the CUDA driver to free
device memory. To avoid this condition, deallocation methods are
wrapped in callbacks that are called by a cleanup class before
normal program termination.
  • Loading branch information
jngrad committed Jun 12, 2023
1 parent 511876f commit 114f588
Show file tree
Hide file tree
Showing 13 changed files with 357 additions and 98 deletions.
70 changes: 0 additions & 70 deletions src/core/cuda/CudaDeviceAllocator.hpp

This file was deleted.

2 changes: 1 addition & 1 deletion src/core/electrostatics/mmm1d_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void CoulombMMM1DGpu::sanity_checks_cell_structure() const {
}

void CoulombMMM1DGpu::tune() {
System::get_system().gpu.init();
System::get_system().gpu.update();
if (this_node == 0) {
setup();
tune(maxPWerror, far_switch_radius, bessel_cutoff);
Expand Down
9 changes: 9 additions & 0 deletions src/core/system/GpuParticleData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,15 @@ void GpuParticleData::copy_particles_to_device() {
copy_particles_to_device(::cell_structure.local_particles(), ::this_node);
}

/** @brief Check whether a CUDA-capable device is available.
 *
 *  Only the head node (MPI rank 0) can talk to the CUDA driver, so the
 *  probe runs there and the verdict is broadcast to all other ranks;
 *  every rank therefore returns the same value.
 */
bool GpuParticleData::has_compatible_device() const {
  bool is_compatible = (::this_node == 0) ? has_compatible_device_impl() : false;
  boost::mpi::broadcast(::comm_cart, is_compatible, 0);
  return is_compatible;
}

BOOST_IS_BITWISE_SERIALIZABLE(GpuParticleData::GpuParticle)

namespace boost {
Expand Down
10 changes: 6 additions & 4 deletions src/core/system/GpuParticleData.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,17 @@ class GpuParticleData {

private:
// forward declare
struct Storage;
class Storage;
/** @brief Whether a device was found and data structures were allocated. */
bool m_communication_enabled = false;
/** @brief Whether to convert particle properties from AoS to SoA. */
bool m_split_particle_struct = false;
/** @brief Whether particle transfer to the GPU was requested. */
bool m_need_particles_update = false;
/** @brief Host and device containers. */
std::unique_ptr<Storage> m_data;
std::shared_ptr<Storage> m_data;

bool has_compatible_device_impl() const;
void gpu_init_particle_comm();
void enable_particle_transfer();
void copy_particles_to_device();
Expand All @@ -101,19 +102,20 @@ class GpuParticleData {
Utils::Span<float> host_torques) const;

public:
GpuParticleData();
GpuParticleData() = default;
~GpuParticleData();

void update() {
if (m_need_particles_update and m_communication_enabled) {
copy_particles_to_device();
}
}
void init() { update(); }
void init();
void enable_property(std::size_t property);
void clear_energy_on_device();
void copy_forces_to_host(ParticleRange const &particles, int this_node);
std::size_t n_particles() const;
bool has_compatible_device() const;

GpuEnergy copy_energy_to_host() const;
GpuEnergy *get_energy_device() const;
Expand Down
73 changes: 59 additions & 14 deletions src/core/system/GpuParticleData_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@
#include "config/config.hpp"

#include "GpuParticleData.hpp"
#include "ResourceCleanup.hpp"
#include "System.hpp"

#include "ParticleRange.hpp"
#include "errorhandling.hpp"

#include "cuda/CudaDeviceAllocator.hpp"
#include "cuda/init.hpp"
#include "cuda/utils.cuh"

Expand All @@ -47,11 +48,7 @@
#error CU-file includes mpi.h! This should not happen!
#endif

template <class T>
using device_vector = thrust::device_vector<T, CudaDeviceAllocator<T>>;

template <class T, class A>
T *raw_data_pointer(thrust::device_vector<T, A> &vec) {
/** @brief Get the raw device pointer underlying a thrust vector.
 *
 *  Thrust device iterators are not raw pointers; CUDA kernels need the
 *  plain device address, hence the @c raw_pointer_cast.
 */
template <class T> T *raw_data_pointer(thrust::device_vector<T> &vec) {
  return thrust::raw_pointer_cast(vec.data());
}

Expand All @@ -77,27 +74,37 @@ template <class SpanLike> std::size_t byte_size(SpanLike const &v) {
* @param n Desired new size of the element.
*/
template <class T>
void resize_or_replace(device_vector<T> &vec, std::size_t n) {
void resize_or_replace(thrust::device_vector<T> &vec, std::size_t n) {
if (vec.capacity() == 0) {
vec = device_vector<T>(n);
vec = thrust::device_vector<T>(n);
} else {
vec.resize(n);
}
}

/** @brief Deallocate a device vector's memory immediately.
 *
 *  @c clear() destroys the elements but keeps the capacity; swapping with
 *  a default-constructed temporary is the standard idiom to actually
 *  release the device allocation right now, rather than whenever the
 *  vector's destructor eventually runs.
 */
template <typename T> void free_device_vector(thrust::device_vector<T> &vec) {
  vec.clear();
  thrust::device_vector<T>().swap(vec);
}

/** @brief Host and device containers for particle data. */
struct GpuParticleData::Storage {
class GpuParticleData::Storage {
void free_device_memory();
using DeviceMemory = ResourceCleanup::Attorney<&Storage::free_device_memory>;
friend DeviceMemory;

public:
/** @brief Which particle properties are needed by GPU methods. */
GpuParticleData::prop::bitset m_need;
GpuParticleData::GpuEnergy *energy_device = nullptr;
std::size_t current_size = 0ul;
pinned_vector<GpuParticle> particle_data_host;
device_vector<GpuParticle> particle_data_device;
thrust::device_vector<GpuParticle> particle_data_device;
pinned_vector<float> particle_forces_host;
device_vector<float> particle_forces_device;
thrust::device_vector<float> particle_forces_device;
#ifdef ROTATION
pinned_vector<float> particle_torques_host;
device_vector<float> particle_torques_device;
thrust::device_vector<float> particle_torques_device;
#endif
float *particle_pos_device = nullptr;
#ifdef DIPOLES
Expand All @@ -107,6 +114,13 @@ struct GpuParticleData::Storage {
float *particle_q_device = nullptr;
#endif

  /** @brief Create a @c Storage object and register its cleanup callback.
   *
   *  The @c DeviceMemory attorney (bound to @c free_device_memory) is pushed
   *  onto the system cleanup queue, so that device buffers are released
   *  before normal program termination — presumably before the CUDA primary
   *  context expires (see commit message; confirm against ResourceCleanup).
   */
  static auto make_shared() {
    auto obj = std::make_shared<GpuParticleData::Storage>();
    System::get_system().cleanup_queue.push<DeviceMemory>(obj);
    return obj;
  }

~Storage() { free_device_memory(); }
void realloc_device_memory();
void split_particle_struct();
void copy_particles_to_device();
Expand Down Expand Up @@ -135,8 +149,8 @@ struct GpuParticleData::Storage {
#endif
};

GpuParticleData::GpuParticleData() {
m_data = std::make_unique<GpuParticleData::Storage>();
/** @brief Allocate the host/device particle containers.
 *
 *  Delegates to @c Storage::make_shared(), which also registers the
 *  device-memory cleanup callback in the system cleanup queue.
 */
void GpuParticleData::init() {
  m_data = GpuParticleData::Storage::make_shared();
}

GpuParticleData::~GpuParticleData() {}
Expand Down Expand Up @@ -184,6 +198,16 @@ void GpuParticleData::enable_property(std::size_t property) {
enable_particle_transfer();
}

bool GpuParticleData::has_compatible_device_impl() const {
auto result = true;
try {
cuda_check_device();
} catch (cuda_runtime_error const &err) {
result = false;
}
return result;
}

/**
* @brief Setup and call particle reallocation from the host.
*/
Expand Down Expand Up @@ -397,3 +421,24 @@ void GpuParticleData::Storage::realloc_device_memory() {
#endif
current_size = new_size;
}

/** @brief Release all device-side buffers owned by this object.
 *
 *  Vectors are emptied via @c free_device_vector() and raw buffers via
 *  @c cudaFree(). Freed raw pointers are reset to @c nullptr, so calling
 *  this method again (e.g. from both the cleanup queue and the destructor)
 *  is a safe no-op the second time.
 */
void GpuParticleData::Storage::free_device_memory() {
  auto const free_device_pointer = [](float *&ptr) {
    if (ptr != nullptr) {
      // static_cast is the correct named cast for T* -> void*
      // (reinterpret_cast is unnecessary for this conversion)
      cuda_safe_mem(cudaFree(static_cast<void *>(ptr)));
      ptr = nullptr;
    }
  };
  free_device_vector(particle_data_device);
  free_device_vector(particle_forces_device);
#ifdef ROTATION
  free_device_vector(particle_torques_device);
#endif
  free_device_pointer(particle_pos_device);
#ifdef DIPOLES
  free_device_pointer(particle_dip_device);
#endif
#ifdef ELECTROSTATICS
  free_device_pointer(particle_q_device);
#endif
}
Loading

0 comments on commit 114f588

Please sign in to comment.