From 12835dbe6fc8f3b79e3c869b2c6e011580423261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 1 Aug 2024 23:41:32 +0200 Subject: [PATCH] progress --- cmake/FindFFTW3.cmake | 14 +- src/core/electrostatics/CMakeLists.txt | 1 - src/core/electrostatics/coulomb.cpp | 5 - src/core/electrostatics/coulomb.hpp | 4 - src/core/electrostatics/elc.cpp | 50 ++-- src/core/electrostatics/elc.hpp | 8 +- src/core/electrostatics/icc.cpp | 9 +- src/core/electrostatics/p3m.cpp | 229 +++++++++++------- src/core/electrostatics/p3m.hpp | 88 +++---- src/core/electrostatics/p3m.impl.hpp | 180 ++++++++++++++ src/core/electrostatics/p3m_gpu.cpp | 75 ------ src/core/electrostatics/p3m_gpu.hpp | 66 ----- src/core/electrostatics/p3m_gpu_cuda.cu | 39 +-- src/core/electrostatics/p3m_gpu_cuda.cuh | 2 +- src/core/fft/fft.cpp | 226 +++++++++++------ src/core/fft/fft.hpp | 51 ++-- src/core/fft/vector.hpp | 36 +-- src/core/forces.cpp | 1 - src/core/magnetostatics/dp3m.cpp | 189 ++++++++------- src/core/magnetostatics/dp3m.hpp | 90 +++---- src/core/magnetostatics/dp3m.impl.hpp | 138 +++++++++++ src/core/p3m/FFTBackendLegacy.cpp | 81 ++++--- src/core/p3m/FFTBackendLegacy.hpp | 37 ++- src/core/p3m/common.hpp | 10 +- src/core/p3m/data_struct.hpp | 54 +++-- src/core/p3m/influence_function.hpp | 16 +- src/core/p3m/influence_function_dipolar.hpp | 19 +- src/core/p3m/send_mesh.cpp | 82 ++++--- src/core/p3m/send_mesh.hpp | 24 +- .../EspressoSystemStandAlone_test.cpp | 94 ++++++- src/core/unit_tests/ParticleFactory.hpp | 16 +- src/core/unit_tests/fft_test.cpp | 4 +- src/python/espressomd/electrostatics.py | 8 + src/python/espressomd/magnetostatics.py | 3 + .../{Actor_impl.hpp => Actor.impl.hpp} | 0 .../electrostatics/CoulombP3M.hpp | 44 +++- .../electrostatics/CoulombP3MGPU.hpp | 103 -------- .../ElectrostaticLayerCorrection.hpp | 29 +-- .../electrostatics/initialize.cpp | 7 +- .../{Actor_impl.hpp => Actor.impl.hpp} | 0 .../magnetostatics/DipolarLayerCorrection.hpp | 27 ++- .../magnetostatics/DipolarP3M.hpp | 39 ++- .../magnetostatics/initialize.cpp | 5 +- src/script_interface/tests/Actors_test.cpp | 4 +- testsuite/python/coulomb_interface.py | 4 + testsuite/python/p3m_madelung.py | 113 +++++---- 46 files changed, 1378 insertions(+), 946 deletions(-) create mode 100644 src/core/electrostatics/p3m.impl.hpp delete mode 100644 src/core/electrostatics/p3m_gpu.cpp delete mode 100644 src/core/electrostatics/p3m_gpu.hpp create mode 100644 src/core/magnetostatics/dp3m.impl.hpp rename src/script_interface/electrostatics/{Actor_impl.hpp => Actor.impl.hpp} (100%) delete mode 100644 src/script_interface/electrostatics/CoulombP3MGPU.hpp rename src/script_interface/magnetostatics/{Actor_impl.hpp => Actor.impl.hpp} (100%) diff --git a/cmake/FindFFTW3.cmake b/cmake/FindFFTW3.cmake index a4bdae4f110..3864db4df24 100644 --- a/cmake/FindFFTW3.cmake +++ b/cmake/FindFFTW3.cmake @@ -31,25 +31,27 @@ if(FFTW3_INCLUDE_DIR) endif(FFTW3_INCLUDE_DIR) find_path(FFTW3_INCLUDE_DIR fftw3.h) -find_library(FFTW3_LIBRARIES NAMES fftw3) +find_library(FFTW3_LIBRARIES fftw3) +find_library(FFTW3F_LIBRARIES fftw3f) find_path(FFTW3_MPI_INCLUDE_DIR fftw3-mpi.h) -find_library(FFTW3_MPI_LIBRARIES NAMES fftw3_mpi) +find_library(FFTW3_MPI_LIBRARIES fftw3_mpi) +find_library(FFTW3F_MPI_LIBRARIES fftw3f_mpi) # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if all # listed variables are TRUE include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARIES 
+find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARIES FFTW3F_LIBRARIES FFTW3_INCLUDE_DIR) set(FPHSA_NAME_MISMATCHED 1) -find_package_handle_standard_args(FFTW3_MPI DEFAULT_MSG FFTW3_MPI_LIBRARIES +find_package_handle_standard_args(FFTW3_MPI DEFAULT_MSG FFTW3_MPI_LIBRARIES FFTW3F_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR) unset(FPHSA_NAME_MISMATCHED) -mark_as_advanced(FFTW3_LIBRARIES FFTW3_INCLUDE_DIR FFTW3_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR) +mark_as_advanced(FFTW3_LIBRARIES FFTW3F_LIBRARIES FFTW3_INCLUDE_DIR FFTW3_MPI_LIBRARIES FFTW3F_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR) if(FFTW3_FOUND AND NOT TARGET FFTW3::FFTW3) add_library(FFTW3::FFTW3 INTERFACE IMPORTED) target_include_directories(FFTW3::FFTW3 INTERFACE "${FFTW3_INCLUDE_DIR}") - target_link_libraries(FFTW3::FFTW3 INTERFACE "${FFTW3_LIBRARIES}") + target_link_libraries(FFTW3::FFTW3 INTERFACE "${FFTW3_LIBRARIES}" "${FFTW3F_LIBRARIES}") endif() diff --git a/src/core/electrostatics/CMakeLists.txt b/src/core/electrostatics/CMakeLists.txt index e188a1d1065..1c56350e265 100644 --- a/src/core/electrostatics/CMakeLists.txt +++ b/src/core/electrostatics/CMakeLists.txt @@ -24,6 +24,5 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/mmm1d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mmm-modpsi.cpp ${CMAKE_CURRENT_SOURCE_DIR}/p3m.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/p3m_gpu.cpp ${CMAKE_CURRENT_SOURCE_DIR}/scafacos_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/specfunc.cpp) diff --git a/src/core/electrostatics/coulomb.cpp b/src/core/electrostatics/coulomb.cpp index b7bc1eac555..14282a8b17e 100644 --- a/src/core/electrostatics/coulomb.cpp +++ b/src/core/electrostatics/coulomb.cpp @@ -207,11 +207,6 @@ struct LongRangeForce { void operator()(std::shared_ptr const &actor) const { actor->add_long_range_forces(m_particles); } -#ifdef CUDA - void operator()(std::shared_ptr const &actor) const { - actor->add_long_range_forces(m_particles); - } -#endif // CUDA void operator()(std::shared_ptr const &actor) const { actor->add_long_range_forces(m_particles); diff --git a/src/core/electrostatics/coulomb.hpp b/src/core/electrostatics/coulomb.hpp index 93eb9302706..437eafce750 100644 --- a/src/core/electrostatics/coulomb.hpp +++ b/src/core/electrostatics/coulomb.hpp @@ -32,7 +32,6 @@ #include "electrostatics/icc.hpp" #include "electrostatics/mmm1d.hpp" #include "electrostatics/p3m.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "electrostatics/reaction_field.hpp" #include "electrostatics/scafacos.hpp" @@ -49,9 +48,6 @@ using ElectrostaticsActor = std::variant, #ifdef P3M std::shared_ptr, -#ifdef CUDA - std::shared_ptr, -#endif // CUDA std::shared_ptr, #endif // P3M std::shared_ptr, diff --git a/src/core/electrostatics/elc.cpp b/src/core/electrostatics/elc.cpp index 8ef7d5366af..ebb96405f7c 100644 --- a/src/core/electrostatics/elc.cpp +++ b/src/core/electrostatics/elc.cpp @@ -27,7 +27,6 @@ #include "electrostatics/coulomb.hpp" #include "electrostatics/p3m.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "BoxGeometry.hpp" #include "Particle.hpp" @@ -1111,23 +1110,20 @@ static void p3m_assign_image_charge(elc_data const &elc, CoulombP3M &p3m, double q, Utils::Vector3d const &pos) { if (pos[2] < elc.space_layer) { auto const q_eff = elc.delta_mid_bot * q; - p3m.assign_charge(q_eff, {pos[0], pos[1], -pos[2]}); + p3m.assign_charge(q_eff, {pos[0], pos[1], -pos[2]}, true); } if (pos[2] > (elc.box_h - elc.space_layer)) { auto const q_eff = elc.delta_mid_top * q; - p3m.assign_charge(q_eff, {pos[0], pos[1], 2. * elc.box_h - pos[2]}); + p3m.assign_charge(q_eff, {pos[0], pos[1], 2. 
* elc.box_h - pos[2]}, true); } } template void charge_assign(elc_data const &elc, CoulombP3M &solver, combined_ranges const &p_q_pos_range) { - if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::IMAGE) { - solver.p3m.inter_weights.reset(solver.p3m.params.cao); - } - /* prepare local FFT mesh */ - for (int i = 0; i < solver.p3m.local_mesh.size; i++) - solver.p3m.mesh.rs_scalar[i] = 0.; + + solver.assign_charge_prepare_elc(protocol == ChargeProtocol::BOTH or + protocol == ChargeProtocol::IMAGE); for (auto zipped : p_q_pos_range) { auto const p_q = boost::get<0>(zipped); @@ -1135,7 +1131,7 @@ void charge_assign(elc_data const &elc, CoulombP3M &solver, if (p_q != 0.) { if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::REAL) { - solver.assign_charge(p_q, p_pos, solver.p3m.inter_weights); + solver.assign_charge(p_q, p_pos); } if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::IMAGE) { @@ -1149,7 +1145,9 @@ template void modify_p3m_sums(elc_data const &elc, CoulombP3M &solver, combined_range const &p_q_pos_range) { - Utils::Vector3d node_sums{}; + auto local_n = 0; + auto local_q2 = 0.0; + auto local_q = 0.0; for (auto zipped : p_q_pos_range) { auto const p_q = boost::get<0>(zipped); auto const &p_pos = boost::get<1>(zipped); @@ -1158,33 +1156,35 @@ void modify_p3m_sums(elc_data const &elc, CoulombP3M &solver, if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::REAL) { - node_sums[0] += 1.; - node_sums[1] += Utils::sqr(p_q); - node_sums[2] += p_q; + local_n++; + local_q2 += Utils::sqr(p_q); + local_q += p_q; } if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::IMAGE) { if (p_z < elc.space_layer) { - node_sums[0] += 1.; - node_sums[1] += Utils::sqr(elc.delta_mid_bot * p_q); - node_sums[2] += elc.delta_mid_bot * p_q; + local_n++; + local_q2 += Utils::sqr(elc.delta_mid_bot * p_q); + local_q += elc.delta_mid_bot * p_q; } if (p_z > (elc.box_h - elc.space_layer)) { - node_sums[0] += 1.; - node_sums[1] += Utils::sqr(elc.delta_mid_top * p_q); - node_sums[2] += elc.delta_mid_top * p_q; + local_n++; + local_q2 += Utils::sqr(elc.delta_mid_top * p_q); + local_q += elc.delta_mid_top * p_q; } } } } - auto const tot_sums = - boost::mpi::all_reduce(comm_cart, node_sums, std::plus<>()); - solver.p3m.sum_qpart = static_cast(tot_sums[0] + 0.1); - solver.p3m.sum_q2 = tot_sums[1]; - solver.p3m.square_sum_q = Utils::sqr(tot_sums[2]); + auto global_n = 0; + auto global_q2 = 0.; + auto global_q = 0.; + boost::mpi::all_reduce(comm_cart, local_n, global_n, std::plus<>()); + boost::mpi::all_reduce(comm_cart, local_q2, global_q2, std::plus<>()); + boost::mpi::all_reduce(comm_cart, local_q, global_q, std::plus<>()); + solver.count_charged_particles_elc(global_n, global_q2, Utils::sqr(global_q)); } double ElectrostaticLayerCorrection::long_range_energy( diff --git a/src/core/electrostatics/elc.hpp b/src/core/electrostatics/elc.hpp index b6e8c606114..2a7fe034064 100644 --- a/src/core/electrostatics/elc.hpp +++ b/src/core/electrostatics/elc.hpp @@ -37,13 +37,13 @@ #include "actor/traits.hpp" #include "electrostatics/p3m.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "BoxGeometry.hpp" #include "Particle.hpp" #include "ParticleRange.hpp" #include +#include #include #include @@ -166,11 +166,7 @@ struct elc_data { struct ElectrostaticLayerCorrection : public Coulomb::Actor { - using BaseSolver = std::variant< -#ifdef CUDA - std::shared_ptr, -#endif // CUDA - std::shared_ptr>; + using BaseSolver = std::variant>; elc_data elc; BoxGeometry 
*m_box_geo; diff --git a/src/core/electrostatics/icc.cpp b/src/core/electrostatics/icc.cpp index 83aa02ea25e..8d138579dbd 100644 --- a/src/core/electrostatics/icc.cpp +++ b/src/core/electrostatics/icc.cpp @@ -40,6 +40,8 @@ #include "communication.hpp" #include "electrostatics/coulomb.hpp" #include "electrostatics/coulomb_inline.hpp" +#include "electrostatics/p3m.hpp" +#include "electrostatics/p3m.impl.hpp" #include "errorhandling.hpp" #include "integrators/Propagation.hpp" #include "system/System.hpp" @@ -248,8 +250,11 @@ struct SanityChecksICC { template void operator()(std::shared_ptr const &) const {} #ifdef P3M #ifdef CUDA - [[noreturn]] void operator()(std::shared_ptr const &) const { - throw std::runtime_error("ICC does not work with P3MGPU"); + void operator()(std::shared_ptr const &p) const { + if (dynamic_cast *>(p.get()) != nullptr or + dynamic_cast *>(p.get()) != nullptr) { + throw std::runtime_error("ICC does not work with P3MGPU"); + } } #endif // CUDA void diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp index 5e604a4fd51..379bd66320a 100644 --- a/src/core/electrostatics/p3m.cpp +++ b/src/core/electrostatics/p3m.cpp @@ -24,14 +24,19 @@ * The corresponding header file is @ref p3m.hpp. */ -#include "electrostatics/p3m.hpp" +#include "config/config.hpp" #ifdef P3M +#include "electrostatics/p3m.hpp" +#include "electrostatics/p3m.impl.hpp" + #include "electrostatics/coulomb.hpp" #include "electrostatics/elc.hpp" -#include "electrostatics/p3m_gpu.hpp" +#ifdef CUDA +#include "electrostatics/p3m_gpu_cuda.cuh" #include "electrostatics/p3m_gpu_error.hpp" +#endif // CUDA #include "fft/fft.hpp" #include "p3m/TuningAlgorithm.hpp" @@ -53,6 +58,7 @@ #include "errorhandling.hpp" #include "integrators/Propagation.hpp" #include "npt.hpp" +#include "system/GpuParticleData.hpp" #include "system/System.hpp" #include "tuning.hpp" @@ -84,7 +90,8 @@ #include #include -void CoulombP3M::count_charged_particles() { +template +void CoulombP3MImpl::count_charged_particles() { auto local_n = 0; auto local_q2 = 0.0; auto local_q = 0.0; @@ -112,21 +119,23 @@ void CoulombP3M::count_charged_particles() { * different convention for the prefactors, which is described in * @cite deserno98a @cite deserno98b. */ -void CoulombP3M::calc_influence_function_force() { +template +void CoulombP3MImpl::calc_influence_function_force() { auto const [KX, KY, KZ] = p3m.fft->get_permutations(); - p3m.g_force = - grid_influence_function<1>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, - KY, KZ, get_system().box_geo->length_inv()); + p3m.g_force = grid_influence_function( + p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ, + get_system().box_geo->length_inv()); } /** Calculate the influence function optimized for the energy and the * self energy correction. */ -void CoulombP3M::calc_influence_function_energy() { +template +void CoulombP3MImpl::calc_influence_function_energy() { auto const [KX, KY, KZ] = p3m.fft->get_permutations(); - p3m.g_energy = - grid_influence_function<0>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, - KY, KZ, get_system().box_geo->length_inv()); + p3m.g_energy = grid_influence_function( + p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ, + get_system().box_geo->length_inv()); } /** Aliasing sum used by @ref p3m_k_space_error. 
*/ @@ -229,7 +238,8 @@ static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh, (box_l[1] * box_l[2]); } -void CoulombP3M::init() { +template +void CoulombP3MImpl::init_cpu_kernels() { assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1)); assert(p3m.params.cao >= 1 and p3m.params.cao <= 7); assert(p3m.params.alpha > 0.); @@ -252,6 +262,7 @@ void CoulombP3M::init() { assert(p3m.fft); p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo, skin, elc_layer); + p3m.fft->init_halo(); p3m.fft->init_fft(); p3m.calc_differential_operator(); @@ -261,48 +272,33 @@ void CoulombP3M::init() { count_charged_particles(); } -CoulombP3M::CoulombP3M(P3MParameters &¶meters, double prefactor, - int tune_timings, bool tune_verbose, - bool check_complex_residuals) - : p3m{std::move(parameters)}, tune_timings{tune_timings}, - tune_verbose{tune_verbose}, - check_complex_residuals{check_complex_residuals} { - - if (tune_timings <= 0) { - throw std::domain_error("Parameter 'timings' must be > 0"); - } - m_is_tuned = !p3m.params.tuning; - p3m.params.tuning = false; - set_prefactor(prefactor); -} - namespace { template struct AssignCharge { - void operator()(decltype(CoulombP3M::p3m) &p3m, double q, - Utils::Vector3d const &real_pos, + void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos, + InterpolationWeights const &w) { + using value_type = + typename std::remove_reference_t::value_type; + p3m_interpolate(p3m.local_mesh, w, [q, &p3m](int ind, double w) { + p3m.mesh.rs_scalar[ind] += value_type(w * q); + }); + } + + void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos, p3m_interpolation_cache &inter_weights) { auto const w = p3m_calculate_interpolation_weights( real_pos, p3m.params.ai, p3m.local_mesh); - inter_weights.store(w); - - p3m_interpolate(p3m.local_mesh, w, [q, &p3m](int ind, double w) { - p3m.mesh.rs_scalar[ind] += w * q; - }); + this->operator()(p3m, q, real_pos, w); } - void operator()(decltype(CoulombP3M::p3m) &p3m, double q, - Utils::Vector3d const &real_pos) { - p3m_interpolate( - p3m.local_mesh, - p3m_calculate_interpolation_weights(real_pos, p3m.params.ai, - p3m.local_mesh), - [q, &p3m](int ind, double w) { p3m.mesh.rs_scalar[ind] += w * q; }); + void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos) { + auto const w = p3m_calculate_interpolation_weights( + real_pos, p3m.params.ai, p3m.local_mesh); + this->operator()(p3m, q, real_pos, w); } template - void operator()(decltype(CoulombP3M::p3m) &p3m, - combined_ranges const &p_q_pos_range) { + void operator()(auto &p3m, combined_ranges const &p_q_pos_range) { for (auto zipped : p_q_pos_range) { auto const p_q = boost::get<0>(zipped); auto const &p_pos = boost::get<1>(zipped); @@ -314,12 +310,14 @@ template struct AssignCharge { }; } // namespace -void CoulombP3M::charge_assign(ParticleRange const &particles) { +template +void CoulombP3MImpl::charge_assign( + ParticleRange const &particles) { p3m.inter_weights.reset(p3m.params.cao); /* prepare local FFT mesh */ for (int i = 0; i < p3m.local_mesh.size; i++) - p3m.mesh.rs_scalar[i] = 0.0; + p3m.mesh.rs_scalar[i] = FloatType(0); auto p_q_range = ParticlePropertyRange::charge_range(particles); auto p_pos_range = ParticlePropertyRange::pos_range(particles); @@ -328,20 +326,21 @@ void CoulombP3M::charge_assign(ParticleRange const &particles) { p3m.params.cao, p3m, boost::combine(p_q_range, p_pos_range)); } -void CoulombP3M::assign_charge(double q, Utils::Vector3d const &real_pos, - p3m_interpolation_cache &inter_weights) { - 
Utils::integral_parameter(p3m.params.cao, p3m, q, - real_pos, inter_weights); -} - -void CoulombP3M::assign_charge(double q, Utils::Vector3d const &real_pos) { - Utils::integral_parameter(p3m.params.cao, p3m, q, - real_pos); +template +void CoulombP3MImpl::assign_charge( + double q, Utils::Vector3d const &real_pos, bool skip_cache) { + if (skip_cache) { + Utils::integral_parameter(p3m.params.cao, p3m, q, + real_pos); + } else { + Utils::integral_parameter( + p3m.params.cao, p3m, q, real_pos, p3m.inter_weights); + } } template struct AssignForces { template - void operator()(decltype(CoulombP3M::p3m) &p3m, double force_prefac, + void operator()(auto &p3m, double force_prefac, combined_ranges const &p_q_force_range) const { assert(cao == p3m.inter_weights.cao()); @@ -354,13 +353,13 @@ template struct AssignForces { auto &p_force = boost::get<1>(zipped); if (p_q != 0.0) { auto const pref = p_q * force_prefac; - auto const w = p3m.inter_weights.load(p_index); + auto const w = p3m.inter_weights.template load(p_index); Utils::Vector3d force{}; p3m_interpolate(p3m.local_mesh, w, [&force, &p3m](int ind, double w) { - force += w * Utils::Vector3d{p3m.mesh.rs_fields[0u][ind], - p3m.mesh.rs_fields[1u][ind], - p3m.mesh.rs_fields[2u][ind]}; + force[0u] += w * double(p3m.mesh.rs_fields[0u][ind]); + force[1u] += w * double(p3m.mesh.rs_fields[1u][ind]); + force[2u] += w * double(p3m.mesh.rs_fields[2u][ind]); }); p_force -= pref * force; @@ -388,14 +387,16 @@ static auto calc_dipole_moment(boost::mpi::communicator const &comm, * in @cite essmann95a. The part \f$\Pi_{\textrm{corr}, \alpha, \beta}\f$ * eq. (2.8) is not present here since M is the empty set in our simulations. */ -Utils::Vector9d -CoulombP3M::long_range_pressure(ParticleRange const &particles) { +template +Utils::Vector9d CoulombP3MImpl::long_range_pressure( + ParticleRange const &particles) { auto const &box_geo = *get_system().box_geo; Utils::Vector9d node_k_space_pressure_tensor{}; if (p3m.sum_q2 > 0.) { charge_assign(particles); - p3m.fft->perform_fwd_fft(); + p3m.fft->perform_scalar_halo_gather(); + p3m.fft->perform_scalar_fwd_fft(); auto constexpr mesh_start = Utils::Vector3i::broadcast(0); auto const &mesh_stop = p3m.mesh.size; @@ -416,9 +417,9 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) { if (norm_sq != 0.) { auto const node_k_space_energy = - p3m.g_energy[index] * - (Utils::sqr(p3m.mesh.rs_scalar[2u * index + 0u]) + - Utils::sqr(p3m.mesh.rs_scalar[2u * index + 1u])); + double(p3m.g_energy[index] * + (Utils::sqr(p3m.mesh.rs_scalar[2u * index + 0u]) + + Utils::sqr(p3m.mesh.rs_scalar[2u * index + 1u]))); auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq); auto const pref = node_k_space_energy * vterm; node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */ @@ -443,8 +444,9 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) { return node_k_space_pressure_tensor * prefactor / (2. 
* box_geo.volume()); } -double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles) { +template +double CoulombP3MImpl::long_range_kernel( + bool force_flag, bool energy_flag, ParticleRange const &particles) { auto const &system = get_system(); auto const &box_geo = *system.box_geo; #ifdef NPT @@ -459,7 +461,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, system.coulomb.impl->solver)) { charge_assign(particles); } - p3m.fft->perform_fwd_fft(); + p3m.fft->perform_scalar_halo_gather(); + p3m.fft->perform_scalar_fwd_fft(); } auto p_q_range = ParticlePropertyRange::charge_range(particles); @@ -485,7 +488,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, auto constexpr mesh_start = Utils::Vector3i::broadcast(0); auto const &mesh_stop = p3m.mesh.size; auto const &offset = p3m.mesh.start; - auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv(); + auto const wavevector = Utils::Vector3((2. * std::numbers::pi) * + box_geo.length_inv()); auto indices = Utils::Vector3i{}; auto index = std::size_t(0u); @@ -493,16 +497,16 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, // Eq. (3.49) @cite deserno00b for_each_3d(mesh_start, mesh_stop, indices, [&]() { auto const rho_hat = - std::complex(p3m.mesh.rs_scalar[2u * index + 0u], - p3m.mesh.rs_scalar[2u * index + 1u]); + std::complex(p3m.mesh.rs_scalar[2u * index + 0u], + p3m.mesh.rs_scalar[2u * index + 1u]); auto const phi_hat = p3m.g_force[index] * rho_hat; for (int d = 0; d < 3; d++) { /* direction in r-space: */ int d_rs = (d + p3m.mesh.ks_pnum) % 3; /* directions */ - auto const k = - p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs]; + auto const k = FloatType(p3m.d_op[d_rs][indices[d] + offset[d]]) * + wavevector[d_rs]; /* i*k*(Re+i*Im) = - Im*k + i*Re*k (i=sqrt(-1)) */ p3m.mesh.rs_fields[d_rs][2u * index + 0u] = -k * phi_hat.imag(); @@ -515,7 +519,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, auto const check_residuals = not p3m.params.tuning and check_complex_residuals; p3m.fft->check_complex_residuals = check_residuals; - p3m.fft->perform_field_back_fft(); + p3m.fft->perform_vector_back_fft(); + p3m.fft->perform_vector_halo_spread(); p3m.fft->check_complex_residuals = false; auto const force_prefac = prefactor / volume; @@ -543,8 +548,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, // Use the energy optimized influence function for energy! // Eq. (3.40) @cite deserno00b node_energy += - p3m.g_energy[i] * (Utils::sqr(p3m.mesh.rs_scalar[2 * i]) + - Utils::sqr(p3m.mesh.rs_scalar[2 * i + 1])); + double(p3m.g_energy[i] * (Utils::sqr(p3m.mesh.rs_scalar[2 * i + 0]) + + Utils::sqr(p3m.mesh.rs_scalar[2 * i + 1]))); } node_energy /= 2. 
* volume; @@ -579,8 +584,9 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, return 0.; } +template class CoulombTuningAlgorithm : public TuningAlgorithm { - decltype(CoulombP3M::p3m) &p3m; + p3m_data_struct_coulomb &p3m; double m_mesh_density_min = -1., m_mesh_density_max = -1.; // indicates if mesh should be tuned bool m_tune_mesh = false; @@ -589,7 +595,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { P3MParameters &get_params() override { return p3m.params; } public: - CoulombTuningAlgorithm(System::System &system, decltype(p3m) &input_p3m, + CoulombTuningAlgorithm(System::System &system, auto &input_p3m, double prefactor, int timings) : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m} {} @@ -598,8 +604,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { void setup_logger(bool verbose) override { auto const &box_geo = *m_system.box_geo; #ifdef CUDA - auto const &solver = m_system.coulomb.impl->solver; - auto const on_gpu = has_actor_of_type(solver); + auto const on_gpu = Architecture == Arch::GPU; #else auto const on_gpu = false; #endif @@ -646,8 +651,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { rs_err = p3m_real_space_error(m_prefactor, r_cut_iL, p3m.sum_qpart, p3m.sum_q2, alpha_L, box_geo.length()); #ifdef CUDA - auto const &solver = m_system.coulomb.impl->solver; - if (has_actor_of_type(solver)) { + if constexpr (Architecture == Arch::GPU) { if (this_node == 0) { ks_err = p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart, @@ -743,7 +747,8 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { } }; -void CoulombP3M::tune() { +template +void CoulombP3MImpl::tune() { auto &system = get_system(); auto const &box_geo = *system.box_geo; if (p3m.params.alpha_L == 0. and p3m.params.alpha != 0.) { @@ -759,7 +764,8 @@ void CoulombP3M::tune() { "CoulombP3M: no charged particles in the system"); } try { - CoulombTuningAlgorithm parameters(system, p3m, prefactor, tune_timings); + CoulombTuningAlgorithm parameters( + system, p3m, prefactor, tune_timings); parameters.setup_logger(tune_verbose); // parameter ranges parameters.determine_mesh_limits(); @@ -850,4 +856,59 @@ void CoulombP3M::scaleby_box_l() { calc_influence_function_energy(); } +#ifdef CUDA +template +void CoulombP3MImpl::add_long_range_forces_gpu( + ParticleRange const &particles) { + if constexpr (Architecture == Arch::GPU) { +#ifdef NPT + if (get_system().propagation->integ_switch == INTEG_METHOD_NPT_ISO) { + auto const energy = long_range_energy(particles); + npt_add_virial_contribution(energy); + } +#else + static_cast(particles); +#endif + if (this_node == 0) { + auto &gpu = get_system().gpu; + p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor, + gpu.n_particles()); + } + } +} + +/* Initialize the CPU kernels. + * This operation is time-consuming and sets up data members + * that are only relevant for ELC force corrections, since the + * GPU implementation uses CPU kernels to compute energies. 
+ */ +template +void CoulombP3MImpl::init_gpu_kernels() { + if constexpr (Architecture == Arch::GPU) { + auto &system = get_system(); + if (has_actor_of_type( + system.coulomb.impl->solver)) { + init_cpu_kernels(); + } + p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh, p3m.params.alpha, + system.box_geo->length(), system.gpu.n_particles()); + } +} + +template +void CoulombP3MImpl::request_gpu() const { + if constexpr (Architecture == Arch::GPU) { + auto &gpu_particle_data = get_system().gpu; + gpu_particle_data.enable_property(GpuParticleData::prop::force); + gpu_particle_data.enable_property(GpuParticleData::prop::q); + gpu_particle_data.enable_property(GpuParticleData::prop::pos); + } +} +#endif // CUDA + +template struct CoulombP3MImpl; +template struct CoulombP3MImpl; +template struct CoulombP3MImpl; +template struct CoulombP3MImpl; + #endif // P3M diff --git a/src/core/electrostatics/p3m.hpp b/src/core/electrostatics/p3m.hpp index de7bbdee9fa..b33b8dad4cb 100644 --- a/src/core/electrostatics/p3m.hpp +++ b/src/core/electrostatics/p3m.hpp @@ -42,57 +42,50 @@ #include "p3m/common.hpp" #include "p3m/data_struct.hpp" -#include "p3m/interpolation.hpp" -#include "p3m/send_mesh.hpp" #include "ParticleRange.hpp" #include #include -#include #include #include -#include +#include /** @brief P3M solver. */ struct CoulombP3M : public Coulomb::Actor { - struct p3m_data_struct_impl : public p3m_data_struct { - explicit p3m_data_struct_impl(P3MParameters &¶meters) - : p3m_data_struct{std::move(parameters)} {} - - /** number of charged particles (only on head node). */ - int sum_qpart = 0; - /** Sum of square of charges (only on head node). */ - double sum_q2 = 0.; - /** square of sum of charges (only on head node). */ - double square_sum_q = 0.; - - p3m_interpolation_cache inter_weights; - }; - - /** P3M parameters. */ - p3m_data_struct_impl p3m; + /** @brief Coulomb P3M parameters. */ + p3m_data_struct &p3m; int tune_timings; bool tune_verbose; bool check_complex_residuals; -private: +protected: bool m_is_tuned; public: - CoulombP3M(P3MParameters &¶meters, double prefactor, int tune_timings, - bool tune_verbose, bool check_complex_residuals); + CoulombP3M(p3m_data_struct &p3m_data, double prefactor, int tune_timings, + bool tune_verbose, bool check_complex_residuals) + : p3m{p3m_data}, tune_timings{tune_timings}, tune_verbose{tune_verbose}, + check_complex_residuals{check_complex_residuals} { + + if (tune_timings <= 0) { + throw std::domain_error("Parameter 'timings' must be > 0"); + } + m_is_tuned = not p3m.params.tuning; + p3m.params.tuning = false; + set_prefactor(prefactor); + } + + virtual ~CoulombP3M() = default; bool is_tuned() const { return m_is_tuned; } /** @brief Recalculate all derived parameters. */ - void init(); - void on_activation() { - sanity_checks(); - tune(); - } + virtual void init() = 0; + virtual void on_activation() = 0; + /** @brief Recalculate all box-length-dependent parameters. */ void on_boxl_change() { scaleby_box_l(); } void on_node_grid_change() const { sanity_checks_node_grid(); } @@ -113,7 +106,8 @@ struct CoulombP3M : public Coulomb::Actor { * Count the number of charged particles and calculate * the sum of the squared charges. */ - void count_charged_particles(); + virtual void count_charged_particles() = 0; + virtual void count_charged_particles_elc(int, double, double) = 0; /** * @brief Tune P3M parameters to desired accuracy. 
@@ -146,24 +140,24 @@ struct CoulombP3M : public Coulomb::Actor { * The function is based on routines of the program HE_Q.cpp written by M. * Deserno. */ - void tune(); + virtual void tune() = 0; /** Assign the physical charges using the tabulated charge assignment * function. */ - void charge_assign(ParticleRange const &particles); + virtual void charge_assign(ParticleRange const &particles) = 0; /** * @brief Assign a single charge into the current charge grid. * * @param[in] q Particle charge * @param[in] real_pos Particle position in real space - * @param[out] inter_weights Cached interpolation weights to be used. + * @param[in] skip_cache Skip interpolation weights caching. */ - void assign_charge(double q, Utils::Vector3d const &real_pos, - p3m_interpolation_cache &inter_weights); - /** @overload */ - void assign_charge(double q, Utils::Vector3d const &real_pos); + virtual void assign_charge(double q, Utils::Vector3d const &real_pos, + bool skip_cache = false) = 0; + + virtual void assign_charge_prepare_elc(bool images) = 0; /** Calculate real-space contribution of p3m Coulomb pair forces. */ Utils::Vector3d pair_force(double q1q2, Utils::Vector3d const &d, @@ -203,25 +197,17 @@ struct CoulombP3M : public Coulomb::Actor { } /** Compute the k-space part of the pressure tensor */ - Utils::Vector9d long_range_pressure(ParticleRange const &particles); + virtual Utils::Vector9d long_range_pressure(ParticleRange const &) = 0; /** Compute the k-space part of energies. */ - double long_range_energy(ParticleRange const &particles) { - return long_range_kernel(false, true, particles); - } + virtual double long_range_energy(ParticleRange const &) = 0; /** Compute the k-space part of forces. */ - void add_long_range_forces(ParticleRange const &particles) { - long_range_kernel(true, false, particles); - } - - /** Compute the k-space part of forces and energies. */ - double long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles); + virtual void add_long_range_forces(ParticleRange const &) = 0; -private: - void calc_influence_function_force(); - void calc_influence_function_energy(); +protected: + virtual void calc_influence_function_force() = 0; + virtual void calc_influence_function_energy() = 0; /** Checks for correctness of the k-space cutoff. */ void sanity_checks_boxl() const; @@ -229,7 +215,7 @@ struct CoulombP3M : public Coulomb::Actor { void sanity_checks_periodicity() const; void sanity_checks_cell_structure() const; - void scaleby_box_l(); + virtual void scaleby_box_l(); }; #endif // P3M diff --git a/src/core/electrostatics/p3m.impl.hpp b/src/core/electrostatics/p3m.impl.hpp new file mode 100644 index 00000000000..e8faab77d25 --- /dev/null +++ b/src/core/electrostatics/p3m.impl.hpp @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010-2024 The ESPResSo project + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 + * Max-Planck-Institute for Polymer Research, Theory Group + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** @file + * P3M algorithm for long-range Coulomb interaction. + * + * We use a P3M (Particle-Particle Particle-Mesh) method based on the + * Ewald summation. Details of the used method can be found in + * @cite hockney88a and @cite deserno98a @cite deserno98b. + * + * Further reading: @cite ewald21a, @cite hockney88a, @cite deserno98a, + * @cite deserno98b, @cite deserno00e, @cite deserno00b, @cite cerda08d. + * + * Implementation in p3m.cpp. + */ + +#pragma once + +#include "config/config.hpp" + +#ifdef P3M + +#include "electrostatics/p3m.hpp" + +#include "p3m/common.hpp" +#include "p3m/data_struct.hpp" +#include "p3m/interpolation.hpp" + +#include "ParticleRange.hpp" + +#include + +#include +#include + +template +struct p3m_data_struct_coulomb : public p3m_data_struct_fft { + using p3m_data_struct_fft::p3m_data_struct_fft; + + /** number of charged particles (only on head node). */ + int sum_qpart = 0; + /** Sum of square of charges (only on head node). */ + double sum_q2 = 0.; + /** square of sum of charges (only on head node). */ + double square_sum_q = 0.; + + p3m_interpolation_cache inter_weights; +}; + +#ifdef CUDA +struct P3MGpuParams; +#endif + +template +struct CoulombP3MImpl : public CoulombP3M { + ~CoulombP3MImpl() override = default; + + /** @brief Coulomb P3M meshes and FFT algorithm. */ + std::unique_ptr> p3m_impl; + // this member overshadows its own base class pointer in the parent class + p3m_data_struct_coulomb &p3m; + + template + CoulombP3MImpl( + std::unique_ptr> &&p3m_handle, + Args &&...args) + : CoulombP3M(*p3m_handle, std::forward(args)...), + p3m_impl{std::move(p3m_handle)}, p3m{*p3m_impl} {} + + void init() override { + if constexpr (Architecture == Arch::CPU) { + init_cpu_kernels(); + } +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + init_gpu_kernels(); + } +#endif + } + void tune() override; + void count_charged_particles() override; + void count_charged_particles_elc(int n, double sum_q2, + double square_sum_q) override { + p3m.sum_qpart = n; + p3m.sum_q2 = sum_q2; + p3m.square_sum_q = square_sum_q; + } + + void on_activation() override { +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + request_gpu(); + } +#endif + sanity_checks(); + tune(); +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + if (is_tuned()) { + init_cpu_kernels(); + } + } +#endif + } + + double long_range_energy(ParticleRange const &particles) override { + return long_range_kernel(false, true, particles); + } + + void add_long_range_forces(ParticleRange const &particles) override { + if constexpr (Architecture == Arch::CPU) { + long_range_kernel(true, false, particles); + } +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + add_long_range_forces_gpu(particles); + } +#endif + } + + Utils::Vector9d long_range_pressure(ParticleRange const &particles) override; + + void charge_assign(ParticleRange const &particles) override; + void assign_charge(double q, Utils::Vector3d const &real_pos, + bool skip_cache = false) override; + void assign_charge_prepare_elc(bool images) override { + if (images) { + p3m.inter_weights.reset(p3m.params.cao); + } + /* prepare local FFT mesh */ + for (int i = 0; i < p3m.local_mesh.size; i++) { + p3m.mesh.rs_scalar[i] = FloatType(0); + } + } + +protected: + /** Compute the k-space part of forces and energies. 
*/ + double long_range_kernel(bool force_flag, bool energy_flag, + ParticleRange const &particles); + void calc_influence_function_force() override; + void calc_influence_function_energy() override; + void init_cpu_kernels(); +#ifdef CUDA + void init_gpu_kernels(); + void add_long_range_forces_gpu(ParticleRange const &particles); + std::shared_ptr m_gpu_data = nullptr; + void request_gpu() const; +#endif +}; + +template class FFTBackendImpl, class... Args> +std::shared_ptr new_p3m_handle(P3MParameters &&p3m, + Args &&...args) { + auto obj = std::make_shared>( + std::make_unique>(std::move(p3m)), + std::forward(args)...); + obj->p3m.template make_fft_instance>(false); + return obj; +} + +#endif // P3M diff --git a/src/core/electrostatics/p3m_gpu.cpp b/src/core/electrostatics/p3m_gpu.cpp deleted file mode 100644 index 61b5aaadbe3..00000000000 --- a/src/core/electrostatics/p3m_gpu.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include "config/config.hpp" - -#ifdef P3M -#ifdef CUDA - -#include "electrostatics/p3m_gpu.hpp" -#include "electrostatics/p3m_gpu_cuda.cuh" - -#include "actor/visitors.hpp" -#include "electrostatics/coulomb.hpp" - -#include "ParticleRange.hpp" -#include "PropagationMode.hpp" -#include "integrators/Propagation.hpp" -#include "npt.hpp" -#include "system/GpuParticleData.hpp" -#include "system/System.hpp" - -#include "communication.hpp" - -void CoulombP3MGPU::add_long_range_forces(ParticleRange const &particles) { -#ifdef NPT - if (get_system().propagation->integ_switch == INTEG_METHOD_NPT_ISO) { - auto const energy = long_range_energy(particles); - npt_add_virial_contribution(energy); - } -#else - static_cast(particles); -#endif - if (this_node == 0) { - auto &gpu = get_system().gpu; - p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor, gpu.n_particles()); - } -} - -void CoulombP3MGPU::init() { - auto &system = get_system(); - if (has_actor_of_type( - system.coulomb.impl->solver)) { - init_cpu_kernels(); - } - p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh, p3m.params.alpha, - system.box_geo->length(), system.gpu.n_particles()); -} - -void CoulombP3MGPU::init_cpu_kernels() { CoulombP3M::init(); } - -void CoulombP3MGPU::request_gpu() const { - auto &gpu_particle_data = get_system().gpu; - gpu_particle_data.enable_property(GpuParticleData::prop::force); - gpu_particle_data.enable_property(GpuParticleData::prop::q); - gpu_particle_data.enable_property(GpuParticleData::prop::pos); -} - -#endif // CUDA -#endif // P3M diff --git a/src/core/electrostatics/p3m_gpu.hpp b/src/core/electrostatics/p3m_gpu.hpp deleted file mode 100644 index 35c68170a47..00000000000 --- a/src/core/electrostatics/p3m_gpu.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. 
- * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -/** - * @file - * P3M electrostatics on GPU. - */ - -#include "config/config.hpp" - -#ifdef P3M -#ifdef CUDA - -#include "electrostatics/p3m.hpp" - -#include "ParticleRange.hpp" - -#include - -struct P3MGpuParams; - -struct CoulombP3MGPU : public CoulombP3M { - using CoulombP3M::CoulombP3M; - - void init(); - void on_activation() { - request_gpu(); - CoulombP3M::on_activation(); - if (is_tuned()) { - init_cpu_kernels(); - } - } - - void add_long_range_forces(ParticleRange const &); - void request_gpu() const; - -private: - /** - * @brief Initialize the CPU kernels. - * This operation is time-consuming and sets up data members - * that are only relevant for ELC force corrections. - */ - void init_cpu_kernels(); - std::shared_ptr m_gpu_data = nullptr; -}; - -#endif // CUDA -#endif // P3M diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index c734ba0cccd..05a607da88b 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -248,19 +248,6 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) { } } -#ifdef P3M_GPU_REAL_DOUBLE -__device__ double atomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} -#endif - namespace { __device__ inline auto linear_index_r(P3MGpuData const &p, int i, int j, int k) { @@ -376,14 +363,14 @@ __global__ void assign_charge_kernel(P3MGpuData const params, auto const c = weights[3u * offset + 3u * cao_id_x] * weights[3u * offset + 3u * threadIdx.y + 1u] * weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id]; - atomicAdd(&(charge_mesh[index]), c); + atomicAdd(&(charge_mesh[index]), REAL_TYPE(c)); } else { auto const c = Utils::bspline(static_cast(cao_id_x), m_pos[0]) * part_q[id] * Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * Utils::bspline(static_cast(threadIdx.z), m_pos[2]); - atomicAdd(&(charge_mesh[index]), c); + atomicAdd(&(charge_mesh[index]), REAL_TYPE(c)); } } @@ -473,7 +460,7 @@ __global__ void assign_forces_kernel(P3MGpuData const params, extern __shared__ float weights[]; - REAL_TYPE c; + REAL_TYPE c = -prefactor * part_q[id]; if (shared) { auto const offset = static_cast(cao) * part_in_block; if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) { @@ -483,23 +470,23 @@ __global__ void assign_forces_kernel(P3MGpuData const params, __syncthreads(); - c = -prefactor * weights[3u * offset + 3u * cao_id_x] * - weights[3u * offset + 3u * threadIdx.y + 1u] * - weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id]; + c *= REAL_TYPE(weights[3u * offset + 3u * cao_id_x] * + weights[3u * offset + 3u * 
threadIdx.y + 1u] * + weights[3u * offset + 3u * threadIdx.z + 2u]); } else { - c = -prefactor * part_q[id] * - Utils::bspline(static_cast(cao_id_x), m_pos[0]) * - Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * - Utils::bspline(static_cast(threadIdx.z), m_pos[2]); + c *= + REAL_TYPE(Utils::bspline(static_cast(cao_id_x), m_pos[0]) * + Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * + Utils::bspline(static_cast(threadIdx.z), m_pos[2])); } const REAL_TYPE *force_mesh_x = (REAL_TYPE *)params.force_mesh_x; const REAL_TYPE *force_mesh_y = (REAL_TYPE *)params.force_mesh_y; const REAL_TYPE *force_mesh_z = (REAL_TYPE *)params.force_mesh_z; - atomicAdd(&(part_f[3u * id + 0u]), c * force_mesh_x[index]); - atomicAdd(&(part_f[3u * id + 1u]), c * force_mesh_y[index]); - atomicAdd(&(part_f[3u * id + 2u]), c * force_mesh_z[index]); + atomicAdd(&(part_f[3u * id + 0u]), float(c * force_mesh_x[index])); + atomicAdd(&(part_f[3u * id + 1u]), float(c * force_mesh_y[index])); + atomicAdd(&(part_f[3u * id + 2u]), float(c * force_mesh_z[index])); } void assign_forces(P3MGpuData const ¶ms, diff --git a/src/core/electrostatics/p3m_gpu_cuda.cuh b/src/core/electrostatics/p3m_gpu_cuda.cuh index da2a7cd8202..0c20a7ee25d 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cuh +++ b/src/core/electrostatics/p3m_gpu_cuda.cuh @@ -28,7 +28,7 @@ struct P3MGpuParams; -void p3m_gpu_init(std::shared_ptr &p3m_gpu_data_ptr, int cao, +void p3m_gpu_init(std::shared_ptr &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part); void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, diff --git a/src/core/fft/fft.cpp b/src/core/fft/fft.cpp index 29997d3c897..9dad7b9aa54 100644 --- a/src/core/fft/fft.cpp +++ b/src/core/fft/fft.cpp @@ -31,11 +31,12 @@ #include #include #include +#include #include +#include #include -#include #include #include @@ -48,7 +49,6 @@ #include #include -using Utils::get_linear_index; using Utils::permute_ifield; /** @name MPI tags for FFT communication */ @@ -59,8 +59,35 @@ using Utils::permute_ifield; #define REQ_FFT_BACK 302 /**@}*/ +template +static void fft_sendrecv(T const *const sendbuf, int scount, int dest, + T *const recvbuf, int rcount, int source, + boost::mpi::communicator const &comm, int tag) { + auto const type = boost::mpi::get_mpi_datatype(*sendbuf); + MPI_Sendrecv(reinterpret_cast(sendbuf), scount, type, dest, tag, + reinterpret_cast(recvbuf), rcount, type, source, tag, + comm, MPI_STATUS_IGNORE); +} + namespace fft { +template struct fftw { + using complex = fftw_complex; + static auto constexpr plan_many_dft = fftw_plan_many_dft; + static auto constexpr destroy_plan = fftw_destroy_plan; + static auto constexpr execute_dft = fftw_execute_dft; + static auto constexpr malloc = fftw_malloc; + static auto constexpr free = fftw_free; +}; +template <> struct fftw { + using complex = fftwf_complex; + static auto constexpr plan_many_dft = fftwf_plan_many_dft; + static auto constexpr destroy_plan = fftwf_destroy_plan; + static auto constexpr execute_dft = fftwf_execute_dft; + static auto constexpr malloc = fftwf_malloc; + static auto constexpr free = fftwf_free; +}; + /** This ugly function does the bookkeeping: which nodes have to * communicate to each other, when you change the node grid. * Changing the regular decomposition requires communication. This @@ -99,7 +126,7 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2, /* comm. 
group cell index */ int gi[3]; /* position of a node in a grid */ - int p1[3], p2[3]; + Utils::Vector3i p1, p2; /* node identity */ int n; /* comm.rank() position in the communication group. */ @@ -144,8 +171,8 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2, p2[1] = (gi[1] * s2[1]) + ((i / s2[0]) % s2[1]); p2[2] = (gi[2] * s2[2]) + (i / (s2[0] * s2[1])); - n = node_list1[get_linear_index(p1[0], p1[1], p1[2], grid1)]; - node_list2[get_linear_index(p2[0], p2[1], p2[2], grid2)] = n; + n = node_list1[Utils::get_linear_index(p1, grid1)]; + node_list2[Utils::get_linear_index(p2, grid2)] = n; pos[3 * n + 0] = p2[0]; pos[3 * n + 1] = p2[1]; @@ -275,7 +302,8 @@ int calc_send_block(const int *pos1, const int *grid1, const int *pos2, * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void pack_block_permute1(double const *const in, double *const out, +template +void pack_block_permute1(FloatType const *const in, FloatType *const out, const int *start, const int *size, const int *dim, int element) { @@ -323,7 +351,8 @@ void pack_block_permute1(double const *const in, double *const out, * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void pack_block_permute2(double const *const in, double *const out, +template +void pack_block_permute2(FloatType const *const in, FloatType *const out, const int *start, const int *size, const int *dim, int element) { @@ -359,19 +388,19 @@ void pack_block_permute2(double const *const in, double *const out, * \param in input mesh. * \param out output mesh. */ -void fft_data_struct::forw_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan, - double const *in, double *out) { +template +void fft_data_struct::forw_grid_comm( + boost::mpi::communicator const &comm, fft_forw_plan const &plan, + FloatType const *in, FloatType *out) { for (std::size_t i = 0ul; i < plan.group.size(); i++) { plan.pack_function(in, send_buf.data(), &(plan.send_block[6ul * i]), &(plan.send_block[6ul * i + 3ul]), plan.old_mesh, plan.element); if (plan.group[i] != comm.rank()) { - MPI_Sendrecv(send_buf.data(), plan.send_size[i], MPI_DOUBLE, - plan.group[i], REQ_FFT_FORW, recv_buf.data(), - plan.recv_size[i], MPI_DOUBLE, plan.group[i], REQ_FFT_FORW, - comm, MPI_STATUS_IGNORE); + fft_sendrecv(send_buf.data(), plan.send_size[i], plan.group[i], + recv_buf.data(), plan.recv_size[i], plan.group[i], comm, + REQ_FFT_FORW); } else { /* Self communication... */ std::swap(send_buf, recv_buf); } @@ -388,10 +417,12 @@ void fft_data_struct::forw_grid_comm(boost::mpi::communicator const &comm, * \param in input mesh. * \param out output mesh. */ -void fft_data_struct::back_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan_f, - fft_back_plan const &plan_b, - double const *in, double *out) { +template +void fft_data_struct::back_grid_comm( + boost::mpi::communicator const &comm, + fft_forw_plan const &plan_f, + fft_back_plan const &plan_b, FloatType const *in, + FloatType *out) { /* Back means: Use the send/receive stuff from the forward plan but replace the receive blocks by the send blocks and vice versa. 
Attention then also new_mesh and old_mesh are exchanged */ @@ -402,10 +433,9 @@ void fft_data_struct::back_grid_comm(boost::mpi::communicator const &comm, plan_f.element); if (plan_f.group[i] != comm.rank()) { /* send first, receive second */ - MPI_Sendrecv(send_buf.data(), plan_f.recv_size[i], MPI_DOUBLE, - plan_f.group[i], REQ_FFT_BACK, recv_buf.data(), - plan_f.send_size[i], MPI_DOUBLE, plan_f.group[i], - REQ_FFT_BACK, comm, MPI_STATUS_IGNORE); + fft_sendrecv(send_buf.data(), plan_f.recv_size[i], plan_f.group[i], + recv_buf.data(), plan_f.send_size[i], plan_f.group[i], comm, + REQ_FFT_BACK); } else { /* Self communication... */ std::swap(send_buf, recv_buf); } @@ -466,7 +496,7 @@ int map_3don2d_grid(int const g3d[3], int g2d[3]) { } /** Calculate most square 2D grid. */ -static void calc_2d_grid(int n, int grid[3]) { +inline void calc_2d_grid(int n, int grid[3]) { grid[0] = n; grid[1] = 1; grid[2] = 1; @@ -480,21 +510,20 @@ static void calc_2d_grid(int n, int grid[3]) { } } -int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, - Utils::Vector3i const &ca_mesh_dim, - int const *ca_mesh_margin, - Utils::Vector3i const &global_mesh_dim, - Utils::Vector3d const &global_mesh_off, - int &ks_pnum, Utils::Vector3i const &grid) { +template +int fft_data_struct::initialize_fft( + boost::mpi::communicator const &comm, Utils::Vector3i const &ca_mesh_dim, + int const *ca_mesh_margin, Utils::Vector3i const &global_mesh_dim, + Utils::Vector3d const &global_mesh_off, int &ks_pnum, + Utils::Vector3i const &grid) { int n_grid[4][3]; /* The four node grids. */ int my_pos[4][3]; /* The position of comm.rank() in the node grids. */ std::vector n_id[4]; /* linear node identity lists for the node grids. */ std::vector n_pos[4]; /* positions of nodes in the node grids. 
*/ - int const rank = comm.rank(); - int node_pos[3]; - MPI_Cart_coords(comm, rank, 3, node_pos); + auto const rank = comm.rank(); + auto const node_pos = Utils::Mpi::cart_coords<3>(comm, rank); max_comm_size = 0; max_mesh_size = 0; @@ -510,10 +539,12 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, my_pos[0][i] = node_pos[i]; } for (int i = 0; i < comm.size(); i++) { - MPI_Cart_coords(comm, i, 3, &(n_pos[0][3 * i + 0])); - auto const lin_ind = get_linear_index( - n_pos[0][3 * i + 0], n_pos[0][3 * i + 1], n_pos[0][3 * i + 2], - {n_grid[0][0], n_grid[0][1], n_grid[0][2]}); + auto const n_pos_i = Utils::Mpi::cart_coords<3>(comm, i); + for (int j = 0; j < 3; ++j) { + n_pos[0][3 * i + j] = n_pos_i[j]; + } + auto const lin_ind = Utils::get_linear_index( + n_pos_i, {n_grid[0][0], n_grid[0][1], n_grid[0][2]}); n_id[0][lin_ind] = i; } @@ -639,7 +670,7 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, send_buf.resize(max_comm_size); recv_buf.resize(max_comm_size); data_buf.resize(max_mesh_size); - auto *c_data = (fftw_complex *)(data_buf.data()); + auto *c_data = (typename fftw::complex *)(data_buf.data()); /* === FFT Routines (Using FFTW / RFFTW package)=== */ for (int i = 1; i < 4; i++) { @@ -647,10 +678,10 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, forw[i].destroy_plan(); } forw[i].dir = FFTW_FORWARD; - forw[i].plan_handle = - fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, - nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1, - forw[i].new_mesh[2], forw[i].dir, FFTW_PATIENT); + forw[i].plan_handle = fftw::plan_many_dft( + 1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1, + forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2], + forw[i].dir, FFTW_PATIENT); assert(forw[i].plan_handle); } @@ -661,10 +692,10 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, back[i].destroy_plan(); } back[i].dir = FFTW_BACKWARD; - back[i].plan_handle = - fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, - nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1, - forw[i].new_mesh[2], back[i].dir, FFTW_PATIENT); + back[i].plan_handle = fftw::plan_many_dft( + 1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1, + forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2], + back[i].dir, FFTW_PATIENT); back[i].pack_function = pack_block_permute1; assert(back[i].plan_handle); } @@ -679,59 +710,61 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, return max_mesh_size; } -void fft_data_struct::forward_fft(boost::mpi::communicator const &comm, - double *data) { +template +void fft_data_struct::forward_fft( + boost::mpi::communicator const &comm, FloatType *data) { /* ===== first direction ===== */ - auto *c_data = (fftw_complex *)data; - auto *c_data_buf = (fftw_complex *)data_buf.data(); + auto *c_data = (typename fftw::complex *)data; + auto *c_data_buf = (typename fftw::complex *)data_buf.data(); /* communication to current dir row format (in is data) */ forw_grid_comm(comm, forw[1], data, data_buf.data()); /* complexify the real data array (in is data_buf) */ for (int i = 0; i < forw[1].new_size; i++) { - data[2 * i + 0] = data_buf[i]; /* real value */ - data[2 * i + 1] = 0; /* complex value */ + data[2 * i + 0] = data_buf[i]; /* real value */ + data[2 * i + 1] = FloatType(0); /* complex value */ } /* perform FFT (in/out is data)*/ - fftw_execute_dft(forw[1].plan_handle, c_data, c_data); + 
fftw::execute_dft(forw[1].plan_handle, c_data, c_data); /* ===== second direction ===== */ /* communication to current dir row format (in is data) */ forw_grid_comm(comm, forw[2], data, data_buf.data()); /* perform FFT (in/out is data_buf) */ - fftw_execute_dft(forw[2].plan_handle, c_data_buf, c_data_buf); + fftw::execute_dft(forw[2].plan_handle, c_data_buf, c_data_buf); /* ===== third direction ===== */ /* communication to current dir row format (in is data_buf) */ forw_grid_comm(comm, forw[3], data_buf.data(), data); /* perform FFT (in/out is data)*/ - fftw_execute_dft(forw[3].plan_handle, c_data, c_data); + fftw::execute_dft(forw[3].plan_handle, c_data, c_data); /* REMARK: Result has to be in data. */ } -void fft_data_struct::backward_fft(boost::mpi::communicator const &comm, - double *data, bool check_complex) { +template +void fft_data_struct::backward_fft( + boost::mpi::communicator const &comm, FloatType *data, bool check_complex) { - auto *c_data = (fftw_complex *)data; - auto *c_data_buf = (fftw_complex *)data_buf.data(); + auto *c_data = (typename fftw::complex *)data; + auto *c_data_buf = (typename fftw::complex *)data_buf.data(); /* ===== third direction ===== */ /* perform FFT (in is data) */ - fftw_execute_dft(back[3].plan_handle, c_data, c_data); + fftw::execute_dft(back[3].plan_handle, c_data, c_data); /* communicate (in is data)*/ back_grid_comm(comm, forw[3], back[3], data, data_buf.data()); /* ===== second direction ===== */ /* perform FFT (in is data_buf) */ - fftw_execute_dft(back[2].plan_handle, c_data_buf, c_data_buf); + fftw::execute_dft(back[2].plan_handle, c_data_buf, c_data_buf); /* communicate (in is data_buf) */ back_grid_comm(comm, forw[2], back[2], data_buf.data(), data); /* ===== first direction ===== */ /* perform FFT (in is data) */ - fftw_execute_dft(back[1].plan_handle, c_data, c_data); + fftw::execute_dft(back[1].plan_handle, c_data, c_data); /* throw away the (hopefully) empty complex component (in is data) */ for (int i = 0; i < forw[1].new_size; i++) { data_buf[i] = data[2 * i]; /* real value */ @@ -749,17 +782,19 @@ void fft_data_struct::backward_fft(boost::mpi::communicator const &comm, /* REMARK: Result has to be in data. 
*/ } -void fft_pack_block(double const *const in, double *const out, +template +void fft_pack_block(FloatType const *const in, FloatType *const out, int const start[3], int const size[3], int const dim[3], int element) { - auto const copy_size = element * size[2] * static_cast(sizeof(double)); + auto const copy_size = + static_cast(element * size[2]) * sizeof(FloatType); /* offsets for indices in input grid */ auto const m_in_offset = element * dim[2]; auto const s_in_offset = element * (dim[2] * (dim[1] - size[1])); /* offsets for indices in output grid */ auto const m_out_offset = element * size[2]; - /* linear index of in grid, linear index of out grid */ + /* linear index of input grid, linear index of output grid */ int li_in = element * (start[2] + dim[2] * (start[1] + dim[1] * start[0])); int li_out = 0; @@ -773,11 +808,13 @@ void fft_pack_block(double const *const in, double *const out, } } -void fft_unpack_block(double const *const in, double *const out, +template +void fft_unpack_block(FloatType const *const in, FloatType *const out, int const start[3], int const size[3], int const dim[3], int element) { - auto const copy_size = element * size[2] * static_cast(sizeof(double)); + auto const copy_size = + static_cast(element * size[2]) * sizeof(FloatType); /* offsets for indices in output grid */ auto const m_out_offset = element * dim[2]; auto const s_out_offset = element * (dim[2] * (dim[1] - size[1])); @@ -797,15 +834,60 @@ void fft_unpack_block(double const *const in, double *const out, } } -void fft_plan::destroy_plan() { +template void fft_plan::destroy_plan() { if (plan_handle) { - fftw_destroy_plan(plan_handle); + fftw::destroy_plan(plan_handle); plan_handle = nullptr; } } -namespace detail { -void fft_free(void *const p) { ::fftw_free(p); } -void *fft_malloc(std::size_t length) { return ::fftw_malloc(length); } -} // namespace detail +template +FloatType *allocator::allocate(const std::size_t n) const { + if (n == 0) { + return nullptr; + } + if (n > std::numeric_limits::max() / sizeof(FloatType)) { + throw std::bad_array_new_length(); + } + void *const pv = fftw::malloc(n * sizeof(FloatType)); + if (!pv) { + throw std::bad_alloc(); + } + return static_cast(pv); +} + +template +void allocator::deallocate(FloatType *const p, + std::size_t) const noexcept { + fftw::free(static_cast(p)); +} + +template struct allocator; +template struct allocator; + +template struct fft_plan; +template struct fft_plan; + +template struct fft_forw_plan; +template struct fft_forw_plan; + +template struct fft_back_plan; +template struct fft_back_plan; + +template struct fft_data_struct; +template struct fft_data_struct; + +template void fft_pack_block(float const *in, float *out, + int const start[3], int const size[3], + int const dim[3], int element); +template void fft_pack_block(double const *in, double *out, + int const start[3], int const size[3], + int const dim[3], int element); +template void fft_unpack_block(float const *in, float *out, + int const start[3], int const size[3], + int const dim[3], int element); +template void fft_unpack_block(double const *in, double *out, + int const start[3], int const size[3], + int const dim[3], int element); + } // namespace fft diff --git a/src/core/fft/fft.hpp b/src/core/fft/fft.hpp index 8747448a371..3ccc7d6dcdf 100644 --- a/src/core/fft/fft.hpp +++ b/src/core/fft/fft.hpp @@ -45,10 +45,12 @@ #include #include #include +#include #include #include struct fftw_plan_s; +struct fftwf_plan_s; namespace boost::mpi { class environment; class 
communicator; @@ -56,8 +58,9 @@ class communicator; namespace fft { -struct fft_plan { - using fftw_plan = fftw_plan_s *; +template struct fft_plan { + using fftw_plan = std::conditional_t, + fftw_plan_s *, fftwf_plan_s *>; ~fft_plan() { destroy_plan(); } @@ -66,13 +69,14 @@ struct fft_plan { /** plan for the FFT. */ fftw_plan plan_handle = nullptr; /** packing function for send blocks. */ - void (*pack_function)(double const *const, double *const, int const *, + void (*pack_function)(FloatType const *const, FloatType *const, int const *, int const *, int const *, int); void destroy_plan(); }; /** @brief Plan for a forward 1D FFT of a flattened 3D array. */ -struct fft_forw_plan : public fft_plan { +template +struct fft_forw_plan : public fft_plan { /** row direction of that FFT. */ int row_dir; /** permutations from normal coordinate system. */ @@ -100,12 +104,13 @@ struct fft_forw_plan : public fft_plan { std::vector recv_block; /** Recv block communication sizes. */ std::vector recv_size; - /** size of send block elements. */ + /** size of send block elements, i.e. 1 for real, 2 for complex. */ int element; }; /** @brief Plan for a backward 1D FFT of a flattened 3D array. */ -struct fft_back_plan : public fft_plan {}; +template +struct fft_back_plan : public fft_plan {}; /** * @brief Information about the three one dimensional FFTs and how the nodes @@ -114,7 +119,7 @@ struct fft_back_plan : public fft_plan {}; * @note FFT numbering starts with 1 for technical reasons (because we have 4 * node grids, the index 0 is used for the real space charge assignment grid). */ -struct fft_data_struct { +template struct fft_data_struct { private: /** * @brief Handle to the MPI environment. @@ -125,9 +130,9 @@ struct fft_data_struct { std::shared_ptr m_mpi_env; /** Information for forward FFTs. */ - std::array forw; + std::array, 4u> forw; /** Information for backward FFTs. */ - std::array back; + std::array, 4u> back; /** Whether FFT is initialized or not. */ bool init_tag = false; @@ -139,11 +144,11 @@ struct fft_data_struct { int max_mesh_size = 0; /** send buffer. */ - std::vector send_buf; + std::vector send_buf; /** receive buffer. */ - std::vector recv_buf; + std::vector recv_buf; /** Buffer for receive data. */ - fft::vector data_buf; + fft::vector data_buf; public: explicit fft_data_struct(decltype(m_mpi_env) mpi_env) @@ -151,8 +156,8 @@ struct fft_data_struct { // disable copy construction: unsafe because we store raw pointers // to FFT plans (avoids double-free and use-after-free) - fft_data_struct &operator=(fft_data_struct const &) = delete; - fft_data_struct(fft_data_struct const &) = delete; + fft_data_struct &operator=(fft_data_struct const &) = delete; + fft_data_struct(fft_data_struct const &) = delete; /** Initialize everything connected to the 3D-FFT. * @@ -177,7 +182,7 @@ struct fft_data_struct { * \param[in,out] data Mesh. * \param[in] comm MPI communicator */ - void forward_fft(boost::mpi::communicator const &comm, double *data); + void forward_fft(boost::mpi::communicator const &comm, FloatType *data); /** Perform an in-place backward 3D FFT. * \warning The content of \a data is overwritten. @@ -186,7 +191,7 @@ struct fft_data_struct { * non-zero. * \param[in] comm MPI communicator. 
*/ - void backward_fft(boost::mpi::communicator const &comm, double *data, + void backward_fft(boost::mpi::communicator const &comm, FloatType *data, bool check_complex); auto get_mesh_size() const { return forw[3u].new_mesh; } @@ -195,10 +200,12 @@ struct fft_data_struct { private: void forw_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan, double const *in, double *out); + fft_forw_plan const &plan, FloatType const *in, + FloatType *out); void back_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan_f, fft_back_plan const &plan_b, - double const *in, double *out); + fft_forw_plan const &plan_f, + fft_back_plan const &plan_b, + FloatType const *in, FloatType *out); }; /** Pack a block (size[3] starting at start[3]) of an input @@ -218,7 +225,8 @@ struct fft_data_struct { * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void fft_pack_block(double const *in, double *out, int const start[3], +template +void fft_pack_block(FloatType const *in, FloatType *out, int const start[3], int const size[3], int const dim[3], int element); /** Unpack a 3d-grid input block (size[3]) into an output 3d-grid @@ -233,7 +241,8 @@ void fft_pack_block(double const *in, double *out, int const start[3], * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void fft_unpack_block(double const *in, double *out, int const start[3], +template +void fft_unpack_block(FloatType const *in, FloatType *out, int const start[3], int const size[3], int const dim[3], int element); int map_3don2d_grid(int const g3d[3], int g2d[3]); diff --git a/src/core/fft/vector.hpp b/src/core/fft/vector.hpp index e74df7a3de0..10cc268b302 100644 --- a/src/core/fft/vector.hpp +++ b/src/core/fft/vector.hpp @@ -24,43 +24,29 @@ #include #include #include +#include #include namespace fft { -namespace detail { -void fft_free(void *p); -void *fft_malloc(std::size_t length); -} // namespace detail /** @brief Aligned allocator for FFT data. 
*/ -template struct allocator { - typedef T value_type; +template struct allocator { + static_assert(std::is_same_v or + std::is_same_v, + "FFTW only implements float and double"); + typedef FloatType value_type; allocator() noexcept = default; // default ctor not required - template explicit allocator(const allocator &) {} - template bool operator==(const allocator &) const { + template explicit allocator(allocator const &) {} + template bool operator==(allocator const &) const { return true; } - template bool operator!=(const allocator &) const { + template bool operator!=(allocator const &) const { return false; } - T *allocate(const std::size_t n) const { - if (n == 0) { - return nullptr; - } - if (n > std::numeric_limits::max() / sizeof(T)) { - throw std::bad_array_new_length(); - } - void *const pv = detail::fft_malloc(n * sizeof(T)); - if (!pv) { - throw std::bad_alloc(); - } - return static_cast(pv); - } + FloatType *allocate(std::size_t n) const; - void deallocate(T *const p, std::size_t) const noexcept { - detail::fft_free(static_cast(p)); - } + void deallocate(FloatType *p, std::size_t) const noexcept; }; template using vector = std::vector>; diff --git a/src/core/forces.cpp b/src/core/forces.cpp index d669b87deaf..b9881e166c0 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -35,7 +35,6 @@ #include "communication.hpp" #include "constraints/Constraints.hpp" #include "electrostatics/icc.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "forces_inline.hpp" #include "galilei/ComFixed.hpp" #include "immersed_boundary/ImmersedBoundaries.hpp" diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp index b3ab8871fab..68e409b316c 100644 --- a/src/core/magnetostatics/dp3m.cpp +++ b/src/core/magnetostatics/dp3m.cpp @@ -30,6 +30,7 @@ #ifdef DP3M #include "magnetostatics/dp3m.hpp" +#include "magnetostatics/dp3m.impl.hpp" #include "fft/fft.hpp" #include "p3m/TuningAlgorithm.hpp" @@ -38,7 +39,6 @@ #include "p3m/influence_function_dipolar.hpp" #include "p3m/interpolation.hpp" #include "p3m/math.hpp" -#include "p3m/send_mesh.hpp" #include "BoxGeometry.hpp" #include "LocalBox.hpp" @@ -76,7 +76,8 @@ #include #include -void DipolarP3M::count_magnetic_particles() { +template +void DipolarP3MImpl::count_magnetic_particles() { int local_n = 0; double local_mu2 = 0.; @@ -105,7 +106,10 @@ double dp3m_rtbisection(double box_size, double r_cut_iL, int n_c_part, double sum_q2, double x1, double x2, double xacc, double tuned_accuracy); -double DipolarP3M::calc_average_self_energy_k_space() const { +template +double +DipolarP3MImpl::calc_average_self_energy_k_space() + const { auto const &box_geo = *get_system().box_geo; auto const node_phi = grid_influence_function_self_energy( dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy); @@ -116,7 +120,8 @@ double DipolarP3M::calc_average_self_energy_k_space() const { return phi * std::numbers::pi; } -void DipolarP3M::init() { +template +void DipolarP3MImpl::init_cpu_kernels() { assert(dp3m.params.mesh >= Utils::Vector3i::broadcast(1)); assert(dp3m.params.cao >= 1 and dp3m.params.cao <= 7); assert(dp3m.params.alpha > 0.); @@ -131,6 +136,7 @@ void DipolarP3M::init() { assert(dp3m.fft); dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.); + dp3m.fft->init_halo(); dp3m.fft->init_fft(); dp3m.calc_differential_operator(); @@ -140,44 +146,29 @@ void DipolarP3M::init() { count_magnetic_particles(); } -DipolarP3M::DipolarP3M(P3MParameters &¶meters, double prefactor, - int tune_timings, bool 
tune_verbose) - : dp3m{std::move(parameters)}, tune_timings{tune_timings}, - tune_verbose{tune_verbose} { - - set_prefactor(prefactor); - m_is_tuned = !dp3m.params.tuning; - dp3m.params.tuning = false; - - if (tune_timings <= 0) { - throw std::domain_error("Parameter 'timings' must be > 0"); - } - - if (dp3m.params.mesh != Utils::Vector3i::broadcast(dp3m.params.mesh[0])) { - throw std::domain_error("DipolarP3M requires a cubic mesh"); - } -} - namespace { template struct AssignDipole { - void operator()(decltype(DipolarP3M::dp3m) &dp3m, - Utils::Vector3d const &real_pos, + void operator()(auto &dp3m, Utils::Vector3d const &real_pos, Utils::Vector3d const &dip) const { + using value_type = + typename std::remove_reference_t::value_type; auto const weights = p3m_calculate_interpolation_weights( real_pos, dp3m.params.ai, dp3m.local_mesh); - p3m_interpolate(dp3m.local_mesh, weights, - [&dip, &dp3m](int ind, double w) { - dp3m.mesh.rs_fields[0][ind] += w * dip[0]; - dp3m.mesh.rs_fields[1][ind] += w * dip[1]; - dp3m.mesh.rs_fields[2][ind] += w * dip[2]; - }); - - dp3m.inter_weights.store(weights); + p3m_interpolate( + dp3m.local_mesh, weights, [&dip, &dp3m](int ind, double w) { + dp3m.mesh.rs_fields[0u][ind] += value_type(w * dip[0u]); + dp3m.mesh.rs_fields[1u][ind] += value_type(w * dip[1u]); + dp3m.mesh.rs_fields[2u][ind] += value_type(w * dip[2u]); + }); + + dp3m.inter_weights.template store(weights); } }; } // namespace -void DipolarP3M::dipole_assign(ParticleRange const &particles) { +template +void DipolarP3MImpl::dipole_assign( + ParticleRange const &particles) { dp3m.inter_weights.reset(dp3m.params.cao); /* prepare local FFT mesh */ @@ -195,7 +186,7 @@ void DipolarP3M::dipole_assign(ParticleRange const &particles) { namespace { template struct AssignTorques { - void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs, + void operator()(auto &dp3m, double prefac, int d_rs, ParticleRange const &particles) const { /* magnetic particle index */ @@ -203,12 +194,12 @@ template struct AssignTorques { for (auto &p : particles) { if (p.dipm() != 0.) { - auto const w = dp3m.inter_weights.load(p_index); + auto const weights = dp3m.inter_weights.template load(p_index); Utils::Vector3d E{}; - p3m_interpolate(dp3m.local_mesh, w, + p3m_interpolate(dp3m.local_mesh, weights, [&E, &dp3m, d_rs](int ind, double w) { - E[d_rs] += w * dp3m.mesh.rs_scalar[ind]; + E[d_rs] += w * double(dp3m.mesh.rs_scalar[ind]); }); p.torque() -= vector_product(p.calc_dip(), prefac * E); @@ -219,7 +210,7 @@ template struct AssignTorques { }; template struct AssignForces { - void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs, + void operator()(auto &dp3m, double prefac, int d_rs, ParticleRange const &particles) const { /* magnetic particle index */ @@ -227,14 +218,15 @@ template struct AssignForces { for (auto &p : particles) { if (p.dipm() != 0.) 
{ - auto const w = dp3m.inter_weights.load(p_index); + auto const weights = dp3m.inter_weights.template load(p_index); Utils::Vector3d E{}; - p3m_interpolate(dp3m.local_mesh, w, [&E, &dp3m](int ind, double w) { - E[0] += w * dp3m.mesh.rs_fields[0][ind]; - E[1] += w * dp3m.mesh.rs_fields[1][ind]; - E[2] += w * dp3m.mesh.rs_fields[2][ind]; - }); + p3m_interpolate(dp3m.local_mesh, weights, + [&E, &dp3m](int ind, double w) { + E[0u] += w * double(dp3m.mesh.rs_fields[0u][ind]); + E[1u] += w * double(dp3m.mesh.rs_fields[1u][ind]); + E[2u] += w * double(dp3m.mesh.rs_fields[2u][ind]); + }); p.force()[d_rs] += p.calc_dip() * prefac * E; ++p_index; @@ -244,8 +236,9 @@ template struct AssignForces { }; } // namespace -double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles) { +template +double DipolarP3MImpl::long_range_kernel( + bool force_flag, bool energy_flag, ParticleRange const &particles) { /* k-space energy */ double energy = 0.; auto const &system = get_system(); @@ -260,7 +253,8 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, if (dp3m.sum_mu2 > 0.) { dipole_assign(particles); - dp3m.fft->perform_fwd_fft(); + dp3m.fft->perform_vector_halo_gather(); + dp3m.fft->perform_vector_fwd_fft(); } /* === k-space energy calculation === */ @@ -284,14 +278,14 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for_each_3d(mesh_start, dp3m.mesh.size, indices, [&]() { auto const shift = indices + offset; // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im)); std::advance(it_energy, 1); @@ -336,15 +330,15 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for_each_3d(mesh_start, mesh_stop, indices, [&]() { auto const shift = indices + offset; // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); dp3m.mesh.ks_scalar[index] = *it_energy * re; ++index; // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); dp3m.mesh.ks_scalar[index] = *it_energy * im; ++index; std::advance(it_energy, 1); @@ -354,13 +348,14 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for (int d = 0; d < 3; d++) { index = 0u; for_each_3d(mesh_start, mesh_stop, indices, [&]() { - auto const pos = indices[d] + offset[d]; - dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index]; + 
auto const d_op_val = FloatType(d_op[indices[d] + offset[d]]); + dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.mesh.ks_scalar[index]; ++index; - dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index]; + dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.mesh.ks_scalar[index]; ++index; }); - dp3m.fft->perform_space_back_fft(); + dp3m.fft->perform_scalar_back_fft(); + dp3m.fft->perform_scalar_halo_spread(); /* Assign force component from mesh to particle */ auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3; Utils::integral_parameter( @@ -380,14 +375,14 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for_each_3d(mesh_start, mesh_stop, indices, [&]() { auto const shift = indices + offset; // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; dp3m.mesh.ks_scalar[index - 2] = *it_force * im; dp3m.mesh.ks_scalar[index - 1] = *it_force * (-re); @@ -398,21 +393,21 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for (int d = 0; d < 3; d++) { index = 0u; for_each_3d(mesh_start, mesh_stop, indices, [&]() { + auto const d_op_val = FloatType(d_op[indices[d] + offset[d]]); auto const shift = indices + offset; - auto const f1 = - d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index]; - mesh_dip[0u][index] = d_op[shift[KX]] * f1; - mesh_dip[1u][index] = d_op[shift[KY]] * f1; - mesh_dip[2u][index] = d_op[shift[KZ]] * f1; + auto const f1 = d_op_val * dp3m.mesh.ks_scalar[index]; + mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f1; + mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f1; + mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f1; ++index; - auto const f2 = - d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index]; - mesh_dip[0u][index] = d_op[shift[KX]] * f2; - mesh_dip[1u][index] = d_op[shift[KY]] * f2; - mesh_dip[2u][index] = d_op[shift[KZ]] * f2; + auto const f2 = d_op_val * dp3m.mesh.ks_scalar[index]; + mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f2; + mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f2; + mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f2; ++index; }); - dp3m.fft->perform_field_back_fft(); + dp3m.fft->perform_vector_back_fft(); + dp3m.fft->perform_vector_halo_spread(); /* Assign force component from mesh to particle */ auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3; Utils::integral_parameter( @@ -507,20 +502,23 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag, return energy; } -void DipolarP3M::calc_influence_function_force() { - dp3m.g_force = - grid_influence_function<3>(dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, - get_system().box_geo->length_inv()); +template +void DipolarP3MImpl::calc_influence_function_force() { + dp3m.g_force = grid_influence_function( + dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, + get_system().box_geo->length_inv()); } -void DipolarP3M::calc_influence_function_energy() { - dp3m.g_energy = - grid_influence_function<2>(dp3m.params, 
dp3m.mesh.start, dp3m.mesh.stop, - get_system().box_geo->length_inv()); +template +void DipolarP3MImpl::calc_influence_function_energy() { + dp3m.g_energy = grid_influence_function( + dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, + get_system().box_geo->length_inv()); } +template class DipolarTuningAlgorithm : public TuningAlgorithm { - decltype(DipolarP3M::dp3m) &dp3m; + p3m_data_struct_dipoles &dp3m; int m_mesh_max = -1, m_mesh_min = -1; public: @@ -632,7 +630,8 @@ class DipolarTuningAlgorithm : public TuningAlgorithm { } }; -void DipolarP3M::tune() { +template +void DipolarP3MImpl::tune() { auto &system = get_system(); auto const &box_geo = *system.box_geo; if (dp3m.params.alpha_L == 0. and dp3m.params.alpha != 0.) { @@ -648,7 +647,8 @@ void DipolarP3M::tune() { "DipolarP3M: no dipolar particles in the system"); } try { - DipolarTuningAlgorithm parameters(system, dp3m, prefactor, tune_timings); + DipolarTuningAlgorithm parameters( + system, dp3m, prefactor, tune_timings); parameters.setup_logger(tune_verbose); // parameter ranges parameters.determine_mesh_limits(); @@ -871,10 +871,10 @@ void DipolarP3M::scaleby_box_l() { sanity_checks_boxl(); calc_influence_function_force(); calc_influence_function_energy(); - dp3m.energy_correction = 0.0; } -void DipolarP3M::calc_energy_correction() { +template +void DipolarP3MImpl::calc_energy_correction() { auto const &box_geo = *get_system().box_geo; auto const Ukp3m = calc_average_self_energy_k_space() * box_geo.volume(); auto const Ewald_volume = Utils::int_pow<3>(dp3m.params.alpha_L); @@ -889,4 +889,7 @@ void npt_add_virial_magnetic_contribution(double energy) { } #endif // NPT +template struct DipolarP3MImpl; +template struct DipolarP3MImpl; + #endif // DP3M diff --git a/src/core/magnetostatics/dp3m.hpp b/src/core/magnetostatics/dp3m.hpp index 0bdf4f05ec2..43381972e90 100644 --- a/src/core/magnetostatics/dp3m.hpp +++ b/src/core/magnetostatics/dp3m.hpp @@ -39,20 +39,17 @@ #include "p3m/common.hpp" #include "p3m/data_struct.hpp" -#include "p3m/interpolation.hpp" -#include "p3m/send_mesh.hpp" #include "Particle.hpp" #include "ParticleRange.hpp" #include #include +#include -#include #include #include -#include -#include +#include #ifdef NPT /** Update the NpT virial */ @@ -61,38 +58,35 @@ void npt_add_virial_magnetic_contribution(double energy); /** @brief Dipolar P3M solver. */ struct DipolarP3M : public Dipoles::Actor { - struct p3m_data_struct_impl : public p3m_data_struct { - explicit p3m_data_struct_impl(P3MParameters &¶meters) - : p3m_data_struct{std::move(parameters)} {} + /** @brief Dipolar P3M parameters. */ + p3m_data_struct &dp3m; - /** number of dipolar particles (only on head node). */ - int sum_dip_part = 0; - /** Sum of square of magnetic dipoles (only on head node). */ - double sum_mu2 = 0.; - - /** position shift for calculation of first assignment mesh point. */ - double pos_shift = 0.; - - p3m_interpolation_cache inter_weights; - - /** cached k-space self-energy correction */ - double energy_correction = 0.; - }; - - /** Dipolar P3M parameters. */ - p3m_data_struct_impl dp3m; - - /** Magnetostatics prefactor. 
*/ int tune_timings; bool tune_verbose; - DipolarP3M(P3MParameters &¶meters, double prefactor, int tune_timings, - bool tune_verbose); +protected: + bool m_is_tuned; - void on_activation() { - sanity_checks(); - tune(); +public: + DipolarP3M(p3m_data_struct &dp3m_data, double prefactor, int tune_timings, + bool tune_verbose) + : dp3m{dp3m_data}, tune_timings{tune_timings}, + tune_verbose{tune_verbose} { + + if (tune_timings <= 0) { + throw std::domain_error("Parameter 'timings' must be > 0"); + } + if (dp3m.params.mesh != Utils::Vector3i::broadcast(dp3m.params.mesh[0])) { + throw std::domain_error("DipolarP3M requires a cubic mesh"); + } + m_is_tuned = not dp3m.params.tuning; + dp3m.params.tuning = false; + set_prefactor(prefactor); } + + virtual ~DipolarP3M() = default; + + virtual void on_activation() = 0; /** @brief Recalculate all box-length-dependent parameters. */ void on_boxl_change() { scaleby_box_l(); } void on_node_grid_change() const { sanity_checks_node_grid(); } @@ -102,7 +96,7 @@ struct DipolarP3M : public Dipoles::Actor { init(); } /** @brief Recalculate all derived parameters. */ - void init(); + virtual void init() = 0; void sanity_checks() const { sanity_checks_boxl(); @@ -115,10 +109,10 @@ struct DipolarP3M : public Dipoles::Actor { * @brief Count the number of magnetic particles and calculate * the sum of the squared dipole moments. */ - void count_magnetic_particles(); + virtual void count_magnetic_particles() = 0; /** Assign the physical dipoles using the tabulated assignment function. */ - void dipole_assign(ParticleRange const &particles); + virtual void dipole_assign(ParticleRange const &particles) = 0; /** * @brief Tune dipolar P3M parameters to desired accuracy. @@ -152,22 +146,14 @@ struct DipolarP3M : public Dipoles::Actor { * The function is based on routines of the program HE_Q.cpp for charges * written by M. Deserno. */ - void tune(); + virtual void tune() = 0; bool is_tuned() const { return m_is_tuned; } /** Compute the k-space part of energies. */ - double long_range_energy(ParticleRange const &particles) { - return long_range_kernel(false, true, particles); - } + virtual double long_range_energy(ParticleRange const &particles) = 0; /** Compute the k-space part of forces. */ - void add_long_range_forces(ParticleRange const &particles) { - long_range_kernel(true, false, particles); - } - - /** Compute the k-space part of forces and energies. */ - double long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles); + virtual void add_long_range_forces(ParticleRange const &particles) = 0; /** Calculate real-space contribution of p3m dipolar pair forces and torques. * If NPT is compiled in, update the NpT virial. @@ -266,22 +252,20 @@ struct DipolarP3M : public Dipoles::Actor { return prefactor * (mimj * B_r - mir * mjr * C_r); } -private: - bool m_is_tuned; - +protected: /** Calculate self-energy in k-space. */ - double calc_average_self_energy_k_space() const; + virtual double calc_average_self_energy_k_space() const = 0; /** Calculate energy correction that minimizes the error. * This quantity is only calculated once and is cached. */ - void calc_energy_correction(); + virtual void calc_energy_correction() = 0; /** Calculate the influence function for the dipolar forces and torques. */ - void calc_influence_function_force(); + virtual void calc_influence_function_force() = 0; /** Calculate the influence function for the dipolar energy. 
*/ - void calc_influence_function_energy(); + virtual void calc_influence_function_energy() = 0; /** Compute the dipolar surface terms */ double calc_surface_term(bool force_flag, bool energy_flag, @@ -293,7 +277,7 @@ struct DipolarP3M : public Dipoles::Actor { void sanity_checks_periodicity() const; void sanity_checks_cell_structure() const; - void scaleby_box_l(); + virtual void scaleby_box_l(); }; #endif // DP3M diff --git a/src/core/magnetostatics/dp3m.impl.hpp b/src/core/magnetostatics/dp3m.impl.hpp new file mode 100644 index 00000000000..108b79d21c8 --- /dev/null +++ b/src/core/magnetostatics/dp3m.impl.hpp @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2010-2024 The ESPResSo project + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 + * Max-Planck-Institute for Polymer Research, Theory Group + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** @file + * P3M algorithm for long-range magnetic dipole-dipole interaction. + * + * We use here a P3M (Particle-Particle Particle-Mesh) method based + * on the dipolar Ewald summation. Details of the used method can be + * found in @cite hockney88a and @cite deserno98a @cite deserno98b. + * + * Further reading: @cite cerda08d + */ + +#pragma once + +#include "config/config.hpp" + +#ifdef DP3M + +#include "magnetostatics/dp3m.hpp" + +#include "p3m/common.hpp" +#include "p3m/data_struct.hpp" +#include "p3m/interpolation.hpp" + +#include "Particle.hpp" +#include "ParticleRange.hpp" + +#include +#include + +#include +#include + +template +struct p3m_data_struct_dipoles : public p3m_data_struct_fft { + using p3m_data_struct_fft::p3m_data_struct_fft; + + /** number of dipolar particles (only on head node). */ + int sum_dip_part = 0; + /** Sum of square of magnetic dipoles (only on head node). */ + double sum_mu2 = 0.; + + /** position shift for calculation of first assignment mesh point. */ + double pos_shift = 0.; + + /** cached k-space self-energy correction */ + double energy_correction = 0.; + + p3m_interpolation_cache inter_weights; +}; + +template +struct DipolarP3MImpl : public DipolarP3M { + ~DipolarP3MImpl() override = default; + + /** @brief Coulomb P3M meshes and FFT algorithm. 
*/ + std::unique_ptr> dp3m_impl; + // this member overshadows its own base class pointer in the parent class + p3m_data_struct_dipoles &dp3m; + + template + DipolarP3MImpl( + std::unique_ptr> &&dp3m_handle, + Args &&...args) + : DipolarP3M(*dp3m_handle, std::forward(args)...), + dp3m_impl{std::move(dp3m_handle)}, dp3m{*dp3m_impl} {} + + void init() override { + if constexpr (Architecture == Arch::CPU) { + init_cpu_kernels(); + } + } + void tune() override; + void count_magnetic_particles() override; + + void on_activation() override { + sanity_checks(); + tune(); + } + + double long_range_energy(ParticleRange const &particles) override { + return long_range_kernel(false, true, particles); + } + + void add_long_range_forces(ParticleRange const &particles) override { + if constexpr (Architecture == Arch::CPU) { + long_range_kernel(true, false, particles); + } + } + + void dipole_assign(ParticleRange const &particles) override; + +protected: + /** Compute the k-space part of forces and energies. */ + double long_range_kernel(bool force_flag, bool energy_flag, + ParticleRange const &particles); + double calc_average_self_energy_k_space() const override; + void calc_energy_correction() override; + void calc_influence_function_force() override; + void calc_influence_function_energy() override; + void init_cpu_kernels(); + void scaleby_box_l() override { + DipolarP3M::scaleby_box_l(); + dp3m.energy_correction = 0.; + } +}; + +template class FFTBackendImpl, class... Args> +std::shared_ptr new_dp3m_handle(P3MParameters &&p3m, + Args &&...args) { + auto obj = std::make_shared>( + std::make_unique>(std::move(p3m)), + std::forward(args)...); + obj->dp3m.template make_fft_instance>(true); + return obj; +} + +#endif // DP3M diff --git a/src/core/p3m/FFTBackendLegacy.cpp b/src/core/p3m/FFTBackendLegacy.cpp index 146c241de19..c00f3440c84 100644 --- a/src/core/p3m/FFTBackendLegacy.cpp +++ b/src/core/p3m/FFTBackendLegacy.cpp @@ -36,14 +36,18 @@ #include #include -FFTBackendLegacy::FFTBackendLegacy(p3m_data_struct &obj, bool dipolar) - : FFTBackend(obj), dipolar{dipolar}, - fft{std::make_unique( +template +FFTBackendLegacy::FFTBackendLegacy( + p3m_data_struct_fft &obj, bool dipolar) + : FFTBackend(obj), dipolar{dipolar}, + fft{std::make_unique>( ::Communication::mpiCallbacksHandle()->share_mpi_env())} {} -FFTBackendLegacy::~FFTBackendLegacy() = default; +template +FFTBackendLegacy::~FFTBackendLegacy() = default; -void FFTBackendLegacy::update_mesh_data() { +template +void FFTBackendLegacy::update_mesh_data() { auto const mesh_size_ptr = fft->get_mesh_size(); auto const mesh_start_ptr = fft->get_mesh_start(); for (auto i = 0u; i < 3u; ++i) { @@ -58,8 +62,11 @@ void FFTBackendLegacy::update_mesh_data() { } } -void FFTBackendLegacy::init_fft() { +template void FFTBackendLegacy::init_halo() { mesh_comm.resize(::comm_cart, local_mesh); +} + +template void FFTBackendLegacy::init_fft() { auto ca_mesh_size = fft->initialize_fft( ::comm_cart, local_mesh.dim, local_mesh.margin, params.mesh, params.mesh_off, mesh.ks_pnum, ::communicator.node_grid); @@ -73,40 +80,60 @@ void FFTBackendLegacy::init_fft() { update_mesh_data(); } -void FFTBackendLegacy::perform_field_back_fft() { - /* Back FFT force component mesh */ +template +void FFTBackendLegacy::perform_vector_back_fft() { for (auto &rs_mesh_field : rs_mesh_fields) { fft->backward_fft(::comm_cart, rs_mesh_field.data(), check_complex_residuals); } - /* redistribute force component mesh */ - std::array meshes = {{rs_mesh_fields[0u].data(), - rs_mesh_fields[1u].data(), - 
rs_mesh_fields[2u].data()}}; +} + +template +void FFTBackendLegacy::perform_vector_halo_spread() { + std::array meshes = {{rs_mesh_fields[0u].data(), + rs_mesh_fields[1u].data(), + rs_mesh_fields[2u].data()}}; mesh_comm.spread_grid(::comm_cart, meshes, local_mesh.dim); } -void FFTBackendLegacy::perform_fwd_fft() { - if (dipolar) { - std::array meshes = {{rs_mesh_fields[0u].data(), - rs_mesh_fields[1u].data(), - rs_mesh_fields[2u].data()}}; - mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim); - for (auto &rs_mesh_field : rs_mesh_fields) { - fft->forward_fft(::comm_cart, rs_mesh_field.data()); - } - } else { - mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim); - fft->forward_fft(::comm_cart, rs_mesh.data()); +template +void FFTBackendLegacy::perform_scalar_halo_gather() { + mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim); +} + +template +void FFTBackendLegacy::perform_scalar_fwd_fft() { + fft->forward_fft(::comm_cart, rs_mesh.data()); + update_mesh_data(); +} + +template +void FFTBackendLegacy::perform_vector_halo_gather() { + std::array meshes = {{rs_mesh_fields[0u].data(), + rs_mesh_fields[1u].data(), + rs_mesh_fields[2u].data()}}; + mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim); +} + +template +void FFTBackendLegacy::perform_vector_fwd_fft() { + for (auto &rs_mesh_field : rs_mesh_fields) { + fft->forward_fft(::comm_cart, rs_mesh_field.data()); } update_mesh_data(); } -void FFTBackendLegacy::perform_space_back_fft() { - /* Back FFT force component mesh */ +template +void FFTBackendLegacy::perform_scalar_back_fft() { fft->backward_fft(::comm_cart, rs_mesh.data(), check_complex_residuals); - /* redistribute force component mesh */ +} + +template +void FFTBackendLegacy::perform_scalar_halo_spread() { mesh_comm.spread_grid(::comm_cart, rs_mesh.data(), local_mesh.dim); } +template class FFTBackendLegacy; +template class FFTBackendLegacy; + #endif // defined(P3M) or defined(DP3M) diff --git a/src/core/p3m/FFTBackendLegacy.hpp b/src/core/p3m/FFTBackendLegacy.hpp index 2ae1a379aef..bf2dd92f7b2 100644 --- a/src/core/p3m/FFTBackendLegacy.hpp +++ b/src/core/p3m/FFTBackendLegacy.hpp @@ -34,33 +34,48 @@ #include #include #include +#include namespace fft { -struct fft_data_struct; +template struct fft_data_struct; } // namespace fft /** * @brief Historic FFT backend based on FFTW3. * The 3D FFT is split into three 1D FFTs. */ -class FFTBackendLegacy : public FFTBackend { +template +class FFTBackendLegacy : public FFTBackend { + static_assert(std::is_same_v or + std::is_same_v, + "FFTW only implements float and double"); bool dipolar; - std::unique_ptr fft; + std::unique_ptr> fft; /** @brief k-space mesh (local) for k-space calculations. */ - std::vector ks_mesh; + std::vector ks_mesh; /** @brief real-space mesh (local) for CA/FFT. */ - fft::vector rs_mesh; + fft::vector rs_mesh; /** @brief real-space mesh (local) for the electric or dipolar field. 
*/ - std::array, 3> rs_mesh_fields; - p3m_send_mesh mesh_comm; + std::array, 3> rs_mesh_fields; + p3m_send_mesh mesh_comm; + using FFTBackend::params; + using FFTBackend::mesh; + using FFTBackend::local_mesh; + using FFTBackend::check_complex_residuals; public: - explicit FFTBackendLegacy(p3m_data_struct &obj, bool dipolar); + FFTBackendLegacy(p3m_data_struct_fft &obj, bool dipolar); ~FFTBackendLegacy() override; void init_fft() override; - void perform_fwd_fft() override; - void perform_field_back_fft() override; - void perform_space_back_fft() override; + void init_halo() override; + void perform_scalar_fwd_fft() override; + void perform_vector_fwd_fft() override; + void perform_scalar_back_fft() override; + void perform_vector_back_fft() override; + void perform_scalar_halo_gather() override; + void perform_vector_halo_gather() override; + void perform_scalar_halo_spread() override; + void perform_vector_halo_spread() override; void update_mesh_data(); /** diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp index af9a9af5959..4a4b88b22f3 100644 --- a/src/core/p3m/common.hpp +++ b/src/core/p3m/common.hpp @@ -53,6 +53,8 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0; #include #include +enum class Arch { CPU, GPU }; + /** @brief Structure to hold P3M parameters and some dependent variables. */ struct P3MParameters { /** tuning or production? */ @@ -216,13 +218,13 @@ struct P3MLocalMesh { }; /** @brief Local mesh FFT buffers. */ -struct P3MFFTMesh { +template struct P3MFFTMesh { /** @brief k-space scalar mesh for k-space calculations. */ - std::span ks_scalar; + std::span ks_scalar; /** @brief real-space scalar mesh for charge assignment and FFT. */ - std::span rs_scalar; + std::span rs_scalar; /** @brief real-space 3D meshes for the electric or dipolar field. */ - std::array, 3> rs_fields; + std::array, 3> rs_fields; /** @brief Indices of the lower left corner of the local mesh grid. */ Utils::Vector3i start; diff --git a/src/core/p3m/data_struct.hpp b/src/core/p3m/data_struct.hpp index 9b607031991..7669a5b359c 100644 --- a/src/core/p3m/data_struct.hpp +++ b/src/core/p3m/data_struct.hpp @@ -34,7 +34,7 @@ #include #include -class FFTBackend; +template class FFTBackend; /** * @brief Base class for the electrostatics and magnetostatics P3M algorithms. @@ -49,21 +49,12 @@ struct p3m_data_struct { P3MParameters params; /** @brief Local mesh properties. */ P3MLocalMesh local_mesh; - /** @brief Local mesh FFT buffers. */ - P3MFFTMesh mesh; /** * @brief Spatial differential operator in k-space. * We use an i*k differentiation. */ std::array, 3> d_op; - /** Force optimised influence function (k-space) */ - std::vector g_force; - /** Energy optimised influence function (k-space) */ - std::vector g_energy; - - /** FFT backend. */ - std::unique_ptr fft; /** Calculate the Fourier transformed differential operator. * Remark: This is done on the level of n-vectors and not k-vectors, @@ -72,6 +63,21 @@ struct p3m_data_struct { void calc_differential_operator() { d_op = detail::calc_meshift(params.mesh, true); } +}; + +template +struct p3m_data_struct_fft : public p3m_data_struct { + using p3m_data_struct::p3m_data_struct; + using value_type = FloatType; + /** @brief Local mesh FFT buffers. */ + P3MFFTMesh mesh; + + /** Force optimised influence function (k-space) */ + std::vector g_force; + /** Energy optimised influence function (k-space) */ + std::vector g_energy; + /** FFT backend. */ + std::unique_ptr> fft; template void make_fft_instance(Args... 
args) { assert(fft == nullptr); @@ -85,25 +91,37 @@ struct p3m_data_struct { * The backend can read some members of @ref p3m_data_struct * but can only modify the FFT buffers in @ref P3MFFTMesh. */ -class FFTBackend { +template class FFTBackend { protected: P3MParameters const ¶ms; P3MLocalMesh const &local_mesh; - P3MFFTMesh &mesh; + P3MFFTMesh &mesh; public: bool check_complex_residuals = false; - explicit FFTBackend(p3m_data_struct &obj) + explicit FFTBackend(p3m_data_struct_fft &obj) : params{obj.params}, local_mesh{obj.local_mesh}, mesh{obj.mesh} {} virtual ~FFTBackend() = default; /** @brief Initialize the FFT plans and buffers. */ virtual void init_fft() = 0; - /** @brief Carry out the forward FFT of the scalar or field meshes. */ - virtual void perform_fwd_fft() = 0; + /** @brief Initialize the halo buffers. */ + virtual void init_halo() = 0; + /** @brief Carry out the forward FFT of the scalar mesh. */ + virtual void perform_scalar_fwd_fft() = 0; + /** @brief Carry out the forward FFT of the vector meshes. */ + virtual void perform_vector_fwd_fft() = 0; /** @brief Carry out the backward FFT of the scalar mesh. */ - virtual void perform_field_back_fft() = 0; - /** @brief Carry out the backward FFT of the field meshes. */ - virtual void perform_space_back_fft() = 0; + virtual void perform_scalar_back_fft() = 0; + /** @brief Carry out the backward FFT of the vector meshes. */ + virtual void perform_vector_back_fft() = 0; + /** @brief Update scalar mesh halo with data from neighbors (accumulation). */ + virtual void perform_scalar_halo_gather() = 0; + /** @brief Update vector mesh halo with data from neighbors (accumulation). */ + virtual void perform_vector_halo_gather() = 0; + /** @brief Update scalar mesh halo of all neighbors. */ + virtual void perform_scalar_halo_spread() = 0; + /** @brief Update vector mesh halo of all neighbors. */ + virtual void perform_vector_halo_spread() = 0; /** @brief Get indices of the k-space data layout. */ virtual std::tuple get_permutations() const = 0; }; diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp index eb259d89d92..c3373a88d15 100644 --- a/src/core/p3m/influence_function.hpp +++ b/src/core/p3m/influence_function.hpp @@ -110,19 +110,17 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k, * @param inv_box_l Inverse box length. * @return Values of G_opt at regular grid points. */ -template -std::vector grid_influence_function(P3MParameters const ¶ms, - Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, - int const KX, int const KY, - int const KZ, - Utils::Vector3d const &inv_box_l) { +template +std::vector grid_influence_function( + P3MParameters const ¶ms, Utils::Vector3i const &n_start, + Utils::Vector3i const &n_stop, int const KX, int const KY, int const KZ, + Utils::Vector3d const &inv_box_l) { auto const shifts = detail::calc_meshift(params.mesh); auto const size = n_stop - n_start; /* The influence function grid */ - auto g = std::vector(Utils::product(size), 0.); + auto g = std::vector(Utils::product(size), FloatType(0)); /* Skip influence function calculation in tuning mode, the results need not be correct for timing. 
*/ @@ -143,7 +141,7 @@ std::vector grid_influence_function(P3MParameters const ¶ms, Utils::Vector3d{{shifts[0u][indices[KX]] * wavevector[0u], shifts[1u][indices[KY]] * wavevector[1u], shifts[2u][indices[KZ]] * wavevector[2u]}}; - g[index] = G_opt(params.cao, params.alpha, k, params.a); + g[index] = FloatType(G_opt(params.cao, params.alpha, k, params.a)); } ++index; }); diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp index d52a1a6d675..74af200a208 100644 --- a/src/core/p3m/influence_function_dipolar.hpp +++ b/src/core/p3m/influence_function_dipolar.hpp @@ -105,16 +105,15 @@ double G_opt_dipolar(P3MParameters const ¶ms, Utils::Vector3i const &shift, * @param inv_box_l Inverse box length * @return Values of the influence function at regular grid points. */ -template -std::vector grid_influence_function(P3MParameters const ¶ms, - Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, - Utils::Vector3d const &inv_box_l) { +template +std::vector grid_influence_function( + P3MParameters const ¶ms, Utils::Vector3i const &n_start, + Utils::Vector3i const &n_stop, Utils::Vector3d const &inv_box_l) { auto const size = n_stop - n_start; /* The influence function grid */ - auto g = std::vector(Utils::product(size), 0.); + auto g = std::vector(Utils::product(size), FloatType(0)); /* Skip influence function calculation in tuning mode, the results need not be correct for timing. */ @@ -138,7 +137,8 @@ std::vector grid_influence_function(P3MParameters const ¶ms, [&]() { if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or (indices[2] % half_mesh != 0))) { - g[index] = prefactor * G_opt_dipolar(params, shift_off, d_op_off); + g[index] = FloatType(prefactor * + G_opt_dipolar(params, shift_off, d_op_off)); } ++index; }, @@ -183,9 +183,10 @@ inline double G_opt_dipolar_self_energy(P3MParameters const ¶ms, * @param g Energies on the grid. * @return Total self-energy. */ +template inline double grid_influence_function_self_energy( P3MParameters const ¶ms, Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, std::vector const &g) { + Utils::Vector3i const &n_stop, std::vector const &g) { auto const offset = detail::calc_meshift(params.mesh, false)[0]; auto const d_op = detail::calc_meshift(params.mesh, true)[0]; @@ -202,7 +203,7 @@ inline double grid_influence_function_self_energy( if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or (indices[2] % half_mesh != 0))) { auto const U2 = G_opt_dipolar_self_energy(params, shift_off); - energy += g[index] * U2 * d_op_off.norm2(); + energy += double(g[index]) * U2 * d_op_off.norm2(); } ++index; }, diff --git a/src/core/p3m/send_mesh.cpp b/src/core/p3m/send_mesh.cpp index 9bb377a7688..47079361960 100644 --- a/src/core/p3m/send_mesh.cpp +++ b/src/core/p3m/send_mesh.cpp @@ -31,13 +31,25 @@ #include #include +#include #include +#include #include #include #include +template +static void mesh_sendrecv(T const *const sendbuf, int scount, int dest, + T *const recvbuf, int rcount, int source, + boost::mpi::communicator const &comm, int tag) { + auto const type = boost::mpi::get_mpi_datatype(*sendbuf); + MPI_Sendrecv(reinterpret_cast(sendbuf), scount, type, dest, tag, + reinterpret_cast(recvbuf), rcount, type, source, tag, + comm, MPI_STATUS_IGNORE); +} + /** Add values of a 3d-grid input block (size[3]) to values of 3d-grid * output array with dimension dim[3] at start position start[3]. 
* @@ -47,8 +59,10 @@ * \param size Dimensions of the block * \param dim Dimensions of the output grid. */ -static void p3m_add_block(double const *in, double *out, int const start[3], - int const size[3], int const dim[3]) { +template +static void p3m_add_block(FloatType const *in, FloatType *out, + int const start[3], int const size[3], + int const dim[3]) { /* fast, mid and slow changing indices */ int f, m, s; /* linear index of in grid, linear index of out grid */ @@ -71,8 +85,9 @@ static void p3m_add_block(double const *in, double *out, int const start[3], } } -void p3m_send_mesh::resize(boost::mpi::communicator const &comm, - P3MLocalMesh const &local_mesh) { +template +void p3m_send_mesh::resize(boost::mpi::communicator const &comm, + P3MLocalMesh const &local_mesh) { int done[3] = {0, 0, 0}; /* send grids */ for (int i = 0; i < 3; i++) { @@ -101,8 +116,7 @@ void p3m_send_mesh::resize(boost::mpi::communicator const &comm, s_dim[i][j] = s_ur[i][j] - s_ld[i][j]; s_size[i] *= s_dim[i][j]; } - if (s_size[i] > max) - max = s_size[i]; + max = std::max(max, s_size[i]); } /* communication */ auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm); @@ -112,15 +126,14 @@ void p3m_send_mesh::resize(boost::mpi::communicator const &comm, auto const j = (i % 2 == 0) ? i + 1 : i - 1; if (node_neighbors[i] != comm.rank()) { - MPI_Sendrecv(&(local_mesh.margin[i]), 1, MPI_INT, node_neighbors[i], - REQ_P3M_INIT, &(r_margin[j]), 1, MPI_INT, node_neighbors[j], - REQ_P3M_INIT, comm, MPI_STATUS_IGNORE); + mesh_sendrecv(&(local_mesh.margin[i]), 1, node_neighbors[i], + &(r_margin[j]), 1, node_neighbors[j], comm, REQ_P3M_INIT); } else { r_margin[j] = local_mesh.margin[i]; } } /* recv grids */ - for (int i = 0; i < 3; i++) + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { if (j == i) { r_ld[i * 2][j] = s_ld[i * 2][j] + local_mesh.margin[2 * j]; @@ -135,20 +148,21 @@ void p3m_send_mesh::resize(boost::mpi::communicator const &comm, r_ur[(i * 2) + 1][j] = s_ur[(i * 2) + 1][j]; } } + } for (int i = 0; i < 6; i++) { r_size[i] = 1; for (int j = 0; j < 3; j++) { r_dim[i][j] = r_ur[i][j] - r_ld[i][j]; r_size[i] *= r_dim[i][j]; } - if (r_size[i] > max) - max = r_size[i]; + max = std::max(max, r_size[i]); } } -void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, - std::span meshes, - Utils::Vector3i const &dim) { +template +void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, + std::span meshes, + Utils::Vector3i const &dim) { auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm); send_grid.resize(max * meshes.size()); recv_grid.resize(max * meshes.size()); @@ -158,19 +172,20 @@ void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, auto const r_dir = (s_dir % 2 == 0) ? 
s_dir + 1 : s_dir - 1; /* pack send block */ - if (s_size[s_dir] > 0) + if (s_size[s_dir] > 0) { for (std::size_t i = 0; i < meshes.size(); i++) { fft::fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir], s_ld[s_dir], s_dim[s_dir], dim.data(), 1); } + } /* communication */ if (node_neighbors[s_dir] != comm.rank()) { - MPI_Sendrecv( - send_grid.data(), static_cast(meshes.size()) * s_size[s_dir], - MPI_DOUBLE, node_neighbors[s_dir], REQ_P3M_GATHER, recv_grid.data(), - static_cast(meshes.size()) * r_size[r_dir], MPI_DOUBLE, - node_neighbors[r_dir], REQ_P3M_GATHER, comm, MPI_STATUS_IGNORE); + auto const send_size = static_cast(meshes.size()) * s_size[s_dir]; + auto const recv_size = static_cast(meshes.size()) * r_size[r_dir]; + mesh_sendrecv(send_grid.data(), send_size, node_neighbors[s_dir], + recv_grid.data(), recv_size, node_neighbors[r_dir], comm, + REQ_P3M_GATHER); } else { std::swap(send_grid, recv_grid); } @@ -184,9 +199,10 @@ void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, } } -void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, - std::span meshes, - Utils::Vector3i const &dim) { +template +void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, + std::span meshes, + Utils::Vector3i const &dim) { auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm); send_grid.resize(max * meshes.size()); recv_grid.resize(max * meshes.size()); @@ -196,22 +212,23 @@ void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, auto const r_dir = (s_dir % 2 == 0) ? s_dir + 1 : s_dir - 1; /* pack send block */ - if (r_size[r_dir] > 0) + if (r_size[r_dir] > 0) { for (std::size_t i = 0; i < meshes.size(); i++) { fft::fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir], r_ld[r_dir], r_dim[r_dir], dim.data(), 1); } + } /* communication */ if (node_neighbors[r_dir] != comm.rank()) { - MPI_Sendrecv( - send_grid.data(), r_size[r_dir] * static_cast(meshes.size()), - MPI_DOUBLE, node_neighbors[r_dir], REQ_P3M_SPREAD, recv_grid.data(), - s_size[s_dir] * static_cast(meshes.size()), MPI_DOUBLE, - node_neighbors[s_dir], REQ_P3M_SPREAD, comm, MPI_STATUS_IGNORE); + auto const send_size = static_cast(meshes.size()) * r_size[r_dir]; + auto const recv_size = static_cast(meshes.size()) * s_size[s_dir]; + mesh_sendrecv(send_grid.data(), send_size, node_neighbors[r_dir], + recv_grid.data(), recv_size, node_neighbors[s_dir], comm, + REQ_P3M_SPREAD); } else { std::swap(send_grid, recv_grid); } - /* un pack recv block */ + /* unpack recv block */ if (s_size[s_dir] > 0) { for (std::size_t i = 0; i < meshes.size(); i++) { fft::fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i], @@ -221,4 +238,7 @@ void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, } } +template class p3m_send_mesh; +template class p3m_send_mesh; + #endif // defined(P3M) or defined(DP3M) diff --git a/src/core/p3m/send_mesh.hpp b/src/core/p3m/send_mesh.hpp index 8fa73435880..fbbb1126579 100644 --- a/src/core/p3m/send_mesh.hpp +++ b/src/core/p3m/send_mesh.hpp @@ -34,8 +34,8 @@ #include #include -/** Structure for send/recv meshes. */ -class p3m_send_mesh { +/** @brief P3M halo communicator. */ +template class p3m_send_mesh { enum Requests { REQ_P3M_INIT = 200, REQ_P3M_GATHER = 201, @@ -43,17 +43,17 @@ class p3m_send_mesh { }; /** dimension of sub meshes to send. */ int s_dim[6][3]; - /** left down corners of sub meshes to send. */ + /** lower left corners of sub meshes to send. */ int s_ld[6][3]; - /** up right corners of sub meshes to send. 
*/ + /** upper right corners of sub meshes to send. */ int s_ur[6][3]; /** sizes for send buffers. */ int s_size[6]; /** dimension of sub meshes to recv. */ int r_dim[6][3]; - /** left down corners of sub meshes to recv. */ + /** lower left corners of sub meshes to recv. */ int r_ld[6][3]; - /** up right corners of sub meshes to recv. */ + /** upper right corners of sub meshes to recv. */ int r_ur[6][3]; /** sizes for recv buffers. */ int r_size[6]; @@ -61,22 +61,22 @@ class p3m_send_mesh { int max; /** vector to store grid points to send. */ - std::vector send_grid; + std::vector send_grid; /** vector to store grid points to recv */ - std::vector recv_grid; + std::vector recv_grid; public: void resize(boost::mpi::communicator const &comm, P3MLocalMesh const &local_mesh); void gather_grid(boost::mpi::communicator const &comm, - std::span meshes, Utils::Vector3i const &dim); - void gather_grid(boost::mpi::communicator const &comm, double *mesh, + std::span meshes, Utils::Vector3i const &dim); + void gather_grid(boost::mpi::communicator const &comm, FloatType *mesh, Utils::Vector3i const &dim) { gather_grid(comm, std::span(&mesh, 1u), dim); } void spread_grid(boost::mpi::communicator const &comm, - std::span meshes, Utils::Vector3i const &dim); - void spread_grid(boost::mpi::communicator const &comm, double *mesh, + std::span meshes, Utils::Vector3i const &dim); + void spread_grid(boost::mpi::communicator const &comm, FloatType *mesh, Utils::Vector3i const &dim) { spread_grid(comm, std::span(&mesh, 1u), dim); } diff --git a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp index a5f22987bd8..b14ecd19bb7 100644 --- a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp +++ b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp @@ -41,10 +41,13 @@ namespace utf = boost::unit_test; #include "cuda/utils.hpp" #include "electrostatics/coulomb.hpp" #include "electrostatics/p3m.hpp" +#include "electrostatics/p3m.impl.hpp" #include "galilei/Galilei.hpp" #include "integrate.hpp" #include "integrators/Propagation.hpp" #include "magnetostatics/dipoles.hpp" +#include "magnetostatics/dp3m.hpp" +#include "magnetostatics/dp3m.impl.hpp" #include "nonbonded_interactions/lj.hpp" #include "nonbonded_interactions/nonbonded_interaction_data.hpp" #include "observables/ParticleVelocities.hpp" @@ -299,8 +302,8 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) { #ifdef P3M { // add charges - set_particle_property(pid1, &Particle::q, +1.); - set_particle_property(pid2, &Particle::q, -1.); + set_particle_property(pid1, &Particle::q, +0.5); + set_particle_property(pid2, &Particle::q, -0.5); // set up P3M auto const prefactor = 2.; @@ -312,13 +315,12 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) { 5, 0.615, 1e-3}; - auto solver = - std::make_shared(std::move(p3m), prefactor, 1, false, true); - solver->p3m.make_fft_instance(false); + auto solver = new_p3m_handle( + std::move(p3m), prefactor, 1, false, true); add_actor(comm, espresso::system, system.coulomb.impl->solver, solver, [&system]() { system.on_coulomb_change(); }); - // measure energies + // measure energies and forces auto const step = 0.02; auto const pos1 = start_positions.at(pid1); Utils::Vector3d pos2{box_center, box_center - 0.1, 1.0}; @@ -327,19 +329,95 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) { pos2[0] += step; set_particle_pos(pid2, pos2); auto const r = (pos2 - pos1).norm(); - // check P3M energy auto const 
obs_energy = system.calculate_energy(); + system.integrate(0, INTEG_REUSE_FORCES_CONDITIONALLY); if (rank == 0) { // at very short distances, the real-space contribution to // the energy is much larger than the k-space contribution - auto const energy_ref = -prefactor / r; + auto const energy_ref = -prefactor * 0.25 / r; auto const energy_p3m = obs_energy->coulomb[0] + obs_energy->coulomb[1]; BOOST_CHECK_CLOSE(energy_p3m, energy_ref, 0.002); + auto pf = get_particle_property(pid1, &Particle::force_and_torque); + BOOST_REQUIRE(pf.has_value()); + BOOST_CHECK_GE(std::abs(pf->f[0u]), 0.1); + BOOST_CHECK_LE(std::abs(pf->f[1u]), 1e-12); + BOOST_CHECK_LE(std::abs(pf->f[2u]), 1e-12); + BOOST_CHECK_EQUAL(pf->torque.norm(), 0.); } } + + // deactivate actor + solver->detach_system(espresso::system); + system.coulomb.impl->solver = std::nullopt; + system.on_coulomb_change(); + auto const obs_energy = system.calculate_energy(); + if (rank == 0) { + auto const energy_p3m = obs_energy->coulomb[0] + obs_energy->coulomb[1]; + BOOST_CHECK_EQUAL(energy_p3m, 0.); + } } #endif // P3M + // check magnetostatics +#ifdef DP3M + { + // add dipoles + set_particle_property(pid1, &Particle::dipm, +0.5); + set_particle_property(pid2, &Particle::dipm, -0.5); + + // set up DP3M + auto const prefactor = 2.; + auto p3m = P3MParameters{false, + 0.0, + 3.5, + Utils::Vector3i::broadcast(12), + Utils::Vector3d::broadcast(0.5), + 5, + 0.615, + 1e-3}; + auto solver = new_dp3m_handle( + std::move(p3m), prefactor, 1, false); + add_actor(comm, espresso::system, system.dipoles.impl->solver, solver, + [&system]() { system.on_dipoles_change(); }); + + // measure energies and forces + auto const step = 0.02; + auto const pos1 = start_positions.at(pid1); + Utils::Vector3d pos2{box_center, box_center - 0.1, 1.0}; + for (int i = 0; i < 10; ++i) { + // move particle + pos2[0] += step; + set_particle_pos(pid2, pos2); + auto const r = (pos2 - pos1).norm(); + auto const obs_energy = system.calculate_energy(); + system.integrate(0, INTEG_REUSE_FORCES_CONDITIONALLY); + if (rank == 0) { + // at very short distances, the real-space contribution to + // the energy is much larger than the k-space contribution + auto const energy_ref = -prefactor * 0.25 / Utils::int_pow<3>(r); + auto const energy_p3m = obs_energy->dipolar[0] + obs_energy->dipolar[1]; + BOOST_CHECK_CLOSE(energy_p3m, energy_ref, 0.002); + auto pf = get_particle_property(pid1, &Particle::force_and_torque); + BOOST_REQUIRE(pf.has_value()); + BOOST_CHECK_GE(std::abs(pf->f[0u]), 0.1); + BOOST_CHECK_LE(std::abs(pf->f[1u]), 1e-12); + BOOST_CHECK_LE(std::abs(pf->f[2u]), 1e-12); + BOOST_CHECK_LE(pf->torque.norm(), 1e-12); + } + } + + // deactivate actor + solver->detach_system(espresso::system); + system.dipoles.impl->solver = std::nullopt; + system.on_dipoles_change(); + auto const obs_energy = system.calculate_energy(); + if (rank == 0) { + auto const energy_p3m = obs_energy->dipolar[0] + obs_energy->dipolar[1]; + BOOST_CHECK_EQUAL(energy_p3m, 0.); + } + } +#endif // DP3M + // check integration { // set up velocity-Verlet integrator diff --git a/src/core/unit_tests/ParticleFactory.hpp b/src/core/unit_tests/ParticleFactory.hpp index 71f2d14bf1b..861087b0137 100644 --- a/src/core/unit_tests/ParticleFactory.hpp +++ b/src/core/unit_tests/ParticleFactory.hpp @@ -26,6 +26,7 @@ #include +#include <optional> /** Fixture to create particles during a test and remove them at the end. 
*/ @@ -61,15 +62,26 @@ struct ParticleFactory { } template <class T> - void set_particle_property(int p_id, T &(Particle::*setter)(), + void set_particle_property(int p_id, T &(Particle::*getter)(), T const &value) const { auto &system = System::get_system(); auto p = system.cell_structure->get_local_particle(p_id); if (p != nullptr) { - (p->*setter)() = value; + (p->*getter)() = value; } } + template <class T> + auto get_particle_property(int p_id, T &(Particle::*getter)()) const { + auto &system = System::get_system(); + auto p = system.cell_structure->get_local_particle(p_id); + std::optional<T> opt = std::nullopt; + if (p != nullptr) { + opt = (p->*getter)(); + } + return opt; + } + void clear_particles() { for (auto pid : particle_cache) { remove_particle(pid); diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp index 94cfc33f7fc..129e7fd9ea8 100644 --- a/src/core/unit_tests/fft_test.cpp +++ b/src/core/unit_tests/fft_test.cpp @@ -157,8 +157,8 @@ BOOST_AUTO_TEST_CASE(fft_map_grid) { BOOST_AUTO_TEST_CASE(fft_exceptions) { auto constexpr size_max = std::numeric_limits<std::size_t>::max(); - auto constexpr bad_size = size_max / sizeof(int) + 1ul; - fft::allocator allocator{}; + auto constexpr bad_size = size_max / sizeof(float) + 1ul; + fft::allocator<float> allocator{}; BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr); BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length); } diff --git a/src/python/espressomd/electrostatics.py b/src/python/espressomd/electrostatics.py index a197c56528f..87b23f0090c 100644 --- a/src/python/espressomd/electrostatics.py +++ b/src/python/espressomd/electrostatics.py @@ -219,12 +219,17 @@ class P3M(_P3MBase): check_complex_residuals: :obj:`bool`, optional Raise a warning if the backward Fourier transform has non-zero complex residuals when set to ``True`` (default). + single_precision : :obj:`bool` + Use single-precision floating-point arithmetic. """ _so_name = "Coulomb::CoulombP3M" _so_creation_policy = "GLOBAL" _so_features = ("P3M",) + def default_params(self): + return {"single_precision": False, **super().default_params()} + @script_interface_register class P3MGPU(_P3MBase): @@ -275,6 +280,9 @@ class P3MGPU(_P3MBase): _so_creation_policy = "GLOBAL" _so_features = ("P3M", "CUDA") + def default_params(self): + return {"single_precision": True, **super().default_params()} + @script_interface_register class ELC(ElectrostaticInteraction): diff --git a/src/python/espressomd/magnetostatics.py b/src/python/espressomd/magnetostatics.py index ca611d6b461..0132dd32a05 100644 --- a/src/python/espressomd/magnetostatics.py +++ b/src/python/espressomd/magnetostatics.py @@ -106,6 +106,8 @@ class DipolarP3M(MagnetostaticInteraction): (default is ``True``, i.e., activated). timings : :obj:`int` Number of force calculations during tuning. + single_precision : :obj:`bool` + Use single-precision floating-point arithmetic. 
""" _so_name = "Dipoles::DipolarP3M" @@ -152,6 +154,7 @@ def default_params(self): "epsilon": 0.0, "mesh_off": [0.5, 0.5, 0.5], "prefactor": 0., + "single_precision": False, "tune": True, "timings": 10, "verbose": True} diff --git a/src/script_interface/electrostatics/Actor_impl.hpp b/src/script_interface/electrostatics/Actor.impl.hpp similarity index 100% rename from src/script_interface/electrostatics/Actor_impl.hpp rename to src/script_interface/electrostatics/Actor.impl.hpp diff --git a/src/script_interface/electrostatics/CoulombP3M.hpp b/src/script_interface/electrostatics/CoulombP3M.hpp index 34ea03c20e1..78d1e4d8e24 100644 --- a/src/script_interface/electrostatics/CoulombP3M.hpp +++ b/src/script_interface/electrostatics/CoulombP3M.hpp @@ -26,6 +26,7 @@ #include "Actor.hpp" #include "core/electrostatics/p3m.hpp" +#include "core/electrostatics/p3m.impl.hpp" #include "core/p3m/FFTBackendLegacy.hpp" #include "script_interface/get_value.hpp" @@ -36,12 +37,26 @@ namespace ScriptInterface { namespace Coulomb { -class CoulombP3M : public Actor { +template +class CoulombP3M : public Actor, ::CoulombP3M> { bool m_tune; + bool m_single_precision; + +public: + using Base = Actor, ::CoulombP3M>; + using Base::actor; + using Base::add_parameters; + using Base::context; + +protected: + using Base::m_actor; + using Base::set_charge_neutrality_tolerance; public: CoulombP3M() { add_parameters({ + {"single_precision", AutoParameter::read_only, + [this]() { return m_single_precision; }}, {"alpha_L", AutoParameter::read_only, [this]() { return actor()->p3m.params.alpha_L; }}, {"r_cut_iL", AutoParameter::read_only, @@ -76,7 +91,12 @@ class CoulombP3M : public Actor { void do_construct(VariantMap const ¶ms) override { m_tune = get_value(params, "tune"); + m_single_precision = get_value(params, "single_precision"); context()->parallel_try_catch([&]() { + if (Architecture == Arch::GPU and not m_single_precision) { + throw std::invalid_argument( + "P3M GPU only implemented in single-precision mode"); + } auto p3m = P3MParameters{!get_value_or(params, "is_tuned", !m_tune), get_value(params, "epsilon"), get_value(params, "r_cut"), @@ -85,14 +105,26 @@ class CoulombP3M : public Actor { get_value(params, "cao"), get_value(params, "alpha"), get_value(params, "accuracy")}; - m_actor = std::make_shared( - std::move(p3m), get_value(params, "prefactor"), - get_value(params, "timings"), get_value(params, "verbose"), - get_value(params, "check_complex_residuals")); - m_actor->p3m.make_fft_instance(false); + make_handle(m_single_precision, std::move(p3m), + get_value(params, "prefactor"), + get_value(params, "timings"), + get_value(params, "verbose"), + get_value(params, "check_complex_residuals")); }); set_charge_neutrality_tolerance(params); } + +private: + template + void make_handle(bool single_precision, Args &&...args) { + if (single_precision) { + m_actor = new_p3m_handle( + std::forward(args)...); + } else { + m_actor = new_p3m_handle( + std::forward(args)...); + } + } }; } // namespace Coulomb diff --git a/src/script_interface/electrostatics/CoulombP3MGPU.hpp b/src/script_interface/electrostatics/CoulombP3MGPU.hpp deleted file mode 100644 index 187fde4308c..00000000000 --- a/src/script_interface/electrostatics/CoulombP3MGPU.hpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (C) 2022 The ESPResSo project - * - * This file is part of ESPResSo. 
- * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include "config/config.hpp" - -#ifdef P3M -#ifdef CUDA - -#include "Actor.hpp" - -#include "core/electrostatics/p3m_gpu.hpp" -#include "core/p3m/FFTBackendLegacy.hpp" - -#include "script_interface/get_value.hpp" - -#include -#include - -namespace ScriptInterface { -namespace Coulomb { - -class CoulombP3MGPU : public Actor { - bool m_tune; - -public: - CoulombP3MGPU() { - add_parameters({ - {"alpha_L", AutoParameter::read_only, - [this]() { return actor()->p3m.params.alpha_L; }}, - {"r_cut_iL", AutoParameter::read_only, - [this]() { return actor()->p3m.params.r_cut_iL; }}, - {"mesh", AutoParameter::read_only, - [this]() { return actor()->p3m.params.mesh; }}, - {"mesh_off", AutoParameter::read_only, - [this]() { return actor()->p3m.params.mesh_off; }}, - {"cao", AutoParameter::read_only, - [this]() { return actor()->p3m.params.cao; }}, - {"accuracy", AutoParameter::read_only, - [this]() { return actor()->p3m.params.accuracy; }}, - {"epsilon", AutoParameter::read_only, - [this]() { return actor()->p3m.params.epsilon; }}, - {"a", AutoParameter::read_only, - [this]() { return actor()->p3m.params.a; }}, - {"alpha", AutoParameter::read_only, - [this]() { return actor()->p3m.params.alpha; }}, - {"r_cut", AutoParameter::read_only, - [this]() { return actor()->p3m.params.r_cut; }}, - {"is_tuned", AutoParameter::read_only, - [this]() { return actor()->is_tuned(); }}, - {"verbose", AutoParameter::read_only, - [this]() { return actor()->tune_verbose; }}, - {"timings", AutoParameter::read_only, - [this]() { return actor()->tune_timings; }}, - {"tune", AutoParameter::read_only, [this]() { return m_tune; }}, - {"check_complex_residuals", AutoParameter::read_only, - [this]() { return actor()->check_complex_residuals; }}, - }); - } - - void do_construct(VariantMap const ¶ms) override { - m_tune = get_value(params, "tune"); - context()->parallel_try_catch([&]() { - auto p3m = P3MParameters{!get_value_or(params, "is_tuned", !m_tune), - get_value(params, "epsilon"), - get_value(params, "r_cut"), - get_value(params, "mesh"), - get_value(params, "mesh_off"), - get_value(params, "cao"), - get_value(params, "alpha"), - get_value(params, "accuracy")}; - m_actor = std::make_shared( - std::move(p3m), get_value(params, "prefactor"), - get_value(params, "timings"), get_value(params, "verbose"), - get_value(params, "check_complex_residuals")); - m_actor->p3m.make_fft_instance(false); // for CPU part - }); - set_charge_neutrality_tolerance(params); - } -}; - -} // namespace Coulomb -} // namespace ScriptInterface - -#endif // CUDA -#endif // P3M diff --git a/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp b/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp index b009a5bf204..b31ff09d45e 100644 --- a/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp +++ b/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp @@ 
-45,9 +45,9 @@ class ElectrostaticLayerCorrection using BaseSolver = boost::variant< #ifdef CUDA - std::shared_ptr, + std::shared_ptr>, #endif // CUDA - std::shared_ptr>; + std::shared_ptr>>; BaseSolver m_solver; void on_bind_system(::System::System &system) override { @@ -88,19 +88,20 @@ class ElectrostaticLayerCorrection auto so_ptr = get_value(params, "actor"); context()->parallel_try_catch([&]() { #ifdef CUDA - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else + if (auto so = std::dynamic_pointer_cast>(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } #endif // CUDA - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else { - throw std::invalid_argument("Parameter 'actor' of type " + - so_ptr->name().to_string() + - " isn't supported by ELC"); - } + if (auto so = std::dynamic_pointer_cast>(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } + throw std::invalid_argument("Parameter 'actor' of type " + + so_ptr->name().to_string() + + " isn't supported by ELC"); }); context()->parallel_try_catch([&]() { auto elc = elc_data{get_value(params, "maxPWerror"), diff --git a/src/script_interface/electrostatics/initialize.cpp b/src/script_interface/electrostatics/initialize.cpp index 476ea4ca9c5..b34c086c2a3 100644 --- a/src/script_interface/electrostatics/initialize.cpp +++ b/src/script_interface/electrostatics/initialize.cpp @@ -23,12 +23,11 @@ #ifdef ELECTROSTATICS -#include "Actor_impl.hpp" +#include "Actor.impl.hpp" #include "Container.hpp" #include "CoulombMMM1D.hpp" #include "CoulombP3M.hpp" -#include "CoulombP3MGPU.hpp" #include "CoulombScafacos.hpp" #include "DebyeHueckel.hpp" #include "ElectrostaticLayerCorrection.hpp" @@ -50,9 +49,9 @@ void initialize(Utils::Factory *om) { #ifdef ELECTROSTATICS om->register_new("Coulomb::DebyeHueckel"); #ifdef P3M - om->register_new("Coulomb::CoulombP3M"); + om->register_new>("Coulomb::CoulombP3M"); #ifdef CUDA - om->register_new("Coulomb::CoulombP3MGPU"); + om->register_new>("Coulomb::CoulombP3MGPU"); #endif om->register_new( "Coulomb::ElectrostaticLayerCorrection"); diff --git a/src/script_interface/magnetostatics/Actor_impl.hpp b/src/script_interface/magnetostatics/Actor.impl.hpp similarity index 100% rename from src/script_interface/magnetostatics/Actor_impl.hpp rename to src/script_interface/magnetostatics/Actor.impl.hpp diff --git a/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp b/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp index 3a32a80bc44..55cce422e52 100644 --- a/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp +++ b/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp @@ -45,7 +45,7 @@ class DipolarLayerCorrection using DipolarDSR = DipolarDirectSum; using BaseSolver = boost::variant< #ifdef DP3M - std::shared_ptr, + std::shared_ptr>, #endif std::shared_ptr>; BaseSolver m_solver; @@ -78,19 +78,20 @@ class DipolarLayerCorrection auto so_ptr = get_value(params, "actor"); context()->parallel_try_catch([&]() { #ifdef DP3M - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else + if (auto so = std::dynamic_pointer_cast>(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } #endif // DP3M - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else { - throw 
std::invalid_argument("Parameter 'actor' of type " + - so_ptr->name().to_string() + - " isn't supported by DLC"); - } + if (auto so = std::dynamic_pointer_cast(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } + throw std::invalid_argument("Parameter 'actor' of type " + + so_ptr->name().to_string() + + " isn't supported by DLC"); }); context()->parallel_try_catch([&]() { auto dlc = dlc_data(get_value(params, "maxPWerror"), diff --git a/src/script_interface/magnetostatics/DipolarP3M.hpp b/src/script_interface/magnetostatics/DipolarP3M.hpp index 7474345e9d4..20dff5968ee 100644 --- a/src/script_interface/magnetostatics/DipolarP3M.hpp +++ b/src/script_interface/magnetostatics/DipolarP3M.hpp @@ -26,6 +26,7 @@ #include "Actor.hpp" #include "core/magnetostatics/dp3m.hpp" +#include "core/magnetostatics/dp3m.impl.hpp" #include "core/p3m/FFTBackendLegacy.hpp" #include "script_interface/get_value.hpp" @@ -35,12 +36,25 @@ namespace ScriptInterface { namespace Dipoles { -class DipolarP3M : public Actor { +template +class DipolarP3M : public Actor, ::DipolarP3M> { bool m_tune; + bool m_single_precision; + +public: + using Base = Actor, ::DipolarP3M>; + using Base::actor; + using Base::add_parameters; + using Base::context; + +protected: + using Base::m_actor; public: DipolarP3M() { add_parameters({ + {"single_precision", AutoParameter::read_only, + [this]() { return m_single_precision; }}, {"alpha_L", AutoParameter::read_only, [this]() { return actor()->dp3m.params.alpha_L; }}, {"r_cut_iL", AutoParameter::read_only, @@ -73,6 +87,8 @@ class DipolarP3M : public Actor { void do_construct(VariantMap const ¶ms) override { m_tune = get_value(params, "tune"); + m_single_precision = get_value(params, "single_precision"); + static_assert(Architecture == Arch::CPU, "GPU not implemented"); context()->parallel_try_catch([&]() { auto p3m = P3MParameters{!get_value_or(params, "is_tuned", !m_tune), get_value(params, "epsilon"), @@ -82,13 +98,24 @@ class DipolarP3M : public Actor { get_value(params, "cao"), get_value(params, "alpha"), get_value(params, "accuracy")}; - m_actor = std::make_shared( - std::move(p3m), get_value(params, "prefactor"), - get_value(params, "timings"), - get_value(params, "verbose")); - m_actor->dp3m.make_fft_instance(true); + make_handle(m_single_precision, std::move(p3m), + get_value(params, "prefactor"), + get_value(params, "timings"), + get_value(params, "verbose")); }); } + +private: + template + void make_handle(bool single_precision, Args &&...args) { + if (single_precision) { + m_actor = new_dp3m_handle( + std::forward(args)...); + } else { + m_actor = new_dp3m_handle( + std::forward(args)...); + } + } }; } // namespace Dipoles diff --git a/src/script_interface/magnetostatics/initialize.cpp b/src/script_interface/magnetostatics/initialize.cpp index 04b22818ed5..26265863b05 100644 --- a/src/script_interface/magnetostatics/initialize.cpp +++ b/src/script_interface/magnetostatics/initialize.cpp @@ -16,13 +16,14 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ + #include "initialize.hpp" #include "config/config.hpp" #ifdef DIPOLES -#include "Actor_impl.hpp" +#include "Actor.impl.hpp" #include "Container.hpp" #include "DipolarDirectSum.hpp" @@ -49,7 +50,7 @@ void initialize(Utils::Factory *om) { om->register_new("Dipoles::DipolarDirectSumGpu"); #endif #ifdef DP3M - om->register_new("Dipoles::DipolarP3M"); + om->register_new>("Dipoles::DipolarP3M"); #endif #ifdef SCAFACOS_DIPOLES om->register_new("Dipoles::DipolarScafacos"); diff --git a/src/script_interface/tests/Actors_test.cpp b/src/script_interface/tests/Actors_test.cpp index 591864587f8..e02364c8e22 100644 --- a/src/script_interface/tests/Actors_test.cpp +++ b/src/script_interface/tests/Actors_test.cpp @@ -28,9 +28,9 @@ #include #include "script_interface/electrostatics/Actor.hpp" -#include "script_interface/electrostatics/Actor_impl.hpp" +#include "script_interface/electrostatics/Actor.impl.hpp" #include "script_interface/magnetostatics/Actor.hpp" -#include "script_interface/magnetostatics/Actor_impl.hpp" +#include "script_interface/magnetostatics/Actor.impl.hpp" #include "script_interface/scafacos/scafacos.hpp" #include "core/actor/visitors.hpp" diff --git a/testsuite/python/coulomb_interface.py b/testsuite/python/coulomb_interface.py index fc40860e4a2..55b89100b60 100644 --- a/testsuite/python/coulomb_interface.py +++ b/testsuite/python/coulomb_interface.py @@ -159,6 +159,7 @@ def test_mmm1d_cpu_tuning_exceptions(self): def test_elc_p3m_exceptions(self): P3M = espressomd.electrostatics.P3M ELC = espressomd.electrostatics.ELC + P3MGPU = espressomd.electrostatics.P3MGPU # create valid solvers dh = espressomd.electrostatics.DH(prefactor=1.2, kappa=0.8, r_cut=2.0) p3m_params = dict(prefactor=1., epsilon=0.1, accuracy=1e-2, @@ -173,6 +174,9 @@ def test_elc_p3m_exceptions(self): P3M(**{**p3m_params, 'timings': -2}) with self.assertRaisesRegex(ValueError, "Parameter 'mesh' has to be an integer or integer list of length 3"): P3M(**{**p3m_params, 'mesh': [8, 8]}) + if espressomd.has_features(["CUDA"]) and espressomd.gpu_available(): + with self.assertRaisesRegex(ValueError, "P3M GPU only implemented in single-precision mode"): + P3MGPU(single_precision=False, **p3m_params) with self.assertRaisesRegex(ValueError, "Parameter 'actor' of type Coulomb::ElectrostaticLayerCorrection isn't supported by ELC"): ELC(gap_size=2., maxPWerror=1., actor=elc) with self.assertRaisesRegex(ValueError, "Parameter 'actor' of type Coulomb::DebyeHueckel isn't supported by ELC"): diff --git a/testsuite/python/p3m_madelung.py b/testsuite/python/p3m_madelung.py index f2a01e08bd6..af98348672d 100644 --- a/testsuite/python/p3m_madelung.py +++ b/testsuite/python/p3m_madelung.py @@ -98,13 +98,16 @@ def check(): energy_per_dip = self.get_normalized_obs_per_dipole(dipm, spacing) np.testing.assert_allclose(energy_per_dip, mc, atol=0., rtol=2e-9) - dp3m = espressomd.magnetostatics.DipolarP3M(**dp3m_params) - self.system.magnetostatics.solver = dp3m - check() - self.system.magnetostatics.clear() - mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) - self.system.magnetostatics.solver = mdlc - check() + for single_precision in [False, True]: + with self.subTest(msg=f"DP3M CPU {single_precision=}"): + dp3m = espressomd.magnetostatics.DipolarP3M( + single_precision=single_precision, **dp3m_params) + self.system.magnetostatics.solver = dp3m + check() + self.system.magnetostatics.clear() + mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) + self.system.magnetostatics.solver = mdlc + check() 
@utx.skipIfMissingFeatures(["DP3M"]) def test_infinite_magnetic_sheet(self): @@ -142,13 +145,16 @@ def check(): energy_per_dip = self.get_normalized_obs_per_dipole(dipm, spacing) np.testing.assert_allclose(energy_per_dip, mc, atol=0., rtol=4e-6) - dp3m = espressomd.magnetostatics.DipolarP3M(**dp3m_params) - self.system.magnetostatics.solver = dp3m - check() - self.system.magnetostatics.clear() - mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) - self.system.magnetostatics.solver = mdlc - check() + for single_precision in [False, True]: + with self.subTest(msg=f"DP3M CPU {single_precision=}"): + dp3m = espressomd.magnetostatics.DipolarP3M( + single_precision=single_precision, **dp3m_params) + self.system.magnetostatics.solver = dp3m + check() + self.system.magnetostatics.clear() + mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) + self.system.magnetostatics.solver = mdlc + check() @utx.skipIfMissingFeatures(["DP3M"]) def test_infinite_magnetic_cube(self): @@ -186,9 +192,12 @@ def check(): energy_per_dip = self.get_normalized_obs_per_dipole(dipm, spacing) np.testing.assert_allclose(energy_per_dip, mc, atol=0., rtol=3e-4) - dp3m = espressomd.magnetostatics.DipolarP3M(**dp3m_params) - self.system.magnetostatics.solver = dp3m - check() + for single_precision in [False, True]: + with self.subTest(msg=f"DP3M CPU {single_precision=}"): + dp3m = espressomd.magnetostatics.DipolarP3M( + single_precision=single_precision, **dp3m_params) + self.system.magnetostatics.solver = dp3m + check() @utx.skipIfMissingFeatures(["P3M"]) def test_infinite_ionic_wire(self): @@ -212,13 +221,17 @@ def check(): np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, rtol=1e-6) - p3m = espressomd.electrostatics.P3M(**p3m_params) - self.system.electrostatics.solver = p3m - check() + for single_precision in [False, True]: + with self.subTest(msg=f"P3M CPU {single_precision=}"): + p3m = espressomd.electrostatics.P3M( + single_precision=single_precision, **p3m_params) + self.system.electrostatics.solver = p3m + check() if espressomd.has_features("CUDA") and espressomd.gpu_available(): - p3m = espressomd.electrostatics.P3MGPU(**p3m_params) - self.system.electrostatics.solver = p3m - check() + with self.subTest(msg=f"P3M GPU single_precision=True"): + p3m = espressomd.electrostatics.P3MGPU(**p3m_params) + self.system.electrostatics.solver = p3m + check() @utx.skipIfMissingFeatures(["P3M"]) def test_infinite_ionic_sheet(self): @@ -245,19 +258,23 @@ def check(pressure=True): np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, rtol=5e-7) - p3m = espressomd.electrostatics.P3M(**p3m_params) - self.system.electrostatics.solver = p3m - check(pressure=True) - elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) - self.system.electrostatics.solver = elc - check(pressure=False) + for single_precision in [False, True]: + with self.subTest(msg=f"P3M CPU {single_precision=}"): + p3m = espressomd.electrostatics.P3M( + single_precision=single_precision, **p3m_params) + self.system.electrostatics.solver = p3m + check(pressure=True) + elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) + self.system.electrostatics.solver = elc + check(pressure=False) if espressomd.has_features("CUDA") and espressomd.gpu_available(): - p3m = espressomd.electrostatics.P3MGPU(**p3m_params) - self.system.electrostatics.solver = p3m - check(pressure=True) - elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) - self.system.electrostatics.solver = elc - check(pressure=False) + with 
self.subTest(msg=f"P3M GPU single_precision=True"): + p3m = espressomd.electrostatics.P3MGPU(**p3m_params) + self.system.electrostatics.solver = p3m + check(pressure=True) + elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) + self.system.electrostatics.solver = elc + check(pressure=False) @utx.skipIfMissingFeatures(["P3M"]) def test_infinite_ionic_cube(self): @@ -269,26 +286,30 @@ def test_infinite_ionic_cube(self): mc = -1.74756459463318219 base = [1., 1., 1.] ref_energy, ref_pressure = self.get_reference_obs_per_ion(mc, base) - p3m_params = dict(prefactor=1., accuracy=3e-7, mesh=52, r_cut=4.4, cao=7, - alpha=0.8895166, tune=False) + p3m_params = dict(prefactor=1., accuracy=1e-9, mesh=72, r_cut=5.6, + cao=7, alpha=0.8163977, tune=False) def check(): energy, p_scalar, p_tensor = self.get_normalized_obs_per_ion() np.testing.assert_allclose(energy, ref_energy, atol=0., rtol=1e-6) np.testing.assert_allclose(p_scalar, np.trace(ref_pressure) / 3., atol=1e-12, rtol=5e-6) - np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, + np.testing.assert_allclose(p_tensor, ref_pressure, atol=5e-9, rtol=5e-6) - p3m = espressomd.electrostatics.P3M(**p3m_params) - self.system.electrostatics.solver = p3m - check() + for single_precision in [False, True]: + with self.subTest(msg=f"P3M CPU {single_precision=}"): + p3m = espressomd.electrostatics.P3M( + single_precision=single_precision, **p3m_params) + self.system.electrostatics.solver = p3m + check() if espressomd.has_features("CUDA") and espressomd.gpu_available(): - p3m_params = dict(prefactor=1., accuracy=3e-7, mesh=42, r_cut=5.5, - cao=7, alpha=0.709017, tune=False) - p3m = espressomd.electrostatics.P3MGPU(**p3m_params) - self.system.electrostatics.solver = p3m - check() + with self.subTest(msg=f"P3M GPU single_precision=True"): + p3m_params = dict(prefactor=1., accuracy=3e-7, mesh=42, + r_cut=5.5, cao=7, alpha=0.709017, tune=False) + p3m = espressomd.electrostatics.P3MGPU(**p3m_params) + self.system.electrostatics.solver = p3m + check() if __name__ == "__main__":
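For reference, a minimal usage sketch of the new single_precision flag exposed by this patch. The system setup below (box size, particle placement, accuracy target) is illustrative only and not taken from the test suite; the P3M/P3MGPU constructors, the single_precision keyword and the solver assignment are the interfaces added or exercised above.

import espressomd
import espressomd.electrostatics

system = espressomd.System(box_l=[10., 10., 10.])
system.time_step = 0.01
system.cell_system.skin = 0.4
# two opposite charges, so the charge-neutrality check passes
system.part.add(pos=[[1., 1., 1.], [6., 6., 6.]], q=[1., -1.])

# CPU solver: double precision by default, single precision on request
p3m = espressomd.electrostatics.P3M(prefactor=1., accuracy=1e-3,
                                    single_precision=True)
system.electrostatics.solver = p3m

# GPU solver: single_precision defaults to True; passing False raises
# "P3M GPU only implemented in single-precision mode"
# p3m_gpu = espressomd.electrostatics.P3MGPU(prefactor=1., accuracy=1e-3)
# system.electrostatics.solver = p3m_gpu

DipolarP3M accepts the same flag and, like the CPU Coulomb solver, defaults to double precision (see the default_params() additions above).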