From 12835dbe6fc8f3b79e3c869b2c6e011580423261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 1 Aug 2024 23:41:32 +0200 Subject: [PATCH] progress --- cmake/FindFFTW3.cmake | 14 +- src/core/electrostatics/CMakeLists.txt | 1 - src/core/electrostatics/coulomb.cpp | 5 - src/core/electrostatics/coulomb.hpp | 4 - src/core/electrostatics/elc.cpp | 50 ++-- src/core/electrostatics/elc.hpp | 8 +- src/core/electrostatics/icc.cpp | 9 +- src/core/electrostatics/p3m.cpp | 229 +++++++++++------- src/core/electrostatics/p3m.hpp | 88 +++---- src/core/electrostatics/p3m.impl.hpp | 180 ++++++++++++++ src/core/electrostatics/p3m_gpu.cpp | 75 ------ src/core/electrostatics/p3m_gpu.hpp | 66 ----- src/core/electrostatics/p3m_gpu_cuda.cu | 39 +-- src/core/electrostatics/p3m_gpu_cuda.cuh | 2 +- src/core/fft/fft.cpp | 226 +++++++++++------ src/core/fft/fft.hpp | 51 ++-- src/core/fft/vector.hpp | 36 +-- src/core/forces.cpp | 1 - src/core/magnetostatics/dp3m.cpp | 189 ++++++++------- src/core/magnetostatics/dp3m.hpp | 90 +++---- src/core/magnetostatics/dp3m.impl.hpp | 138 +++++++++++ src/core/p3m/FFTBackendLegacy.cpp | 81 ++++--- src/core/p3m/FFTBackendLegacy.hpp | 37 ++- src/core/p3m/common.hpp | 10 +- src/core/p3m/data_struct.hpp | 54 +++-- src/core/p3m/influence_function.hpp | 16 +- src/core/p3m/influence_function_dipolar.hpp | 19 +- src/core/p3m/send_mesh.cpp | 82 ++++--- src/core/p3m/send_mesh.hpp | 24 +- .../EspressoSystemStandAlone_test.cpp | 94 ++++++- src/core/unit_tests/ParticleFactory.hpp | 16 +- src/core/unit_tests/fft_test.cpp | 4 +- src/python/espressomd/electrostatics.py | 8 + src/python/espressomd/magnetostatics.py | 3 + .../{Actor_impl.hpp => Actor.impl.hpp} | 0 .../electrostatics/CoulombP3M.hpp | 44 +++- .../electrostatics/CoulombP3MGPU.hpp | 103 -------- .../ElectrostaticLayerCorrection.hpp | 29 +-- .../electrostatics/initialize.cpp | 7 +- .../{Actor_impl.hpp => Actor.impl.hpp} | 0 .../magnetostatics/DipolarLayerCorrection.hpp | 27 ++- .../magnetostatics/DipolarP3M.hpp | 39 ++- .../magnetostatics/initialize.cpp | 5 +- src/script_interface/tests/Actors_test.cpp | 4 +- testsuite/python/coulomb_interface.py | 4 + testsuite/python/p3m_madelung.py | 113 +++++---- 46 files changed, 1378 insertions(+), 946 deletions(-) create mode 100644 src/core/electrostatics/p3m.impl.hpp delete mode 100644 src/core/electrostatics/p3m_gpu.cpp delete mode 100644 src/core/electrostatics/p3m_gpu.hpp create mode 100644 src/core/magnetostatics/dp3m.impl.hpp rename src/script_interface/electrostatics/{Actor_impl.hpp => Actor.impl.hpp} (100%) delete mode 100644 src/script_interface/electrostatics/CoulombP3MGPU.hpp rename src/script_interface/magnetostatics/{Actor_impl.hpp => Actor.impl.hpp} (100%) diff --git a/cmake/FindFFTW3.cmake b/cmake/FindFFTW3.cmake index a4bdae4f110..3864db4df24 100644 --- a/cmake/FindFFTW3.cmake +++ b/cmake/FindFFTW3.cmake @@ -31,25 +31,27 @@ if(FFTW3_INCLUDE_DIR) endif(FFTW3_INCLUDE_DIR) find_path(FFTW3_INCLUDE_DIR fftw3.h) -find_library(FFTW3_LIBRARIES NAMES fftw3) +find_library(FFTW3_LIBRARIES fftw3) +find_library(FFTW3F_LIBRARIES fftw3f) find_path(FFTW3_MPI_INCLUDE_DIR fftw3-mpi.h) -find_library(FFTW3_MPI_LIBRARIES NAMES fftw3_mpi) +find_library(FFTW3_MPI_LIBRARIES fftw3_mpi) +find_library(FFTW3F_MPI_LIBRARIES fftw3f_mpi) # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if all # listed variables are TRUE include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARIES 
+find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARIES FFTW3F_LIBRARIES FFTW3_INCLUDE_DIR) set(FPHSA_NAME_MISMATCHED 1) -find_package_handle_standard_args(FFTW3_MPI DEFAULT_MSG FFTW3_MPI_LIBRARIES +find_package_handle_standard_args(FFTW3_MPI DEFAULT_MSG FFTW3_MPI_LIBRARIES FFTW3F_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR) unset(FPHSA_NAME_MISMATCHED) -mark_as_advanced(FFTW3_LIBRARIES FFTW3_INCLUDE_DIR FFTW3_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR) +mark_as_advanced(FFTW3_LIBRARIES FFTW3F_LIBRARIES FFTW3_INCLUDE_DIR FFTW3_MPI_LIBRARIES FFTW3F_MPI_LIBRARIES FFTW3_MPI_INCLUDE_DIR) if(FFTW3_FOUND AND NOT TARGET FFTW3::FFTW3) add_library(FFTW3::FFTW3 INTERFACE IMPORTED) target_include_directories(FFTW3::FFTW3 INTERFACE "${FFTW3_INCLUDE_DIR}") - target_link_libraries(FFTW3::FFTW3 INTERFACE "${FFTW3_LIBRARIES}") + target_link_libraries(FFTW3::FFTW3 INTERFACE "${FFTW3_LIBRARIES}" "${FFTW3F_LIBRARIES}") endif() diff --git a/src/core/electrostatics/CMakeLists.txt b/src/core/electrostatics/CMakeLists.txt index e188a1d1065..1c56350e265 100644 --- a/src/core/electrostatics/CMakeLists.txt +++ b/src/core/electrostatics/CMakeLists.txt @@ -24,6 +24,5 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/mmm1d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mmm-modpsi.cpp ${CMAKE_CURRENT_SOURCE_DIR}/p3m.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/p3m_gpu.cpp ${CMAKE_CURRENT_SOURCE_DIR}/scafacos_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/specfunc.cpp) diff --git a/src/core/electrostatics/coulomb.cpp b/src/core/electrostatics/coulomb.cpp index b7bc1eac555..14282a8b17e 100644 --- a/src/core/electrostatics/coulomb.cpp +++ b/src/core/electrostatics/coulomb.cpp @@ -207,11 +207,6 @@ struct LongRangeForce { void operator()(std::shared_ptr const &actor) const { actor->add_long_range_forces(m_particles); } -#ifdef CUDA - void operator()(std::shared_ptr const &actor) const { - actor->add_long_range_forces(m_particles); - } -#endif // CUDA void operator()(std::shared_ptr const &actor) const { actor->add_long_range_forces(m_particles); diff --git a/src/core/electrostatics/coulomb.hpp b/src/core/electrostatics/coulomb.hpp index 93eb9302706..437eafce750 100644 --- a/src/core/electrostatics/coulomb.hpp +++ b/src/core/electrostatics/coulomb.hpp @@ -32,7 +32,6 @@ #include "electrostatics/icc.hpp" #include "electrostatics/mmm1d.hpp" #include "electrostatics/p3m.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "electrostatics/reaction_field.hpp" #include "electrostatics/scafacos.hpp" @@ -49,9 +48,6 @@ using ElectrostaticsActor = std::variant, #ifdef P3M std::shared_ptr, -#ifdef CUDA - std::shared_ptr, -#endif // CUDA std::shared_ptr, #endif // P3M std::shared_ptr, diff --git a/src/core/electrostatics/elc.cpp b/src/core/electrostatics/elc.cpp index 8ef7d5366af..ebb96405f7c 100644 --- a/src/core/electrostatics/elc.cpp +++ b/src/core/electrostatics/elc.cpp @@ -27,7 +27,6 @@ #include "electrostatics/coulomb.hpp" #include "electrostatics/p3m.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "BoxGeometry.hpp" #include "Particle.hpp" @@ -1111,23 +1110,20 @@ static void p3m_assign_image_charge(elc_data const &elc, CoulombP3M &p3m, double q, Utils::Vector3d const &pos) { if (pos[2] < elc.space_layer) { auto const q_eff = elc.delta_mid_bot * q; - p3m.assign_charge(q_eff, {pos[0], pos[1], -pos[2]}); + p3m.assign_charge(q_eff, {pos[0], pos[1], -pos[2]}, true); } if (pos[2] > (elc.box_h - elc.space_layer)) { auto const q_eff = elc.delta_mid_top * q; - p3m.assign_charge(q_eff, {pos[0], pos[1], 2. * elc.box_h - pos[2]}); + p3m.assign_charge(q_eff, {pos[0], pos[1], 2. 
* elc.box_h - pos[2]}, true); } } template void charge_assign(elc_data const &elc, CoulombP3M &solver, combined_ranges const &p_q_pos_range) { - if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::IMAGE) { - solver.p3m.inter_weights.reset(solver.p3m.params.cao); - } - /* prepare local FFT mesh */ - for (int i = 0; i < solver.p3m.local_mesh.size; i++) - solver.p3m.mesh.rs_scalar[i] = 0.; + + solver.assign_charge_prepare_elc(protocol == ChargeProtocol::BOTH or + protocol == ChargeProtocol::IMAGE); for (auto zipped : p_q_pos_range) { auto const p_q = boost::get<0>(zipped); @@ -1135,7 +1131,7 @@ void charge_assign(elc_data const &elc, CoulombP3M &solver, if (p_q != 0.) { if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::REAL) { - solver.assign_charge(p_q, p_pos, solver.p3m.inter_weights); + solver.assign_charge(p_q, p_pos); } if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::IMAGE) { @@ -1149,7 +1145,9 @@ template void modify_p3m_sums(elc_data const &elc, CoulombP3M &solver, combined_range const &p_q_pos_range) { - Utils::Vector3d node_sums{}; + auto local_n = 0; + auto local_q2 = 0.0; + auto local_q = 0.0; for (auto zipped : p_q_pos_range) { auto const p_q = boost::get<0>(zipped); auto const &p_pos = boost::get<1>(zipped); @@ -1158,33 +1156,35 @@ void modify_p3m_sums(elc_data const &elc, CoulombP3M &solver, if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::REAL) { - node_sums[0] += 1.; - node_sums[1] += Utils::sqr(p_q); - node_sums[2] += p_q; + local_n++; + local_q2 += Utils::sqr(p_q); + local_q += p_q; } if (protocol == ChargeProtocol::BOTH or protocol == ChargeProtocol::IMAGE) { if (p_z < elc.space_layer) { - node_sums[0] += 1.; - node_sums[1] += Utils::sqr(elc.delta_mid_bot * p_q); - node_sums[2] += elc.delta_mid_bot * p_q; + local_n++; + local_q2 += Utils::sqr(elc.delta_mid_bot * p_q); + local_q += elc.delta_mid_bot * p_q; } if (p_z > (elc.box_h - elc.space_layer)) { - node_sums[0] += 1.; - node_sums[1] += Utils::sqr(elc.delta_mid_top * p_q); - node_sums[2] += elc.delta_mid_top * p_q; + local_n++; + local_q2 += Utils::sqr(elc.delta_mid_top * p_q); + local_q += elc.delta_mid_top * p_q; } } } } - auto const tot_sums = - boost::mpi::all_reduce(comm_cart, node_sums, std::plus<>()); - solver.p3m.sum_qpart = static_cast(tot_sums[0] + 0.1); - solver.p3m.sum_q2 = tot_sums[1]; - solver.p3m.square_sum_q = Utils::sqr(tot_sums[2]); + auto global_n = 0; + auto global_q2 = 0.; + auto global_q = 0.; + boost::mpi::all_reduce(comm_cart, local_n, global_n, std::plus<>()); + boost::mpi::all_reduce(comm_cart, local_q2, global_q2, std::plus<>()); + boost::mpi::all_reduce(comm_cart, local_q, global_q, std::plus<>()); + solver.count_charged_particles_elc(global_n, global_q2, Utils::sqr(global_q)); } double ElectrostaticLayerCorrection::long_range_energy( diff --git a/src/core/electrostatics/elc.hpp b/src/core/electrostatics/elc.hpp index b6e8c606114..2a7fe034064 100644 --- a/src/core/electrostatics/elc.hpp +++ b/src/core/electrostatics/elc.hpp @@ -37,13 +37,13 @@ #include "actor/traits.hpp" #include "electrostatics/p3m.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "BoxGeometry.hpp" #include "Particle.hpp" #include "ParticleRange.hpp" #include +#include #include #include @@ -166,11 +166,7 @@ struct elc_data { struct ElectrostaticLayerCorrection : public Coulomb::Actor { - using BaseSolver = std::variant< -#ifdef CUDA - std::shared_ptr, -#endif // CUDA - std::shared_ptr>; + using BaseSolver = std::variant>; elc_data elc; BoxGeometry 
*m_box_geo; diff --git a/src/core/electrostatics/icc.cpp b/src/core/electrostatics/icc.cpp index 83aa02ea25e..8d138579dbd 100644 --- a/src/core/electrostatics/icc.cpp +++ b/src/core/electrostatics/icc.cpp @@ -40,6 +40,8 @@ #include "communication.hpp" #include "electrostatics/coulomb.hpp" #include "electrostatics/coulomb_inline.hpp" +#include "electrostatics/p3m.hpp" +#include "electrostatics/p3m.impl.hpp" #include "errorhandling.hpp" #include "integrators/Propagation.hpp" #include "system/System.hpp" @@ -248,8 +250,11 @@ struct SanityChecksICC { template void operator()(std::shared_ptr const &) const {} #ifdef P3M #ifdef CUDA - [[noreturn]] void operator()(std::shared_ptr const &) const { - throw std::runtime_error("ICC does not work with P3MGPU"); + void operator()(std::shared_ptr const &p) const { + if (dynamic_cast *>(p.get()) != nullptr or + dynamic_cast *>(p.get()) != nullptr) { + throw std::runtime_error("ICC does not work with P3MGPU"); + } } #endif // CUDA void diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp index 5e604a4fd51..379bd66320a 100644 --- a/src/core/electrostatics/p3m.cpp +++ b/src/core/electrostatics/p3m.cpp @@ -24,14 +24,19 @@ * The corresponding header file is @ref p3m.hpp. */ -#include "electrostatics/p3m.hpp" +#include "config/config.hpp" #ifdef P3M +#include "electrostatics/p3m.hpp" +#include "electrostatics/p3m.impl.hpp" + #include "electrostatics/coulomb.hpp" #include "electrostatics/elc.hpp" -#include "electrostatics/p3m_gpu.hpp" +#ifdef CUDA +#include "electrostatics/p3m_gpu_cuda.cuh" #include "electrostatics/p3m_gpu_error.hpp" +#endif // CUDA #include "fft/fft.hpp" #include "p3m/TuningAlgorithm.hpp" @@ -53,6 +58,7 @@ #include "errorhandling.hpp" #include "integrators/Propagation.hpp" #include "npt.hpp" +#include "system/GpuParticleData.hpp" #include "system/System.hpp" #include "tuning.hpp" @@ -84,7 +90,8 @@ #include #include -void CoulombP3M::count_charged_particles() { +template +void CoulombP3MImpl::count_charged_particles() { auto local_n = 0; auto local_q2 = 0.0; auto local_q = 0.0; @@ -112,21 +119,23 @@ void CoulombP3M::count_charged_particles() { * different convention for the prefactors, which is described in * @cite deserno98a @cite deserno98b. */ -void CoulombP3M::calc_influence_function_force() { +template +void CoulombP3MImpl::calc_influence_function_force() { auto const [KX, KY, KZ] = p3m.fft->get_permutations(); - p3m.g_force = - grid_influence_function<1>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, - KY, KZ, get_system().box_geo->length_inv()); + p3m.g_force = grid_influence_function( + p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ, + get_system().box_geo->length_inv()); } /** Calculate the influence function optimized for the energy and the * self energy correction. */ -void CoulombP3M::calc_influence_function_energy() { +template +void CoulombP3MImpl::calc_influence_function_energy() { auto const [KX, KY, KZ] = p3m.fft->get_permutations(); - p3m.g_energy = - grid_influence_function<0>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, - KY, KZ, get_system().box_geo->length_inv()); + p3m.g_energy = grid_influence_function( + p3m.params, p3m.mesh.start, p3m.mesh.stop, KX, KY, KZ, + get_system().box_geo->length_inv()); } /** Aliasing sum used by @ref p3m_k_space_error. 
*/ @@ -229,7 +238,8 @@ static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh, (box_l[1] * box_l[2]); } -void CoulombP3M::init() { +template +void CoulombP3MImpl::init_cpu_kernels() { assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1)); assert(p3m.params.cao >= 1 and p3m.params.cao <= 7); assert(p3m.params.alpha > 0.); @@ -252,6 +262,7 @@ void CoulombP3M::init() { assert(p3m.fft); p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo, skin, elc_layer); + p3m.fft->init_halo(); p3m.fft->init_fft(); p3m.calc_differential_operator(); @@ -261,48 +272,33 @@ void CoulombP3M::init() { count_charged_particles(); } -CoulombP3M::CoulombP3M(P3MParameters &¶meters, double prefactor, - int tune_timings, bool tune_verbose, - bool check_complex_residuals) - : p3m{std::move(parameters)}, tune_timings{tune_timings}, - tune_verbose{tune_verbose}, - check_complex_residuals{check_complex_residuals} { - - if (tune_timings <= 0) { - throw std::domain_error("Parameter 'timings' must be > 0"); - } - m_is_tuned = !p3m.params.tuning; - p3m.params.tuning = false; - set_prefactor(prefactor); -} - namespace { template struct AssignCharge { - void operator()(decltype(CoulombP3M::p3m) &p3m, double q, - Utils::Vector3d const &real_pos, + void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos, + InterpolationWeights const &w) { + using value_type = + typename std::remove_reference_t::value_type; + p3m_interpolate(p3m.local_mesh, w, [q, &p3m](int ind, double w) { + p3m.mesh.rs_scalar[ind] += value_type(w * q); + }); + } + + void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos, p3m_interpolation_cache &inter_weights) { auto const w = p3m_calculate_interpolation_weights( real_pos, p3m.params.ai, p3m.local_mesh); - inter_weights.store(w); - - p3m_interpolate(p3m.local_mesh, w, [q, &p3m](int ind, double w) { - p3m.mesh.rs_scalar[ind] += w * q; - }); + this->operator()(p3m, q, real_pos, w); } - void operator()(decltype(CoulombP3M::p3m) &p3m, double q, - Utils::Vector3d const &real_pos) { - p3m_interpolate( - p3m.local_mesh, - p3m_calculate_interpolation_weights(real_pos, p3m.params.ai, - p3m.local_mesh), - [q, &p3m](int ind, double w) { p3m.mesh.rs_scalar[ind] += w * q; }); + void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos) { + auto const w = p3m_calculate_interpolation_weights( + real_pos, p3m.params.ai, p3m.local_mesh); + this->operator()(p3m, q, real_pos, w); } template - void operator()(decltype(CoulombP3M::p3m) &p3m, - combined_ranges const &p_q_pos_range) { + void operator()(auto &p3m, combined_ranges const &p_q_pos_range) { for (auto zipped : p_q_pos_range) { auto const p_q = boost::get<0>(zipped); auto const &p_pos = boost::get<1>(zipped); @@ -314,12 +310,14 @@ template struct AssignCharge { }; } // namespace -void CoulombP3M::charge_assign(ParticleRange const &particles) { +template +void CoulombP3MImpl::charge_assign( + ParticleRange const &particles) { p3m.inter_weights.reset(p3m.params.cao); /* prepare local FFT mesh */ for (int i = 0; i < p3m.local_mesh.size; i++) - p3m.mesh.rs_scalar[i] = 0.0; + p3m.mesh.rs_scalar[i] = FloatType(0); auto p_q_range = ParticlePropertyRange::charge_range(particles); auto p_pos_range = ParticlePropertyRange::pos_range(particles); @@ -328,20 +326,21 @@ void CoulombP3M::charge_assign(ParticleRange const &particles) { p3m.params.cao, p3m, boost::combine(p_q_range, p_pos_range)); } -void CoulombP3M::assign_charge(double q, Utils::Vector3d const &real_pos, - p3m_interpolation_cache &inter_weights) { - 
Utils::integral_parameter(p3m.params.cao, p3m, q, - real_pos, inter_weights); -} - -void CoulombP3M::assign_charge(double q, Utils::Vector3d const &real_pos) { - Utils::integral_parameter(p3m.params.cao, p3m, q, - real_pos); +template +void CoulombP3MImpl::assign_charge( + double q, Utils::Vector3d const &real_pos, bool skip_cache) { + if (skip_cache) { + Utils::integral_parameter(p3m.params.cao, p3m, q, + real_pos); + } else { + Utils::integral_parameter( + p3m.params.cao, p3m, q, real_pos, p3m.inter_weights); + } } template struct AssignForces { template - void operator()(decltype(CoulombP3M::p3m) &p3m, double force_prefac, + void operator()(auto &p3m, double force_prefac, combined_ranges const &p_q_force_range) const { assert(cao == p3m.inter_weights.cao()); @@ -354,13 +353,13 @@ template struct AssignForces { auto &p_force = boost::get<1>(zipped); if (p_q != 0.0) { auto const pref = p_q * force_prefac; - auto const w = p3m.inter_weights.load(p_index); + auto const w = p3m.inter_weights.template load(p_index); Utils::Vector3d force{}; p3m_interpolate(p3m.local_mesh, w, [&force, &p3m](int ind, double w) { - force += w * Utils::Vector3d{p3m.mesh.rs_fields[0u][ind], - p3m.mesh.rs_fields[1u][ind], - p3m.mesh.rs_fields[2u][ind]}; + force[0u] += w * double(p3m.mesh.rs_fields[0u][ind]); + force[1u] += w * double(p3m.mesh.rs_fields[1u][ind]); + force[2u] += w * double(p3m.mesh.rs_fields[2u][ind]); }); p_force -= pref * force; @@ -388,14 +387,16 @@ static auto calc_dipole_moment(boost::mpi::communicator const &comm, * in @cite essmann95a. The part \f$\Pi_{\textrm{corr}, \alpha, \beta}\f$ * eq. (2.8) is not present here since M is the empty set in our simulations. */ -Utils::Vector9d -CoulombP3M::long_range_pressure(ParticleRange const &particles) { +template +Utils::Vector9d CoulombP3MImpl::long_range_pressure( + ParticleRange const &particles) { auto const &box_geo = *get_system().box_geo; Utils::Vector9d node_k_space_pressure_tensor{}; if (p3m.sum_q2 > 0.) { charge_assign(particles); - p3m.fft->perform_fwd_fft(); + p3m.fft->perform_scalar_halo_gather(); + p3m.fft->perform_scalar_fwd_fft(); auto constexpr mesh_start = Utils::Vector3i::broadcast(0); auto const &mesh_stop = p3m.mesh.size; @@ -416,9 +417,9 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) { if (norm_sq != 0.) { auto const node_k_space_energy = - p3m.g_energy[index] * - (Utils::sqr(p3m.mesh.rs_scalar[2u * index + 0u]) + - Utils::sqr(p3m.mesh.rs_scalar[2u * index + 1u])); + double(p3m.g_energy[index] * + (Utils::sqr(p3m.mesh.rs_scalar[2u * index + 0u]) + + Utils::sqr(p3m.mesh.rs_scalar[2u * index + 1u]))); auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq); auto const pref = node_k_space_energy * vterm; node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */ @@ -443,8 +444,9 @@ CoulombP3M::long_range_pressure(ParticleRange const &particles) { return node_k_space_pressure_tensor * prefactor / (2. 
* box_geo.volume()); } -double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles) { +template +double CoulombP3MImpl::long_range_kernel( + bool force_flag, bool energy_flag, ParticleRange const &particles) { auto const &system = get_system(); auto const &box_geo = *system.box_geo; #ifdef NPT @@ -459,7 +461,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, system.coulomb.impl->solver)) { charge_assign(particles); } - p3m.fft->perform_fwd_fft(); + p3m.fft->perform_scalar_halo_gather(); + p3m.fft->perform_scalar_fwd_fft(); } auto p_q_range = ParticlePropertyRange::charge_range(particles); @@ -485,7 +488,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, auto constexpr mesh_start = Utils::Vector3i::broadcast(0); auto const &mesh_stop = p3m.mesh.size; auto const &offset = p3m.mesh.start; - auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv(); + auto const wavevector = Utils::Vector3((2. * std::numbers::pi) * + box_geo.length_inv()); auto indices = Utils::Vector3i{}; auto index = std::size_t(0u); @@ -493,16 +497,16 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, // Eq. (3.49) @cite deserno00b for_each_3d(mesh_start, mesh_stop, indices, [&]() { auto const rho_hat = - std::complex(p3m.mesh.rs_scalar[2u * index + 0u], - p3m.mesh.rs_scalar[2u * index + 1u]); + std::complex(p3m.mesh.rs_scalar[2u * index + 0u], + p3m.mesh.rs_scalar[2u * index + 1u]); auto const phi_hat = p3m.g_force[index] * rho_hat; for (int d = 0; d < 3; d++) { /* direction in r-space: */ int d_rs = (d + p3m.mesh.ks_pnum) % 3; /* directions */ - auto const k = - p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs]; + auto const k = FloatType(p3m.d_op[d_rs][indices[d] + offset[d]]) * + wavevector[d_rs]; /* i*k*(Re+i*Im) = - Im*k + i*Re*k (i=sqrt(-1)) */ p3m.mesh.rs_fields[d_rs][2u * index + 0u] = -k * phi_hat.imag(); @@ -515,7 +519,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, auto const check_residuals = not p3m.params.tuning and check_complex_residuals; p3m.fft->check_complex_residuals = check_residuals; - p3m.fft->perform_field_back_fft(); + p3m.fft->perform_vector_back_fft(); + p3m.fft->perform_vector_halo_spread(); p3m.fft->check_complex_residuals = false; auto const force_prefac = prefactor / volume; @@ -543,8 +548,8 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, // Use the energy optimized influence function for energy! // Eq. (3.40) @cite deserno00b node_energy += - p3m.g_energy[i] * (Utils::sqr(p3m.mesh.rs_scalar[2 * i]) + - Utils::sqr(p3m.mesh.rs_scalar[2 * i + 1])); + double(p3m.g_energy[i] * (Utils::sqr(p3m.mesh.rs_scalar[2 * i + 0]) + + Utils::sqr(p3m.mesh.rs_scalar[2 * i + 1]))); } node_energy /= 2. 
* volume; @@ -579,8 +584,9 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag, return 0.; } +template class CoulombTuningAlgorithm : public TuningAlgorithm { - decltype(CoulombP3M::p3m) &p3m; + p3m_data_struct_coulomb &p3m; double m_mesh_density_min = -1., m_mesh_density_max = -1.; // indicates if mesh should be tuned bool m_tune_mesh = false; @@ -589,7 +595,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { P3MParameters &get_params() override { return p3m.params; } public: - CoulombTuningAlgorithm(System::System &system, decltype(p3m) &input_p3m, + CoulombTuningAlgorithm(System::System &system, auto &input_p3m, double prefactor, int timings) : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m} {} @@ -598,8 +604,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { void setup_logger(bool verbose) override { auto const &box_geo = *m_system.box_geo; #ifdef CUDA - auto const &solver = m_system.coulomb.impl->solver; - auto const on_gpu = has_actor_of_type(solver); + auto const on_gpu = Architecture == Arch::GPU; #else auto const on_gpu = false; #endif @@ -646,8 +651,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { rs_err = p3m_real_space_error(m_prefactor, r_cut_iL, p3m.sum_qpart, p3m.sum_q2, alpha_L, box_geo.length()); #ifdef CUDA - auto const &solver = m_system.coulomb.impl->solver; - if (has_actor_of_type(solver)) { + if constexpr (Architecture == Arch::GPU) { if (this_node == 0) { ks_err = p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart, @@ -743,7 +747,8 @@ class CoulombTuningAlgorithm : public TuningAlgorithm { } }; -void CoulombP3M::tune() { +template +void CoulombP3MImpl::tune() { auto &system = get_system(); auto const &box_geo = *system.box_geo; if (p3m.params.alpha_L == 0. and p3m.params.alpha != 0.) { @@ -759,7 +764,8 @@ void CoulombP3M::tune() { "CoulombP3M: no charged particles in the system"); } try { - CoulombTuningAlgorithm parameters(system, p3m, prefactor, tune_timings); + CoulombTuningAlgorithm parameters( + system, p3m, prefactor, tune_timings); parameters.setup_logger(tune_verbose); // parameter ranges parameters.determine_mesh_limits(); @@ -850,4 +856,59 @@ void CoulombP3M::scaleby_box_l() { calc_influence_function_energy(); } +#ifdef CUDA +template +void CoulombP3MImpl::add_long_range_forces_gpu( + ParticleRange const &particles) { + if constexpr (Architecture == Arch::GPU) { +#ifdef NPT + if (get_system().propagation->integ_switch == INTEG_METHOD_NPT_ISO) { + auto const energy = long_range_energy(particles); + npt_add_virial_contribution(energy); + } +#else + static_cast(particles); +#endif + if (this_node == 0) { + auto &gpu = get_system().gpu; + p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor, + gpu.n_particles()); + } + } +} + +/* Initialize the CPU kernels. + * This operation is time-consuming and sets up data members + * that are only relevant for ELC force corrections, since the + * GPU implementation uses CPU kernels to compute energies. 
+ */ +template +void CoulombP3MImpl::init_gpu_kernels() { + if constexpr (Architecture == Arch::GPU) { + auto &system = get_system(); + if (has_actor_of_type( + system.coulomb.impl->solver)) { + init_cpu_kernels(); + } + p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh, p3m.params.alpha, + system.box_geo->length(), system.gpu.n_particles()); + } +} + +template +void CoulombP3MImpl::request_gpu() const { + if constexpr (Architecture == Arch::GPU) { + auto &gpu_particle_data = get_system().gpu; + gpu_particle_data.enable_property(GpuParticleData::prop::force); + gpu_particle_data.enable_property(GpuParticleData::prop::q); + gpu_particle_data.enable_property(GpuParticleData::prop::pos); + } +} +#endif // CUDA + +template struct CoulombP3MImpl; +template struct CoulombP3MImpl; +template struct CoulombP3MImpl; +template struct CoulombP3MImpl; + #endif // P3M diff --git a/src/core/electrostatics/p3m.hpp b/src/core/electrostatics/p3m.hpp index de7bbdee9fa..b33b8dad4cb 100644 --- a/src/core/electrostatics/p3m.hpp +++ b/src/core/electrostatics/p3m.hpp @@ -42,57 +42,50 @@ #include "p3m/common.hpp" #include "p3m/data_struct.hpp" -#include "p3m/interpolation.hpp" -#include "p3m/send_mesh.hpp" #include "ParticleRange.hpp" #include #include -#include #include #include -#include +#include /** @brief P3M solver. */ struct CoulombP3M : public Coulomb::Actor { - struct p3m_data_struct_impl : public p3m_data_struct { - explicit p3m_data_struct_impl(P3MParameters &¶meters) - : p3m_data_struct{std::move(parameters)} {} - - /** number of charged particles (only on head node). */ - int sum_qpart = 0; - /** Sum of square of charges (only on head node). */ - double sum_q2 = 0.; - /** square of sum of charges (only on head node). */ - double square_sum_q = 0.; - - p3m_interpolation_cache inter_weights; - }; - - /** P3M parameters. */ - p3m_data_struct_impl p3m; + /** @brief Coulomb P3M parameters. */ + p3m_data_struct &p3m; int tune_timings; bool tune_verbose; bool check_complex_residuals; -private: +protected: bool m_is_tuned; public: - CoulombP3M(P3MParameters &¶meters, double prefactor, int tune_timings, - bool tune_verbose, bool check_complex_residuals); + CoulombP3M(p3m_data_struct &p3m_data, double prefactor, int tune_timings, + bool tune_verbose, bool check_complex_residuals) + : p3m{p3m_data}, tune_timings{tune_timings}, tune_verbose{tune_verbose}, + check_complex_residuals{check_complex_residuals} { + + if (tune_timings <= 0) { + throw std::domain_error("Parameter 'timings' must be > 0"); + } + m_is_tuned = not p3m.params.tuning; + p3m.params.tuning = false; + set_prefactor(prefactor); + } + + virtual ~CoulombP3M() = default; bool is_tuned() const { return m_is_tuned; } /** @brief Recalculate all derived parameters. */ - void init(); - void on_activation() { - sanity_checks(); - tune(); - } + virtual void init() = 0; + virtual void on_activation() = 0; + /** @brief Recalculate all box-length-dependent parameters. */ void on_boxl_change() { scaleby_box_l(); } void on_node_grid_change() const { sanity_checks_node_grid(); } @@ -113,7 +106,8 @@ struct CoulombP3M : public Coulomb::Actor { * Count the number of charged particles and calculate * the sum of the squared charges. */ - void count_charged_particles(); + virtual void count_charged_particles() = 0; + virtual void count_charged_particles_elc(int, double, double) = 0; /** * @brief Tune P3M parameters to desired accuracy. 
@@ -146,24 +140,24 @@ struct CoulombP3M : public Coulomb::Actor { * The function is based on routines of the program HE_Q.cpp written by M. * Deserno. */ - void tune(); + virtual void tune() = 0; /** Assign the physical charges using the tabulated charge assignment * function. */ - void charge_assign(ParticleRange const &particles); + virtual void charge_assign(ParticleRange const &particles) = 0; /** * @brief Assign a single charge into the current charge grid. * * @param[in] q Particle charge * @param[in] real_pos Particle position in real space - * @param[out] inter_weights Cached interpolation weights to be used. + * @param[in] skip_cache Skip interpolation weights caching. */ - void assign_charge(double q, Utils::Vector3d const &real_pos, - p3m_interpolation_cache &inter_weights); - /** @overload */ - void assign_charge(double q, Utils::Vector3d const &real_pos); + virtual void assign_charge(double q, Utils::Vector3d const &real_pos, + bool skip_cache = false) = 0; + + virtual void assign_charge_prepare_elc(bool images) = 0; /** Calculate real-space contribution of p3m Coulomb pair forces. */ Utils::Vector3d pair_force(double q1q2, Utils::Vector3d const &d, @@ -203,25 +197,17 @@ struct CoulombP3M : public Coulomb::Actor { } /** Compute the k-space part of the pressure tensor */ - Utils::Vector9d long_range_pressure(ParticleRange const &particles); + virtual Utils::Vector9d long_range_pressure(ParticleRange const &) = 0; /** Compute the k-space part of energies. */ - double long_range_energy(ParticleRange const &particles) { - return long_range_kernel(false, true, particles); - } + virtual double long_range_energy(ParticleRange const &) = 0; /** Compute the k-space part of forces. */ - void add_long_range_forces(ParticleRange const &particles) { - long_range_kernel(true, false, particles); - } - - /** Compute the k-space part of forces and energies. */ - double long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles); + virtual void add_long_range_forces(ParticleRange const &) = 0; -private: - void calc_influence_function_force(); - void calc_influence_function_energy(); +protected: + virtual void calc_influence_function_force() = 0; + virtual void calc_influence_function_energy() = 0; /** Checks for correctness of the k-space cutoff. */ void sanity_checks_boxl() const; @@ -229,7 +215,7 @@ struct CoulombP3M : public Coulomb::Actor { void sanity_checks_periodicity() const; void sanity_checks_cell_structure() const; - void scaleby_box_l(); + virtual void scaleby_box_l(); }; #endif // P3M diff --git a/src/core/electrostatics/p3m.impl.hpp b/src/core/electrostatics/p3m.impl.hpp new file mode 100644 index 00000000000..e8faab77d25 --- /dev/null +++ b/src/core/electrostatics/p3m.impl.hpp @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010-2024 The ESPResSo project + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 + * Max-Planck-Institute for Polymer Research, Theory Group + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** @file + * P3M algorithm for long-range Coulomb interaction. + * + * We use a P3M (Particle-Particle Particle-Mesh) method based on the + * Ewald summation. Details of the used method can be found in + * @cite hockney88a and @cite deserno98a @cite deserno98b. + * + * Further reading: @cite ewald21a, @cite hockney88a, @cite deserno98a, + * @cite deserno98b, @cite deserno00e, @cite deserno00b, @cite cerda08d. + * + * Implementation in p3m.cpp. + */ + +#pragma once + +#include "config/config.hpp" + +#ifdef P3M + +#include "electrostatics/p3m.hpp" + +#include "p3m/common.hpp" +#include "p3m/data_struct.hpp" +#include "p3m/interpolation.hpp" + +#include "ParticleRange.hpp" + +#include + +#include +#include + +template +struct p3m_data_struct_coulomb : public p3m_data_struct_fft { + using p3m_data_struct_fft::p3m_data_struct_fft; + + /** number of charged particles (only on head node). */ + int sum_qpart = 0; + /** Sum of square of charges (only on head node). */ + double sum_q2 = 0.; + /** square of sum of charges (only on head node). */ + double square_sum_q = 0.; + + p3m_interpolation_cache inter_weights; +}; + +#ifdef CUDA +struct P3MGpuParams; +#endif + +template +struct CoulombP3MImpl : public CoulombP3M { + ~CoulombP3MImpl() override = default; + + /** @brief Coulomb P3M meshes and FFT algorithm. */ + std::unique_ptr> p3m_impl; + // this member overshadows its own base class pointer in the parent class + p3m_data_struct_coulomb &p3m; + + template + CoulombP3MImpl( + std::unique_ptr> &&p3m_handle, + Args &&...args) + : CoulombP3M(*p3m_handle, std::forward(args)...), + p3m_impl{std::move(p3m_handle)}, p3m{*p3m_impl} {} + + void init() override { + if constexpr (Architecture == Arch::CPU) { + init_cpu_kernels(); + } +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + init_gpu_kernels(); + } +#endif + } + void tune() override; + void count_charged_particles() override; + void count_charged_particles_elc(int n, double sum_q2, + double square_sum_q) override { + p3m.sum_qpart = n; + p3m.sum_q2 = sum_q2; + p3m.square_sum_q = square_sum_q; + } + + void on_activation() override { +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + request_gpu(); + } +#endif + sanity_checks(); + tune(); +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + if (is_tuned()) { + init_cpu_kernels(); + } + } +#endif + } + + double long_range_energy(ParticleRange const &particles) override { + return long_range_kernel(false, true, particles); + } + + void add_long_range_forces(ParticleRange const &particles) override { + if constexpr (Architecture == Arch::CPU) { + long_range_kernel(true, false, particles); + } +#ifdef CUDA + if constexpr (Architecture == Arch::GPU) { + add_long_range_forces_gpu(particles); + } +#endif + } + + Utils::Vector9d long_range_pressure(ParticleRange const &particles) override; + + void charge_assign(ParticleRange const &particles) override; + void assign_charge(double q, Utils::Vector3d const &real_pos, + bool skip_cache = false) override; + void assign_charge_prepare_elc(bool images) override { + if (images) { + p3m.inter_weights.reset(p3m.params.cao); + } + /* prepare local FFT mesh */ + for (int i = 0; i < p3m.local_mesh.size; i++) { + p3m.mesh.rs_scalar[i] = FloatType(0); + } + } + +protected: + /** Compute the k-space part of forces and energies. 
*/ + double long_range_kernel(bool force_flag, bool energy_flag, + ParticleRange const &particles); + void calc_influence_function_force() override; + void calc_influence_function_energy() override; + void init_cpu_kernels(); +#ifdef CUDA + void init_gpu_kernels(); + void add_long_range_forces_gpu(ParticleRange const &particles); + std::shared_ptr m_gpu_data = nullptr; + void request_gpu() const; +#endif +}; + +template class FFTBackendImpl, class... Args> +std::shared_ptr new_p3m_handle(P3MParameters &&p3m, + Args &&...args) { + auto obj = std::make_shared>( + std::make_unique>(std::move(p3m)), + std::forward(args)...); + obj->p3m.template make_fft_instance>(false); + return obj; +} + +#endif // P3M diff --git a/src/core/electrostatics/p3m_gpu.cpp b/src/core/electrostatics/p3m_gpu.cpp deleted file mode 100644 index 61b5aaadbe3..00000000000 --- a/src/core/electrostatics/p3m_gpu.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include "config/config.hpp" - -#ifdef P3M -#ifdef CUDA - -#include "electrostatics/p3m_gpu.hpp" -#include "electrostatics/p3m_gpu_cuda.cuh" - -#include "actor/visitors.hpp" -#include "electrostatics/coulomb.hpp" - -#include "ParticleRange.hpp" -#include "PropagationMode.hpp" -#include "integrators/Propagation.hpp" -#include "npt.hpp" -#include "system/GpuParticleData.hpp" -#include "system/System.hpp" - -#include "communication.hpp" - -void CoulombP3MGPU::add_long_range_forces(ParticleRange const &particles) { -#ifdef NPT - if (get_system().propagation->integ_switch == INTEG_METHOD_NPT_ISO) { - auto const energy = long_range_energy(particles); - npt_add_virial_contribution(energy); - } -#else - static_cast(particles); -#endif - if (this_node == 0) { - auto &gpu = get_system().gpu; - p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor, gpu.n_particles()); - } -} - -void CoulombP3MGPU::init() { - auto &system = get_system(); - if (has_actor_of_type( - system.coulomb.impl->solver)) { - init_cpu_kernels(); - } - p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh, p3m.params.alpha, - system.box_geo->length(), system.gpu.n_particles()); -} - -void CoulombP3MGPU::init_cpu_kernels() { CoulombP3M::init(); } - -void CoulombP3MGPU::request_gpu() const { - auto &gpu_particle_data = get_system().gpu; - gpu_particle_data.enable_property(GpuParticleData::prop::force); - gpu_particle_data.enable_property(GpuParticleData::prop::q); - gpu_particle_data.enable_property(GpuParticleData::prop::pos); -} - -#endif // CUDA -#endif // P3M diff --git a/src/core/electrostatics/p3m_gpu.hpp b/src/core/electrostatics/p3m_gpu.hpp deleted file mode 100644 index 35c68170a47..00000000000 --- a/src/core/electrostatics/p3m_gpu.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. 
- * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -/** - * @file - * P3M electrostatics on GPU. - */ - -#include "config/config.hpp" - -#ifdef P3M -#ifdef CUDA - -#include "electrostatics/p3m.hpp" - -#include "ParticleRange.hpp" - -#include - -struct P3MGpuParams; - -struct CoulombP3MGPU : public CoulombP3M { - using CoulombP3M::CoulombP3M; - - void init(); - void on_activation() { - request_gpu(); - CoulombP3M::on_activation(); - if (is_tuned()) { - init_cpu_kernels(); - } - } - - void add_long_range_forces(ParticleRange const &); - void request_gpu() const; - -private: - /** - * @brief Initialize the CPU kernels. - * This operation is time-consuming and sets up data members - * that are only relevant for ELC force corrections. - */ - void init_cpu_kernels(); - std::shared_ptr m_gpu_data = nullptr; -}; - -#endif // CUDA -#endif // P3M diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index c734ba0cccd..05a607da88b 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -248,19 +248,6 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) { } } -#ifdef P3M_GPU_REAL_DOUBLE -__device__ double atomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} -#endif - namespace { __device__ inline auto linear_index_r(P3MGpuData const &p, int i, int j, int k) { @@ -376,14 +363,14 @@ __global__ void assign_charge_kernel(P3MGpuData const params, auto const c = weights[3u * offset + 3u * cao_id_x] * weights[3u * offset + 3u * threadIdx.y + 1u] * weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id]; - atomicAdd(&(charge_mesh[index]), c); + atomicAdd(&(charge_mesh[index]), REAL_TYPE(c)); } else { auto const c = Utils::bspline(static_cast(cao_id_x), m_pos[0]) * part_q[id] * Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * Utils::bspline(static_cast(threadIdx.z), m_pos[2]); - atomicAdd(&(charge_mesh[index]), c); + atomicAdd(&(charge_mesh[index]), REAL_TYPE(c)); } } @@ -473,7 +460,7 @@ __global__ void assign_forces_kernel(P3MGpuData const params, extern __shared__ float weights[]; - REAL_TYPE c; + REAL_TYPE c = -prefactor * part_q[id]; if (shared) { auto const offset = static_cast(cao) * part_in_block; if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) { @@ -483,23 +470,23 @@ __global__ void assign_forces_kernel(P3MGpuData const params, __syncthreads(); - c = -prefactor * weights[3u * offset + 3u * cao_id_x] * - weights[3u * offset + 3u * threadIdx.y + 1u] * - weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id]; + c *= REAL_TYPE(weights[3u * offset + 3u * cao_id_x] * + weights[3u * offset + 3u * 
threadIdx.y + 1u] * + weights[3u * offset + 3u * threadIdx.z + 2u]); } else { - c = -prefactor * part_q[id] * - Utils::bspline(static_cast(cao_id_x), m_pos[0]) * - Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * - Utils::bspline(static_cast(threadIdx.z), m_pos[2]); + c *= + REAL_TYPE(Utils::bspline(static_cast(cao_id_x), m_pos[0]) * + Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * + Utils::bspline(static_cast(threadIdx.z), m_pos[2])); } const REAL_TYPE *force_mesh_x = (REAL_TYPE *)params.force_mesh_x; const REAL_TYPE *force_mesh_y = (REAL_TYPE *)params.force_mesh_y; const REAL_TYPE *force_mesh_z = (REAL_TYPE *)params.force_mesh_z; - atomicAdd(&(part_f[3u * id + 0u]), c * force_mesh_x[index]); - atomicAdd(&(part_f[3u * id + 1u]), c * force_mesh_y[index]); - atomicAdd(&(part_f[3u * id + 2u]), c * force_mesh_z[index]); + atomicAdd(&(part_f[3u * id + 0u]), float(c * force_mesh_x[index])); + atomicAdd(&(part_f[3u * id + 1u]), float(c * force_mesh_y[index])); + atomicAdd(&(part_f[3u * id + 2u]), float(c * force_mesh_z[index])); } void assign_forces(P3MGpuData const ¶ms, diff --git a/src/core/electrostatics/p3m_gpu_cuda.cuh b/src/core/electrostatics/p3m_gpu_cuda.cuh index da2a7cd8202..0c20a7ee25d 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cuh +++ b/src/core/electrostatics/p3m_gpu_cuda.cuh @@ -28,7 +28,7 @@ struct P3MGpuParams; -void p3m_gpu_init(std::shared_ptr &p3m_gpu_data_ptr, int cao, +void p3m_gpu_init(std::shared_ptr &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part); void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, diff --git a/src/core/fft/fft.cpp b/src/core/fft/fft.cpp index 29997d3c897..9dad7b9aa54 100644 --- a/src/core/fft/fft.cpp +++ b/src/core/fft/fft.cpp @@ -31,11 +31,12 @@ #include #include #include +#include #include +#include #include -#include #include #include @@ -48,7 +49,6 @@ #include #include -using Utils::get_linear_index; using Utils::permute_ifield; /** @name MPI tags for FFT communication */ @@ -59,8 +59,35 @@ using Utils::permute_ifield; #define REQ_FFT_BACK 302 /**@}*/ +template +static void fft_sendrecv(T const *const sendbuf, int scount, int dest, + T *const recvbuf, int rcount, int source, + boost::mpi::communicator const &comm, int tag) { + auto const type = boost::mpi::get_mpi_datatype(*sendbuf); + MPI_Sendrecv(reinterpret_cast(sendbuf), scount, type, dest, tag, + reinterpret_cast(recvbuf), rcount, type, source, tag, + comm, MPI_STATUS_IGNORE); +} + namespace fft { +template struct fftw { + using complex = fftw_complex; + static auto constexpr plan_many_dft = fftw_plan_many_dft; + static auto constexpr destroy_plan = fftw_destroy_plan; + static auto constexpr execute_dft = fftw_execute_dft; + static auto constexpr malloc = fftw_malloc; + static auto constexpr free = fftw_free; +}; +template <> struct fftw { + using complex = fftwf_complex; + static auto constexpr plan_many_dft = fftwf_plan_many_dft; + static auto constexpr destroy_plan = fftwf_destroy_plan; + static auto constexpr execute_dft = fftwf_execute_dft; + static auto constexpr malloc = fftwf_malloc; + static auto constexpr free = fftwf_free; +}; + /** This ugly function does the bookkeeping: which nodes have to * communicate to each other, when you change the node grid. * Changing the regular decomposition requires communication. This @@ -99,7 +126,7 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2, /* comm. 
group cell index */ int gi[3]; /* position of a node in a grid */ - int p1[3], p2[3]; + Utils::Vector3i p1, p2; /* node identity */ int n; /* comm.rank() position in the communication group. */ @@ -144,8 +171,8 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2, p2[1] = (gi[1] * s2[1]) + ((i / s2[0]) % s2[1]); p2[2] = (gi[2] * s2[2]) + (i / (s2[0] * s2[1])); - n = node_list1[get_linear_index(p1[0], p1[1], p1[2], grid1)]; - node_list2[get_linear_index(p2[0], p2[1], p2[2], grid2)] = n; + n = node_list1[Utils::get_linear_index(p1, grid1)]; + node_list2[Utils::get_linear_index(p2, grid2)] = n; pos[3 * n + 0] = p2[0]; pos[3 * n + 1] = p2[1]; @@ -275,7 +302,8 @@ int calc_send_block(const int *pos1, const int *grid1, const int *pos2, * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void pack_block_permute1(double const *const in, double *const out, +template +void pack_block_permute1(FloatType const *const in, FloatType *const out, const int *start, const int *size, const int *dim, int element) { @@ -323,7 +351,8 @@ void pack_block_permute1(double const *const in, double *const out, * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void pack_block_permute2(double const *const in, double *const out, +template +void pack_block_permute2(FloatType const *const in, FloatType *const out, const int *start, const int *size, const int *dim, int element) { @@ -359,19 +388,19 @@ void pack_block_permute2(double const *const in, double *const out, * \param in input mesh. * \param out output mesh. */ -void fft_data_struct::forw_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan, - double const *in, double *out) { +template +void fft_data_struct::forw_grid_comm( + boost::mpi::communicator const &comm, fft_forw_plan const &plan, + FloatType const *in, FloatType *out) { for (std::size_t i = 0ul; i < plan.group.size(); i++) { plan.pack_function(in, send_buf.data(), &(plan.send_block[6ul * i]), &(plan.send_block[6ul * i + 3ul]), plan.old_mesh, plan.element); if (plan.group[i] != comm.rank()) { - MPI_Sendrecv(send_buf.data(), plan.send_size[i], MPI_DOUBLE, - plan.group[i], REQ_FFT_FORW, recv_buf.data(), - plan.recv_size[i], MPI_DOUBLE, plan.group[i], REQ_FFT_FORW, - comm, MPI_STATUS_IGNORE); + fft_sendrecv(send_buf.data(), plan.send_size[i], plan.group[i], + recv_buf.data(), plan.recv_size[i], plan.group[i], comm, + REQ_FFT_FORW); } else { /* Self communication... */ std::swap(send_buf, recv_buf); } @@ -388,10 +417,12 @@ void fft_data_struct::forw_grid_comm(boost::mpi::communicator const &comm, * \param in input mesh. * \param out output mesh. */ -void fft_data_struct::back_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan_f, - fft_back_plan const &plan_b, - double const *in, double *out) { +template +void fft_data_struct::back_grid_comm( + boost::mpi::communicator const &comm, + fft_forw_plan const &plan_f, + fft_back_plan const &plan_b, FloatType const *in, + FloatType *out) { /* Back means: Use the send/receive stuff from the forward plan but replace the receive blocks by the send blocks and vice versa. 
Attention then also new_mesh and old_mesh are exchanged */ @@ -402,10 +433,9 @@ void fft_data_struct::back_grid_comm(boost::mpi::communicator const &comm, plan_f.element); if (plan_f.group[i] != comm.rank()) { /* send first, receive second */ - MPI_Sendrecv(send_buf.data(), plan_f.recv_size[i], MPI_DOUBLE, - plan_f.group[i], REQ_FFT_BACK, recv_buf.data(), - plan_f.send_size[i], MPI_DOUBLE, plan_f.group[i], - REQ_FFT_BACK, comm, MPI_STATUS_IGNORE); + fft_sendrecv(send_buf.data(), plan_f.recv_size[i], plan_f.group[i], + recv_buf.data(), plan_f.send_size[i], plan_f.group[i], comm, + REQ_FFT_BACK); } else { /* Self communication... */ std::swap(send_buf, recv_buf); } @@ -466,7 +496,7 @@ int map_3don2d_grid(int const g3d[3], int g2d[3]) { } /** Calculate most square 2D grid. */ -static void calc_2d_grid(int n, int grid[3]) { +inline void calc_2d_grid(int n, int grid[3]) { grid[0] = n; grid[1] = 1; grid[2] = 1; @@ -480,21 +510,20 @@ static void calc_2d_grid(int n, int grid[3]) { } } -int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, - Utils::Vector3i const &ca_mesh_dim, - int const *ca_mesh_margin, - Utils::Vector3i const &global_mesh_dim, - Utils::Vector3d const &global_mesh_off, - int &ks_pnum, Utils::Vector3i const &grid) { +template +int fft_data_struct::initialize_fft( + boost::mpi::communicator const &comm, Utils::Vector3i const &ca_mesh_dim, + int const *ca_mesh_margin, Utils::Vector3i const &global_mesh_dim, + Utils::Vector3d const &global_mesh_off, int &ks_pnum, + Utils::Vector3i const &grid) { int n_grid[4][3]; /* The four node grids. */ int my_pos[4][3]; /* The position of comm.rank() in the node grids. */ std::vector n_id[4]; /* linear node identity lists for the node grids. */ std::vector n_pos[4]; /* positions of nodes in the node grids. 
*/ - int const rank = comm.rank(); - int node_pos[3]; - MPI_Cart_coords(comm, rank, 3, node_pos); + auto const rank = comm.rank(); + auto const node_pos = Utils::Mpi::cart_coords<3>(comm, rank); max_comm_size = 0; max_mesh_size = 0; @@ -510,10 +539,12 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, my_pos[0][i] = node_pos[i]; } for (int i = 0; i < comm.size(); i++) { - MPI_Cart_coords(comm, i, 3, &(n_pos[0][3 * i + 0])); - auto const lin_ind = get_linear_index( - n_pos[0][3 * i + 0], n_pos[0][3 * i + 1], n_pos[0][3 * i + 2], - {n_grid[0][0], n_grid[0][1], n_grid[0][2]}); + auto const n_pos_i = Utils::Mpi::cart_coords<3>(comm, i); + for (int j = 0; j < 3; ++j) { + n_pos[0][3 * i + j] = n_pos_i[j]; + } + auto const lin_ind = Utils::get_linear_index( + n_pos_i, {n_grid[0][0], n_grid[0][1], n_grid[0][2]}); n_id[0][lin_ind] = i; } @@ -639,7 +670,7 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, send_buf.resize(max_comm_size); recv_buf.resize(max_comm_size); data_buf.resize(max_mesh_size); - auto *c_data = (fftw_complex *)(data_buf.data()); + auto *c_data = (typename fftw::complex *)(data_buf.data()); /* === FFT Routines (Using FFTW / RFFTW package)=== */ for (int i = 1; i < 4; i++) { @@ -647,10 +678,10 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, forw[i].destroy_plan(); } forw[i].dir = FFTW_FORWARD; - forw[i].plan_handle = - fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, - nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1, - forw[i].new_mesh[2], forw[i].dir, FFTW_PATIENT); + forw[i].plan_handle = fftw::plan_many_dft( + 1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1, + forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2], + forw[i].dir, FFTW_PATIENT); assert(forw[i].plan_handle); } @@ -661,10 +692,10 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, back[i].destroy_plan(); } back[i].dir = FFTW_BACKWARD; - back[i].plan_handle = - fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, - nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1, - forw[i].new_mesh[2], back[i].dir, FFTW_PATIENT); + back[i].plan_handle = fftw::plan_many_dft( + 1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data, nullptr, 1, + forw[i].new_mesh[2], c_data, nullptr, 1, forw[i].new_mesh[2], + back[i].dir, FFTW_PATIENT); back[i].pack_function = pack_block_permute1; assert(back[i].plan_handle); } @@ -679,59 +710,61 @@ int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm, return max_mesh_size; } -void fft_data_struct::forward_fft(boost::mpi::communicator const &comm, - double *data) { +template +void fft_data_struct::forward_fft( + boost::mpi::communicator const &comm, FloatType *data) { /* ===== first direction ===== */ - auto *c_data = (fftw_complex *)data; - auto *c_data_buf = (fftw_complex *)data_buf.data(); + auto *c_data = (typename fftw::complex *)data; + auto *c_data_buf = (typename fftw::complex *)data_buf.data(); /* communication to current dir row format (in is data) */ forw_grid_comm(comm, forw[1], data, data_buf.data()); /* complexify the real data array (in is data_buf) */ for (int i = 0; i < forw[1].new_size; i++) { - data[2 * i + 0] = data_buf[i]; /* real value */ - data[2 * i + 1] = 0; /* complex value */ + data[2 * i + 0] = data_buf[i]; /* real value */ + data[2 * i + 1] = FloatType(0); /* complex value */ } /* perform FFT (in/out is data)*/ - fftw_execute_dft(forw[1].plan_handle, c_data, c_data); + 
fftw::execute_dft(forw[1].plan_handle, c_data, c_data); /* ===== second direction ===== */ /* communication to current dir row format (in is data) */ forw_grid_comm(comm, forw[2], data, data_buf.data()); /* perform FFT (in/out is data_buf) */ - fftw_execute_dft(forw[2].plan_handle, c_data_buf, c_data_buf); + fftw::execute_dft(forw[2].plan_handle, c_data_buf, c_data_buf); /* ===== third direction ===== */ /* communication to current dir row format (in is data_buf) */ forw_grid_comm(comm, forw[3], data_buf.data(), data); /* perform FFT (in/out is data)*/ - fftw_execute_dft(forw[3].plan_handle, c_data, c_data); + fftw::execute_dft(forw[3].plan_handle, c_data, c_data); /* REMARK: Result has to be in data. */ } -void fft_data_struct::backward_fft(boost::mpi::communicator const &comm, - double *data, bool check_complex) { +template +void fft_data_struct::backward_fft( + boost::mpi::communicator const &comm, FloatType *data, bool check_complex) { - auto *c_data = (fftw_complex *)data; - auto *c_data_buf = (fftw_complex *)data_buf.data(); + auto *c_data = (typename fftw::complex *)data; + auto *c_data_buf = (typename fftw::complex *)data_buf.data(); /* ===== third direction ===== */ /* perform FFT (in is data) */ - fftw_execute_dft(back[3].plan_handle, c_data, c_data); + fftw::execute_dft(back[3].plan_handle, c_data, c_data); /* communicate (in is data)*/ back_grid_comm(comm, forw[3], back[3], data, data_buf.data()); /* ===== second direction ===== */ /* perform FFT (in is data_buf) */ - fftw_execute_dft(back[2].plan_handle, c_data_buf, c_data_buf); + fftw::execute_dft(back[2].plan_handle, c_data_buf, c_data_buf); /* communicate (in is data_buf) */ back_grid_comm(comm, forw[2], back[2], data_buf.data(), data); /* ===== first direction ===== */ /* perform FFT (in is data) */ - fftw_execute_dft(back[1].plan_handle, c_data, c_data); + fftw::execute_dft(back[1].plan_handle, c_data, c_data); /* throw away the (hopefully) empty complex component (in is data) */ for (int i = 0; i < forw[1].new_size; i++) { data_buf[i] = data[2 * i]; /* real value */ @@ -749,17 +782,19 @@ void fft_data_struct::backward_fft(boost::mpi::communicator const &comm, /* REMARK: Result has to be in data. 
*/ } -void fft_pack_block(double const *const in, double *const out, +template +void fft_pack_block(FloatType const *const in, FloatType *const out, int const start[3], int const size[3], int const dim[3], int element) { - auto const copy_size = element * size[2] * static_cast(sizeof(double)); + auto const copy_size = + static_cast(element * size[2]) * sizeof(FloatType); /* offsets for indices in input grid */ auto const m_in_offset = element * dim[2]; auto const s_in_offset = element * (dim[2] * (dim[1] - size[1])); /* offsets for indices in output grid */ auto const m_out_offset = element * size[2]; - /* linear index of in grid, linear index of out grid */ + /* linear index of input grid, linear index of output grid */ int li_in = element * (start[2] + dim[2] * (start[1] + dim[1] * start[0])); int li_out = 0; @@ -773,11 +808,13 @@ void fft_pack_block(double const *const in, double *const out, } } -void fft_unpack_block(double const *const in, double *const out, +template +void fft_unpack_block(FloatType const *const in, FloatType *const out, int const start[3], int const size[3], int const dim[3], int element) { - auto const copy_size = element * size[2] * static_cast(sizeof(double)); + auto const copy_size = + static_cast(element * size[2]) * sizeof(FloatType); /* offsets for indices in output grid */ auto const m_out_offset = element * dim[2]; auto const s_out_offset = element * (dim[2] * (dim[1] - size[1])); @@ -797,15 +834,60 @@ void fft_unpack_block(double const *const in, double *const out, } } -void fft_plan::destroy_plan() { +template void fft_plan::destroy_plan() { if (plan_handle) { - fftw_destroy_plan(plan_handle); + fftw::destroy_plan(plan_handle); plan_handle = nullptr; } } -namespace detail { -void fft_free(void *const p) { ::fftw_free(p); } -void *fft_malloc(std::size_t length) { return ::fftw_malloc(length); } -} // namespace detail +template +FloatType *allocator::allocate(const std::size_t n) const { + if (n == 0) { + return nullptr; + } + if (n > std::numeric_limits::max() / sizeof(FloatType)) { + throw std::bad_array_new_length(); + } + void *const pv = fftw::malloc(n * sizeof(FloatType)); + if (!pv) { + throw std::bad_alloc(); + } + return static_cast(pv); +} + +template +void allocator::deallocate(FloatType *const p, + std::size_t) const noexcept { + fftw::free(static_cast(p)); +} + +template struct allocator; +template struct allocator; + +template struct fft_plan; +template struct fft_plan; + +template struct fft_forw_plan; +template struct fft_forw_plan; + +template struct fft_back_plan; +template struct fft_back_plan; + +template struct fft_data_struct; +template struct fft_data_struct; + +template void fft_pack_block(float const *in, float *out, + int const start[3], int const size[3], + int const dim[3], int element); +template void fft_pack_block(double const *in, double *out, + int const start[3], int const size[3], + int const dim[3], int element); +template void fft_unpack_block(float const *in, float *out, + int const start[3], int const size[3], + int const dim[3], int element); +template void fft_unpack_block(double const *in, double *out, + int const start[3], int const size[3], + int const dim[3], int element); + } // namespace fft diff --git a/src/core/fft/fft.hpp b/src/core/fft/fft.hpp index 8747448a371..3ccc7d6dcdf 100644 --- a/src/core/fft/fft.hpp +++ b/src/core/fft/fft.hpp @@ -45,10 +45,12 @@ #include #include #include +#include #include #include struct fftw_plan_s; +struct fftwf_plan_s; namespace boost::mpi { class environment; class 
communicator; @@ -56,8 +58,9 @@ class communicator; namespace fft { -struct fft_plan { - using fftw_plan = fftw_plan_s *; +template struct fft_plan { + using fftw_plan = std::conditional_t, + fftw_plan_s *, fftwf_plan_s *>; ~fft_plan() { destroy_plan(); } @@ -66,13 +69,14 @@ struct fft_plan { /** plan for the FFT. */ fftw_plan plan_handle = nullptr; /** packing function for send blocks. */ - void (*pack_function)(double const *const, double *const, int const *, + void (*pack_function)(FloatType const *const, FloatType *const, int const *, int const *, int const *, int); void destroy_plan(); }; /** @brief Plan for a forward 1D FFT of a flattened 3D array. */ -struct fft_forw_plan : public fft_plan { +template +struct fft_forw_plan : public fft_plan { /** row direction of that FFT. */ int row_dir; /** permutations from normal coordinate system. */ @@ -100,12 +104,13 @@ struct fft_forw_plan : public fft_plan { std::vector recv_block; /** Recv block communication sizes. */ std::vector recv_size; - /** size of send block elements. */ + /** size of send block elements, i.e. 1 for real, 2 for complex. */ int element; }; /** @brief Plan for a backward 1D FFT of a flattened 3D array. */ -struct fft_back_plan : public fft_plan {}; +template +struct fft_back_plan : public fft_plan {}; /** * @brief Information about the three one dimensional FFTs and how the nodes @@ -114,7 +119,7 @@ struct fft_back_plan : public fft_plan {}; * @note FFT numbering starts with 1 for technical reasons (because we have 4 * node grids, the index 0 is used for the real space charge assignment grid). */ -struct fft_data_struct { +template struct fft_data_struct { private: /** * @brief Handle to the MPI environment. @@ -125,9 +130,9 @@ struct fft_data_struct { std::shared_ptr m_mpi_env; /** Information for forward FFTs. */ - std::array forw; + std::array, 4u> forw; /** Information for backward FFTs. */ - std::array back; + std::array, 4u> back; /** Whether FFT is initialized or not. */ bool init_tag = false; @@ -139,11 +144,11 @@ struct fft_data_struct { int max_mesh_size = 0; /** send buffer. */ - std::vector send_buf; + std::vector send_buf; /** receive buffer. */ - std::vector recv_buf; + std::vector recv_buf; /** Buffer for receive data. */ - fft::vector data_buf; + fft::vector data_buf; public: explicit fft_data_struct(decltype(m_mpi_env) mpi_env) @@ -151,8 +156,8 @@ struct fft_data_struct { // disable copy construction: unsafe because we store raw pointers // to FFT plans (avoids double-free and use-after-free) - fft_data_struct &operator=(fft_data_struct const &) = delete; - fft_data_struct(fft_data_struct const &) = delete; + fft_data_struct &operator=(fft_data_struct const &) = delete; + fft_data_struct(fft_data_struct const &) = delete; /** Initialize everything connected to the 3D-FFT. * @@ -177,7 +182,7 @@ struct fft_data_struct { * \param[in,out] data Mesh. * \param[in] comm MPI communicator */ - void forward_fft(boost::mpi::communicator const &comm, double *data); + void forward_fft(boost::mpi::communicator const &comm, FloatType *data); /** Perform an in-place backward 3D FFT. * \warning The content of \a data is overwritten. @@ -186,7 +191,7 @@ struct fft_data_struct { * non-zero. * \param[in] comm MPI communicator. 
*/ - void backward_fft(boost::mpi::communicator const &comm, double *data, + void backward_fft(boost::mpi::communicator const &comm, FloatType *data, bool check_complex); auto get_mesh_size() const { return forw[3u].new_mesh; } @@ -195,10 +200,12 @@ struct fft_data_struct { private: void forw_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan, double const *in, double *out); + fft_forw_plan const &plan, FloatType const *in, + FloatType *out); void back_grid_comm(boost::mpi::communicator const &comm, - fft_forw_plan const &plan_f, fft_back_plan const &plan_b, - double const *in, double *out); + fft_forw_plan const &plan_f, + fft_back_plan const &plan_b, + FloatType const *in, FloatType *out); }; /** Pack a block (size[3] starting at start[3]) of an input @@ -218,7 +225,8 @@ struct fft_data_struct { * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void fft_pack_block(double const *in, double *out, int const start[3], +template +void fft_pack_block(FloatType const *in, FloatType *out, int const start[3], int const size[3], int const dim[3], int element); /** Unpack a 3d-grid input block (size[3]) into an output 3d-grid @@ -233,7 +241,8 @@ void fft_pack_block(double const *in, double *out, int const start[3], * \param[in] dim size of the in-grid. * \param[in] element size of a grid element (e.g. 1 for Real, 2 for Complex). */ -void fft_unpack_block(double const *in, double *out, int const start[3], +template +void fft_unpack_block(FloatType const *in, FloatType *out, int const start[3], int const size[3], int const dim[3], int element); int map_3don2d_grid(int const g3d[3], int g2d[3]); diff --git a/src/core/fft/vector.hpp b/src/core/fft/vector.hpp index e74df7a3de0..10cc268b302 100644 --- a/src/core/fft/vector.hpp +++ b/src/core/fft/vector.hpp @@ -24,43 +24,29 @@ #include #include #include +#include #include namespace fft { -namespace detail { -void fft_free(void *p); -void *fft_malloc(std::size_t length); -} // namespace detail /** @brief Aligned allocator for FFT data. 
*/ -template struct allocator { - typedef T value_type; +template struct allocator { + static_assert(std::is_same_v or + std::is_same_v, + "FFTW only implements float and double"); + typedef FloatType value_type; allocator() noexcept = default; // default ctor not required - template explicit allocator(const allocator &) {} - template bool operator==(const allocator &) const { + template explicit allocator(allocator const &) {} + template bool operator==(allocator const &) const { return true; } - template bool operator!=(const allocator &) const { + template bool operator!=(allocator const &) const { return false; } - T *allocate(const std::size_t n) const { - if (n == 0) { - return nullptr; - } - if (n > std::numeric_limits::max() / sizeof(T)) { - throw std::bad_array_new_length(); - } - void *const pv = detail::fft_malloc(n * sizeof(T)); - if (!pv) { - throw std::bad_alloc(); - } - return static_cast(pv); - } + FloatType *allocate(std::size_t n) const; - void deallocate(T *const p, std::size_t) const noexcept { - detail::fft_free(static_cast(p)); - } + void deallocate(FloatType *p, std::size_t) const noexcept; }; template using vector = std::vector>; diff --git a/src/core/forces.cpp b/src/core/forces.cpp index d669b87deaf..b9881e166c0 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -35,7 +35,6 @@ #include "communication.hpp" #include "constraints/Constraints.hpp" #include "electrostatics/icc.hpp" -#include "electrostatics/p3m_gpu.hpp" #include "forces_inline.hpp" #include "galilei/ComFixed.hpp" #include "immersed_boundary/ImmersedBoundaries.hpp" diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp index b3ab8871fab..68e409b316c 100644 --- a/src/core/magnetostatics/dp3m.cpp +++ b/src/core/magnetostatics/dp3m.cpp @@ -30,6 +30,7 @@ #ifdef DP3M #include "magnetostatics/dp3m.hpp" +#include "magnetostatics/dp3m.impl.hpp" #include "fft/fft.hpp" #include "p3m/TuningAlgorithm.hpp" @@ -38,7 +39,6 @@ #include "p3m/influence_function_dipolar.hpp" #include "p3m/interpolation.hpp" #include "p3m/math.hpp" -#include "p3m/send_mesh.hpp" #include "BoxGeometry.hpp" #include "LocalBox.hpp" @@ -76,7 +76,8 @@ #include #include -void DipolarP3M::count_magnetic_particles() { +template +void DipolarP3MImpl::count_magnetic_particles() { int local_n = 0; double local_mu2 = 0.; @@ -105,7 +106,10 @@ double dp3m_rtbisection(double box_size, double r_cut_iL, int n_c_part, double sum_q2, double x1, double x2, double xacc, double tuned_accuracy); -double DipolarP3M::calc_average_self_energy_k_space() const { +template +double +DipolarP3MImpl::calc_average_self_energy_k_space() + const { auto const &box_geo = *get_system().box_geo; auto const node_phi = grid_influence_function_self_energy( dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy); @@ -116,7 +120,8 @@ double DipolarP3M::calc_average_self_energy_k_space() const { return phi * std::numbers::pi; } -void DipolarP3M::init() { +template +void DipolarP3MImpl::init_cpu_kernels() { assert(dp3m.params.mesh >= Utils::Vector3i::broadcast(1)); assert(dp3m.params.cao >= 1 and dp3m.params.cao <= 7); assert(dp3m.params.alpha > 0.); @@ -131,6 +136,7 @@ void DipolarP3M::init() { assert(dp3m.fft); dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.); + dp3m.fft->init_halo(); dp3m.fft->init_fft(); dp3m.calc_differential_operator(); @@ -140,44 +146,29 @@ void DipolarP3M::init() { count_magnetic_particles(); } -DipolarP3M::DipolarP3M(P3MParameters &¶meters, double prefactor, - int tune_timings, bool 
tune_verbose) - : dp3m{std::move(parameters)}, tune_timings{tune_timings}, - tune_verbose{tune_verbose} { - - set_prefactor(prefactor); - m_is_tuned = !dp3m.params.tuning; - dp3m.params.tuning = false; - - if (tune_timings <= 0) { - throw std::domain_error("Parameter 'timings' must be > 0"); - } - - if (dp3m.params.mesh != Utils::Vector3i::broadcast(dp3m.params.mesh[0])) { - throw std::domain_error("DipolarP3M requires a cubic mesh"); - } -} - namespace { template struct AssignDipole { - void operator()(decltype(DipolarP3M::dp3m) &dp3m, - Utils::Vector3d const &real_pos, + void operator()(auto &dp3m, Utils::Vector3d const &real_pos, Utils::Vector3d const &dip) const { + using value_type = + typename std::remove_reference_t::value_type; auto const weights = p3m_calculate_interpolation_weights( real_pos, dp3m.params.ai, dp3m.local_mesh); - p3m_interpolate(dp3m.local_mesh, weights, - [&dip, &dp3m](int ind, double w) { - dp3m.mesh.rs_fields[0][ind] += w * dip[0]; - dp3m.mesh.rs_fields[1][ind] += w * dip[1]; - dp3m.mesh.rs_fields[2][ind] += w * dip[2]; - }); - - dp3m.inter_weights.store(weights); + p3m_interpolate( + dp3m.local_mesh, weights, [&dip, &dp3m](int ind, double w) { + dp3m.mesh.rs_fields[0u][ind] += value_type(w * dip[0u]); + dp3m.mesh.rs_fields[1u][ind] += value_type(w * dip[1u]); + dp3m.mesh.rs_fields[2u][ind] += value_type(w * dip[2u]); + }); + + dp3m.inter_weights.template store(weights); } }; } // namespace -void DipolarP3M::dipole_assign(ParticleRange const &particles) { +template +void DipolarP3MImpl::dipole_assign( + ParticleRange const &particles) { dp3m.inter_weights.reset(dp3m.params.cao); /* prepare local FFT mesh */ @@ -195,7 +186,7 @@ void DipolarP3M::dipole_assign(ParticleRange const &particles) { namespace { template struct AssignTorques { - void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs, + void operator()(auto &dp3m, double prefac, int d_rs, ParticleRange const &particles) const { /* magnetic particle index */ @@ -203,12 +194,12 @@ template struct AssignTorques { for (auto &p : particles) { if (p.dipm() != 0.) { - auto const w = dp3m.inter_weights.load(p_index); + auto const weights = dp3m.inter_weights.template load(p_index); Utils::Vector3d E{}; - p3m_interpolate(dp3m.local_mesh, w, + p3m_interpolate(dp3m.local_mesh, weights, [&E, &dp3m, d_rs](int ind, double w) { - E[d_rs] += w * dp3m.mesh.rs_scalar[ind]; + E[d_rs] += w * double(dp3m.mesh.rs_scalar[ind]); }); p.torque() -= vector_product(p.calc_dip(), prefac * E); @@ -219,7 +210,7 @@ template struct AssignTorques { }; template struct AssignForces { - void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs, + void operator()(auto &dp3m, double prefac, int d_rs, ParticleRange const &particles) const { /* magnetic particle index */ @@ -227,14 +218,15 @@ template struct AssignForces { for (auto &p : particles) { if (p.dipm() != 0.) 
{ - auto const w = dp3m.inter_weights.load(p_index); + auto const weights = dp3m.inter_weights.template load(p_index); Utils::Vector3d E{}; - p3m_interpolate(dp3m.local_mesh, w, [&E, &dp3m](int ind, double w) { - E[0] += w * dp3m.mesh.rs_fields[0][ind]; - E[1] += w * dp3m.mesh.rs_fields[1][ind]; - E[2] += w * dp3m.mesh.rs_fields[2][ind]; - }); + p3m_interpolate(dp3m.local_mesh, weights, + [&E, &dp3m](int ind, double w) { + E[0u] += w * double(dp3m.mesh.rs_fields[0u][ind]); + E[1u] += w * double(dp3m.mesh.rs_fields[1u][ind]); + E[2u] += w * double(dp3m.mesh.rs_fields[2u][ind]); + }); p.force()[d_rs] += p.calc_dip() * prefac * E; ++p_index; @@ -244,8 +236,9 @@ template struct AssignForces { }; } // namespace -double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles) { +template +double DipolarP3MImpl::long_range_kernel( + bool force_flag, bool energy_flag, ParticleRange const &particles) { /* k-space energy */ double energy = 0.; auto const &system = get_system(); @@ -260,7 +253,8 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, if (dp3m.sum_mu2 > 0.) { dipole_assign(particles); - dp3m.fft->perform_fwd_fft(); + dp3m.fft->perform_vector_halo_gather(); + dp3m.fft->perform_vector_fwd_fft(); } /* === k-space energy calculation === */ @@ -284,14 +278,14 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for_each_3d(mesh_start, dp3m.mesh.size, indices, [&]() { auto const shift = indices + offset; // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im)); std::advance(it_energy, 1); @@ -336,15 +330,15 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for_each_3d(mesh_start, mesh_stop, indices, [&]() { auto const shift = indices + offset; // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); dp3m.mesh.ks_scalar[index] = *it_energy * re; ++index; // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); dp3m.mesh.ks_scalar[index] = *it_energy * im; ++index; std::advance(it_energy, 1); @@ -354,13 +348,14 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for (int d = 0; d < 3; d++) { index = 0u; for_each_3d(mesh_start, mesh_stop, indices, [&]() { - auto const pos = indices[d] + offset[d]; - dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index]; + 
auto const d_op_val = FloatType(d_op[indices[d] + offset[d]]); + dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.mesh.ks_scalar[index]; ++index; - dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index]; + dp3m.mesh.rs_scalar[index] = d_op_val * dp3m.mesh.ks_scalar[index]; ++index; }); - dp3m.fft->perform_space_back_fft(); + dp3m.fft->perform_scalar_back_fft(); + dp3m.fft->perform_scalar_halo_spread(); /* Assign force component from mesh to particle */ auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3; Utils::integral_parameter( @@ -380,14 +375,14 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for_each_3d(mesh_start, mesh_stop, indices, [&]() { auto const shift = indices + offset; // Re(mu)*k - auto const re = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; // Im(mu)*k - auto const im = mesh_dip[0u][index] * d_op[shift[KX]] + - mesh_dip[1u][index] * d_op[shift[KY]] + - mesh_dip[2u][index] * d_op[shift[KZ]]; + auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) + + mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) + + mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]); ++index; dp3m.mesh.ks_scalar[index - 2] = *it_force * im; dp3m.mesh.ks_scalar[index - 1] = *it_force * (-re); @@ -398,21 +393,21 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag, for (int d = 0; d < 3; d++) { index = 0u; for_each_3d(mesh_start, mesh_stop, indices, [&]() { + auto const d_op_val = FloatType(d_op[indices[d] + offset[d]]); auto const shift = indices + offset; - auto const f1 = - d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index]; - mesh_dip[0u][index] = d_op[shift[KX]] * f1; - mesh_dip[1u][index] = d_op[shift[KY]] * f1; - mesh_dip[2u][index] = d_op[shift[KZ]] * f1; + auto const f1 = d_op_val * dp3m.mesh.ks_scalar[index]; + mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f1; + mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f1; + mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f1; ++index; - auto const f2 = - d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index]; - mesh_dip[0u][index] = d_op[shift[KX]] * f2; - mesh_dip[1u][index] = d_op[shift[KY]] * f2; - mesh_dip[2u][index] = d_op[shift[KZ]] * f2; + auto const f2 = d_op_val * dp3m.mesh.ks_scalar[index]; + mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f2; + mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f2; + mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f2; ++index; }); - dp3m.fft->perform_field_back_fft(); + dp3m.fft->perform_vector_back_fft(); + dp3m.fft->perform_vector_halo_spread(); /* Assign force component from mesh to particle */ auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3; Utils::integral_parameter( @@ -507,20 +502,23 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag, return energy; } -void DipolarP3M::calc_influence_function_force() { - dp3m.g_force = - grid_influence_function<3>(dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, - get_system().box_geo->length_inv()); +template +void DipolarP3MImpl::calc_influence_function_force() { + dp3m.g_force = grid_influence_function( + dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, + get_system().box_geo->length_inv()); } -void DipolarP3M::calc_influence_function_energy() { - dp3m.g_energy = - grid_influence_function<2>(dp3m.params, 
dp3m.mesh.start, dp3m.mesh.stop, - get_system().box_geo->length_inv()); +template +void DipolarP3MImpl::calc_influence_function_energy() { + dp3m.g_energy = grid_influence_function( + dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, + get_system().box_geo->length_inv()); } +template class DipolarTuningAlgorithm : public TuningAlgorithm { - decltype(DipolarP3M::dp3m) &dp3m; + p3m_data_struct_dipoles &dp3m; int m_mesh_max = -1, m_mesh_min = -1; public: @@ -632,7 +630,8 @@ class DipolarTuningAlgorithm : public TuningAlgorithm { } }; -void DipolarP3M::tune() { +template +void DipolarP3MImpl::tune() { auto &system = get_system(); auto const &box_geo = *system.box_geo; if (dp3m.params.alpha_L == 0. and dp3m.params.alpha != 0.) { @@ -648,7 +647,8 @@ void DipolarP3M::tune() { "DipolarP3M: no dipolar particles in the system"); } try { - DipolarTuningAlgorithm parameters(system, dp3m, prefactor, tune_timings); + DipolarTuningAlgorithm parameters( + system, dp3m, prefactor, tune_timings); parameters.setup_logger(tune_verbose); // parameter ranges parameters.determine_mesh_limits(); @@ -871,10 +871,10 @@ void DipolarP3M::scaleby_box_l() { sanity_checks_boxl(); calc_influence_function_force(); calc_influence_function_energy(); - dp3m.energy_correction = 0.0; } -void DipolarP3M::calc_energy_correction() { +template +void DipolarP3MImpl::calc_energy_correction() { auto const &box_geo = *get_system().box_geo; auto const Ukp3m = calc_average_self_energy_k_space() * box_geo.volume(); auto const Ewald_volume = Utils::int_pow<3>(dp3m.params.alpha_L); @@ -889,4 +889,7 @@ void npt_add_virial_magnetic_contribution(double energy) { } #endif // NPT +template struct DipolarP3MImpl; +template struct DipolarP3MImpl; + #endif // DP3M diff --git a/src/core/magnetostatics/dp3m.hpp b/src/core/magnetostatics/dp3m.hpp index 0bdf4f05ec2..43381972e90 100644 --- a/src/core/magnetostatics/dp3m.hpp +++ b/src/core/magnetostatics/dp3m.hpp @@ -39,20 +39,17 @@ #include "p3m/common.hpp" #include "p3m/data_struct.hpp" -#include "p3m/interpolation.hpp" -#include "p3m/send_mesh.hpp" #include "Particle.hpp" #include "ParticleRange.hpp" #include #include +#include -#include #include #include -#include -#include +#include #ifdef NPT /** Update the NpT virial */ @@ -61,38 +58,35 @@ void npt_add_virial_magnetic_contribution(double energy); /** @brief Dipolar P3M solver. */ struct DipolarP3M : public Dipoles::Actor { - struct p3m_data_struct_impl : public p3m_data_struct { - explicit p3m_data_struct_impl(P3MParameters &¶meters) - : p3m_data_struct{std::move(parameters)} {} + /** @brief Dipolar P3M parameters. */ + p3m_data_struct &dp3m; - /** number of dipolar particles (only on head node). */ - int sum_dip_part = 0; - /** Sum of square of magnetic dipoles (only on head node). */ - double sum_mu2 = 0.; - - /** position shift for calculation of first assignment mesh point. */ - double pos_shift = 0.; - - p3m_interpolation_cache inter_weights; - - /** cached k-space self-energy correction */ - double energy_correction = 0.; - }; - - /** Dipolar P3M parameters. */ - p3m_data_struct_impl dp3m; - - /** Magnetostatics prefactor. 
*/ int tune_timings; bool tune_verbose; - DipolarP3M(P3MParameters &¶meters, double prefactor, int tune_timings, - bool tune_verbose); +protected: + bool m_is_tuned; - void on_activation() { - sanity_checks(); - tune(); +public: + DipolarP3M(p3m_data_struct &dp3m_data, double prefactor, int tune_timings, + bool tune_verbose) + : dp3m{dp3m_data}, tune_timings{tune_timings}, + tune_verbose{tune_verbose} { + + if (tune_timings <= 0) { + throw std::domain_error("Parameter 'timings' must be > 0"); + } + if (dp3m.params.mesh != Utils::Vector3i::broadcast(dp3m.params.mesh[0])) { + throw std::domain_error("DipolarP3M requires a cubic mesh"); + } + m_is_tuned = not dp3m.params.tuning; + dp3m.params.tuning = false; + set_prefactor(prefactor); } + + virtual ~DipolarP3M() = default; + + virtual void on_activation() = 0; /** @brief Recalculate all box-length-dependent parameters. */ void on_boxl_change() { scaleby_box_l(); } void on_node_grid_change() const { sanity_checks_node_grid(); } @@ -102,7 +96,7 @@ struct DipolarP3M : public Dipoles::Actor { init(); } /** @brief Recalculate all derived parameters. */ - void init(); + virtual void init() = 0; void sanity_checks() const { sanity_checks_boxl(); @@ -115,10 +109,10 @@ struct DipolarP3M : public Dipoles::Actor { * @brief Count the number of magnetic particles and calculate * the sum of the squared dipole moments. */ - void count_magnetic_particles(); + virtual void count_magnetic_particles() = 0; /** Assign the physical dipoles using the tabulated assignment function. */ - void dipole_assign(ParticleRange const &particles); + virtual void dipole_assign(ParticleRange const &particles) = 0; /** * @brief Tune dipolar P3M parameters to desired accuracy. @@ -152,22 +146,14 @@ struct DipolarP3M : public Dipoles::Actor { * The function is based on routines of the program HE_Q.cpp for charges * written by M. Deserno. */ - void tune(); + virtual void tune() = 0; bool is_tuned() const { return m_is_tuned; } /** Compute the k-space part of energies. */ - double long_range_energy(ParticleRange const &particles) { - return long_range_kernel(false, true, particles); - } + virtual double long_range_energy(ParticleRange const &particles) = 0; /** Compute the k-space part of forces. */ - void add_long_range_forces(ParticleRange const &particles) { - long_range_kernel(true, false, particles); - } - - /** Compute the k-space part of forces and energies. */ - double long_range_kernel(bool force_flag, bool energy_flag, - ParticleRange const &particles); + virtual void add_long_range_forces(ParticleRange const &particles) = 0; /** Calculate real-space contribution of p3m dipolar pair forces and torques. * If NPT is compiled in, update the NpT virial. @@ -266,22 +252,20 @@ struct DipolarP3M : public Dipoles::Actor { return prefactor * (mimj * B_r - mir * mjr * C_r); } -private: - bool m_is_tuned; - +protected: /** Calculate self-energy in k-space. */ - double calc_average_self_energy_k_space() const; + virtual double calc_average_self_energy_k_space() const = 0; /** Calculate energy correction that minimizes the error. * This quantity is only calculated once and is cached. */ - void calc_energy_correction(); + virtual void calc_energy_correction() = 0; /** Calculate the influence function for the dipolar forces and torques. */ - void calc_influence_function_force(); + virtual void calc_influence_function_force() = 0; /** Calculate the influence function for the dipolar energy. 
*/ - void calc_influence_function_energy(); + virtual void calc_influence_function_energy() = 0; /** Compute the dipolar surface terms */ double calc_surface_term(bool force_flag, bool energy_flag, @@ -293,7 +277,7 @@ struct DipolarP3M : public Dipoles::Actor { void sanity_checks_periodicity() const; void sanity_checks_cell_structure() const; - void scaleby_box_l(); + virtual void scaleby_box_l(); }; #endif // DP3M diff --git a/src/core/magnetostatics/dp3m.impl.hpp b/src/core/magnetostatics/dp3m.impl.hpp new file mode 100644 index 00000000000..108b79d21c8 --- /dev/null +++ b/src/core/magnetostatics/dp3m.impl.hpp @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2010-2024 The ESPResSo project + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 + * Max-Planck-Institute for Polymer Research, Theory Group + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** @file + * P3M algorithm for long-range magnetic dipole-dipole interaction. + * + * We use here a P3M (Particle-Particle Particle-Mesh) method based + * on the dipolar Ewald summation. Details of the used method can be + * found in @cite hockney88a and @cite deserno98a @cite deserno98b. + * + * Further reading: @cite cerda08d + */ + +#pragma once + +#include "config/config.hpp" + +#ifdef DP3M + +#include "magnetostatics/dp3m.hpp" + +#include "p3m/common.hpp" +#include "p3m/data_struct.hpp" +#include "p3m/interpolation.hpp" + +#include "Particle.hpp" +#include "ParticleRange.hpp" + +#include +#include + +#include +#include + +template +struct p3m_data_struct_dipoles : public p3m_data_struct_fft { + using p3m_data_struct_fft::p3m_data_struct_fft; + + /** number of dipolar particles (only on head node). */ + int sum_dip_part = 0; + /** Sum of square of magnetic dipoles (only on head node). */ + double sum_mu2 = 0.; + + /** position shift for calculation of first assignment mesh point. */ + double pos_shift = 0.; + + /** cached k-space self-energy correction */ + double energy_correction = 0.; + + p3m_interpolation_cache inter_weights; +}; + +template +struct DipolarP3MImpl : public DipolarP3M { + ~DipolarP3MImpl() override = default; + + /** @brief Coulomb P3M meshes and FFT algorithm. 
*/ + std::unique_ptr> dp3m_impl; + // this member overshadows its own base class pointer in the parent class + p3m_data_struct_dipoles &dp3m; + + template + DipolarP3MImpl( + std::unique_ptr> &&dp3m_handle, + Args &&...args) + : DipolarP3M(*dp3m_handle, std::forward(args)...), + dp3m_impl{std::move(dp3m_handle)}, dp3m{*dp3m_impl} {} + + void init() override { + if constexpr (Architecture == Arch::CPU) { + init_cpu_kernels(); + } + } + void tune() override; + void count_magnetic_particles() override; + + void on_activation() override { + sanity_checks(); + tune(); + } + + double long_range_energy(ParticleRange const &particles) override { + return long_range_kernel(false, true, particles); + } + + void add_long_range_forces(ParticleRange const &particles) override { + if constexpr (Architecture == Arch::CPU) { + long_range_kernel(true, false, particles); + } + } + + void dipole_assign(ParticleRange const &particles) override; + +protected: + /** Compute the k-space part of forces and energies. */ + double long_range_kernel(bool force_flag, bool energy_flag, + ParticleRange const &particles); + double calc_average_self_energy_k_space() const override; + void calc_energy_correction() override; + void calc_influence_function_force() override; + void calc_influence_function_energy() override; + void init_cpu_kernels(); + void scaleby_box_l() override { + DipolarP3M::scaleby_box_l(); + dp3m.energy_correction = 0.; + } +}; + +template class FFTBackendImpl, class... Args> +std::shared_ptr new_dp3m_handle(P3MParameters &&p3m, + Args &&...args) { + auto obj = std::make_shared>( + std::make_unique>(std::move(p3m)), + std::forward(args)...); + obj->dp3m.template make_fft_instance>(true); + return obj; +} + +#endif // DP3M diff --git a/src/core/p3m/FFTBackendLegacy.cpp b/src/core/p3m/FFTBackendLegacy.cpp index 146c241de19..c00f3440c84 100644 --- a/src/core/p3m/FFTBackendLegacy.cpp +++ b/src/core/p3m/FFTBackendLegacy.cpp @@ -36,14 +36,18 @@ #include #include -FFTBackendLegacy::FFTBackendLegacy(p3m_data_struct &obj, bool dipolar) - : FFTBackend(obj), dipolar{dipolar}, - fft{std::make_unique( +template +FFTBackendLegacy::FFTBackendLegacy( + p3m_data_struct_fft &obj, bool dipolar) + : FFTBackend(obj), dipolar{dipolar}, + fft{std::make_unique>( ::Communication::mpiCallbacksHandle()->share_mpi_env())} {} -FFTBackendLegacy::~FFTBackendLegacy() = default; +template +FFTBackendLegacy::~FFTBackendLegacy() = default; -void FFTBackendLegacy::update_mesh_data() { +template +void FFTBackendLegacy::update_mesh_data() { auto const mesh_size_ptr = fft->get_mesh_size(); auto const mesh_start_ptr = fft->get_mesh_start(); for (auto i = 0u; i < 3u; ++i) { @@ -58,8 +62,11 @@ void FFTBackendLegacy::update_mesh_data() { } } -void FFTBackendLegacy::init_fft() { +template void FFTBackendLegacy::init_halo() { mesh_comm.resize(::comm_cart, local_mesh); +} + +template void FFTBackendLegacy::init_fft() { auto ca_mesh_size = fft->initialize_fft( ::comm_cart, local_mesh.dim, local_mesh.margin, params.mesh, params.mesh_off, mesh.ks_pnum, ::communicator.node_grid); @@ -73,40 +80,60 @@ void FFTBackendLegacy::init_fft() { update_mesh_data(); } -void FFTBackendLegacy::perform_field_back_fft() { - /* Back FFT force component mesh */ +template +void FFTBackendLegacy::perform_vector_back_fft() { for (auto &rs_mesh_field : rs_mesh_fields) { fft->backward_fft(::comm_cart, rs_mesh_field.data(), check_complex_residuals); } - /* redistribute force component mesh */ - std::array meshes = {{rs_mesh_fields[0u].data(), - rs_mesh_fields[1u].data(), - 
rs_mesh_fields[2u].data()}}; +} + +template +void FFTBackendLegacy::perform_vector_halo_spread() { + std::array meshes = {{rs_mesh_fields[0u].data(), + rs_mesh_fields[1u].data(), + rs_mesh_fields[2u].data()}}; mesh_comm.spread_grid(::comm_cart, meshes, local_mesh.dim); } -void FFTBackendLegacy::perform_fwd_fft() { - if (dipolar) { - std::array meshes = {{rs_mesh_fields[0u].data(), - rs_mesh_fields[1u].data(), - rs_mesh_fields[2u].data()}}; - mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim); - for (auto &rs_mesh_field : rs_mesh_fields) { - fft->forward_fft(::comm_cart, rs_mesh_field.data()); - } - } else { - mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim); - fft->forward_fft(::comm_cart, rs_mesh.data()); +template +void FFTBackendLegacy::perform_scalar_halo_gather() { + mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim); +} + +template +void FFTBackendLegacy::perform_scalar_fwd_fft() { + fft->forward_fft(::comm_cart, rs_mesh.data()); + update_mesh_data(); +} + +template +void FFTBackendLegacy::perform_vector_halo_gather() { + std::array meshes = {{rs_mesh_fields[0u].data(), + rs_mesh_fields[1u].data(), + rs_mesh_fields[2u].data()}}; + mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim); +} + +template +void FFTBackendLegacy::perform_vector_fwd_fft() { + for (auto &rs_mesh_field : rs_mesh_fields) { + fft->forward_fft(::comm_cart, rs_mesh_field.data()); } update_mesh_data(); } -void FFTBackendLegacy::perform_space_back_fft() { - /* Back FFT force component mesh */ +template +void FFTBackendLegacy::perform_scalar_back_fft() { fft->backward_fft(::comm_cart, rs_mesh.data(), check_complex_residuals); - /* redistribute force component mesh */ +} + +template +void FFTBackendLegacy::perform_scalar_halo_spread() { mesh_comm.spread_grid(::comm_cart, rs_mesh.data(), local_mesh.dim); } +template class FFTBackendLegacy; +template class FFTBackendLegacy; + #endif // defined(P3M) or defined(DP3M) diff --git a/src/core/p3m/FFTBackendLegacy.hpp b/src/core/p3m/FFTBackendLegacy.hpp index 2ae1a379aef..bf2dd92f7b2 100644 --- a/src/core/p3m/FFTBackendLegacy.hpp +++ b/src/core/p3m/FFTBackendLegacy.hpp @@ -34,33 +34,48 @@ #include #include #include +#include namespace fft { -struct fft_data_struct; +template struct fft_data_struct; } // namespace fft /** * @brief Historic FFT backend based on FFTW3. * The 3D FFT is split into three 1D FFTs. */ -class FFTBackendLegacy : public FFTBackend { +template +class FFTBackendLegacy : public FFTBackend { + static_assert(std::is_same_v or + std::is_same_v, + "FFTW only implements float and double"); bool dipolar; - std::unique_ptr fft; + std::unique_ptr> fft; /** @brief k-space mesh (local) for k-space calculations. */ - std::vector ks_mesh; + std::vector ks_mesh; /** @brief real-space mesh (local) for CA/FFT. */ - fft::vector rs_mesh; + fft::vector rs_mesh; /** @brief real-space mesh (local) for the electric or dipolar field. 
*/ - std::array, 3> rs_mesh_fields; - p3m_send_mesh mesh_comm; + std::array, 3> rs_mesh_fields; + p3m_send_mesh mesh_comm; + using FFTBackend::params; + using FFTBackend::mesh; + using FFTBackend::local_mesh; + using FFTBackend::check_complex_residuals; public: - explicit FFTBackendLegacy(p3m_data_struct &obj, bool dipolar); + FFTBackendLegacy(p3m_data_struct_fft &obj, bool dipolar); ~FFTBackendLegacy() override; void init_fft() override; - void perform_fwd_fft() override; - void perform_field_back_fft() override; - void perform_space_back_fft() override; + void init_halo() override; + void perform_scalar_fwd_fft() override; + void perform_vector_fwd_fft() override; + void perform_scalar_back_fft() override; + void perform_vector_back_fft() override; + void perform_scalar_halo_gather() override; + void perform_vector_halo_gather() override; + void perform_scalar_halo_spread() override; + void perform_vector_halo_spread() override; void update_mesh_data(); /** diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp index af9a9af5959..4a4b88b22f3 100644 --- a/src/core/p3m/common.hpp +++ b/src/core/p3m/common.hpp @@ -53,6 +53,8 @@ auto constexpr P3M_EPSILON_METALLIC = 0.0; #include #include +enum class Arch { CPU, GPU }; + /** @brief Structure to hold P3M parameters and some dependent variables. */ struct P3MParameters { /** tuning or production? */ @@ -216,13 +218,13 @@ struct P3MLocalMesh { }; /** @brief Local mesh FFT buffers. */ -struct P3MFFTMesh { +template struct P3MFFTMesh { /** @brief k-space scalar mesh for k-space calculations. */ - std::span ks_scalar; + std::span ks_scalar; /** @brief real-space scalar mesh for charge assignment and FFT. */ - std::span rs_scalar; + std::span rs_scalar; /** @brief real-space 3D meshes for the electric or dipolar field. */ - std::array, 3> rs_fields; + std::array, 3> rs_fields; /** @brief Indices of the lower left corner of the local mesh grid. */ Utils::Vector3i start; diff --git a/src/core/p3m/data_struct.hpp b/src/core/p3m/data_struct.hpp index 9b607031991..7669a5b359c 100644 --- a/src/core/p3m/data_struct.hpp +++ b/src/core/p3m/data_struct.hpp @@ -34,7 +34,7 @@ #include #include -class FFTBackend; +template class FFTBackend; /** * @brief Base class for the electrostatics and magnetostatics P3M algorithms. @@ -49,21 +49,12 @@ struct p3m_data_struct { P3MParameters params; /** @brief Local mesh properties. */ P3MLocalMesh local_mesh; - /** @brief Local mesh FFT buffers. */ - P3MFFTMesh mesh; /** * @brief Spatial differential operator in k-space. * We use an i*k differentiation. */ std::array, 3> d_op; - /** Force optimised influence function (k-space) */ - std::vector g_force; - /** Energy optimised influence function (k-space) */ - std::vector g_energy; - - /** FFT backend. */ - std::unique_ptr fft; /** Calculate the Fourier transformed differential operator. * Remark: This is done on the level of n-vectors and not k-vectors, @@ -72,6 +63,21 @@ struct p3m_data_struct { void calc_differential_operator() { d_op = detail::calc_meshift(params.mesh, true); } +}; + +template +struct p3m_data_struct_fft : public p3m_data_struct { + using p3m_data_struct::p3m_data_struct; + using value_type = FloatType; + /** @brief Local mesh FFT buffers. */ + P3MFFTMesh mesh; + + /** Force optimised influence function (k-space) */ + std::vector g_force; + /** Energy optimised influence function (k-space) */ + std::vector g_energy; + /** FFT backend. */ + std::unique_ptr> fft; template void make_fft_instance(Args... 
args) { assert(fft == nullptr); @@ -85,25 +91,37 @@ struct p3m_data_struct { * The backend can read some members of @ref p3m_data_struct * but can only modify the FFT buffers in @ref P3MFFTMesh. */ -class FFTBackend { +template class FFTBackend { protected: P3MParameters const ¶ms; P3MLocalMesh const &local_mesh; - P3MFFTMesh &mesh; + P3MFFTMesh &mesh; public: bool check_complex_residuals = false; - explicit FFTBackend(p3m_data_struct &obj) + explicit FFTBackend(p3m_data_struct_fft &obj) : params{obj.params}, local_mesh{obj.local_mesh}, mesh{obj.mesh} {} virtual ~FFTBackend() = default; /** @brief Initialize the FFT plans and buffers. */ virtual void init_fft() = 0; - /** @brief Carry out the forward FFT of the scalar or field meshes. */ - virtual void perform_fwd_fft() = 0; + /** @brief Initialize the halo buffers. */ + virtual void init_halo() = 0; + /** @brief Carry out the forward FFT of the scalar mesh. */ + virtual void perform_scalar_fwd_fft() = 0; + /** @brief Carry out the forward FFT of the vector meshes. */ + virtual void perform_vector_fwd_fft() = 0; /** @brief Carry out the backward FFT of the scalar mesh. */ - virtual void perform_field_back_fft() = 0; - /** @brief Carry out the backward FFT of the field meshes. */ - virtual void perform_space_back_fft() = 0; + virtual void perform_scalar_back_fft() = 0; + /** @brief Carry out the backward FFT of the vector meshes. */ + virtual void perform_vector_back_fft() = 0; + /** @brief Update scalar mesh halo with data from neighbors (accumulation). */ + virtual void perform_scalar_halo_gather() = 0; + /** @brief Update vector mesh halo with data from neighbors (accumulation). */ + virtual void perform_vector_halo_gather() = 0; + /** @brief Update scalar mesh halo of all neighbors. */ + virtual void perform_scalar_halo_spread() = 0; + /** @brief Update vector mesh halo of all neighbors. */ + virtual void perform_vector_halo_spread() = 0; /** @brief Get indices of the k-space data layout. */ virtual std::tuple get_permutations() const = 0; }; diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp index eb259d89d92..c3373a88d15 100644 --- a/src/core/p3m/influence_function.hpp +++ b/src/core/p3m/influence_function.hpp @@ -110,19 +110,17 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k, * @param inv_box_l Inverse box length. * @return Values of G_opt at regular grid points. */ -template -std::vector grid_influence_function(P3MParameters const ¶ms, - Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, - int const KX, int const KY, - int const KZ, - Utils::Vector3d const &inv_box_l) { +template +std::vector grid_influence_function( + P3MParameters const ¶ms, Utils::Vector3i const &n_start, + Utils::Vector3i const &n_stop, int const KX, int const KY, int const KZ, + Utils::Vector3d const &inv_box_l) { auto const shifts = detail::calc_meshift(params.mesh); auto const size = n_stop - n_start; /* The influence function grid */ - auto g = std::vector(Utils::product(size), 0.); + auto g = std::vector(Utils::product(size), FloatType(0)); /* Skip influence function calculation in tuning mode, the results need not be correct for timing. 
*/ @@ -143,7 +141,7 @@ std::vector grid_influence_function(P3MParameters const ¶ms, Utils::Vector3d{{shifts[0u][indices[KX]] * wavevector[0u], shifts[1u][indices[KY]] * wavevector[1u], shifts[2u][indices[KZ]] * wavevector[2u]}}; - g[index] = G_opt(params.cao, params.alpha, k, params.a); + g[index] = FloatType(G_opt(params.cao, params.alpha, k, params.a)); } ++index; }); diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp index d52a1a6d675..74af200a208 100644 --- a/src/core/p3m/influence_function_dipolar.hpp +++ b/src/core/p3m/influence_function_dipolar.hpp @@ -105,16 +105,15 @@ double G_opt_dipolar(P3MParameters const ¶ms, Utils::Vector3i const &shift, * @param inv_box_l Inverse box length * @return Values of the influence function at regular grid points. */ -template -std::vector grid_influence_function(P3MParameters const ¶ms, - Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, - Utils::Vector3d const &inv_box_l) { +template +std::vector grid_influence_function( + P3MParameters const ¶ms, Utils::Vector3i const &n_start, + Utils::Vector3i const &n_stop, Utils::Vector3d const &inv_box_l) { auto const size = n_stop - n_start; /* The influence function grid */ - auto g = std::vector(Utils::product(size), 0.); + auto g = std::vector(Utils::product(size), FloatType(0)); /* Skip influence function calculation in tuning mode, the results need not be correct for timing. */ @@ -138,7 +137,8 @@ std::vector grid_influence_function(P3MParameters const ¶ms, [&]() { if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or (indices[2] % half_mesh != 0))) { - g[index] = prefactor * G_opt_dipolar(params, shift_off, d_op_off); + g[index] = FloatType(prefactor * + G_opt_dipolar(params, shift_off, d_op_off)); } ++index; }, @@ -183,9 +183,10 @@ inline double G_opt_dipolar_self_energy(P3MParameters const ¶ms, * @param g Energies on the grid. * @return Total self-energy. */ +template inline double grid_influence_function_self_energy( P3MParameters const ¶ms, Utils::Vector3i const &n_start, - Utils::Vector3i const &n_stop, std::vector const &g) { + Utils::Vector3i const &n_stop, std::vector const &g) { auto const offset = detail::calc_meshift(params.mesh, false)[0]; auto const d_op = detail::calc_meshift(params.mesh, true)[0]; @@ -202,7 +203,7 @@ inline double grid_influence_function_self_energy( if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or (indices[2] % half_mesh != 0))) { auto const U2 = G_opt_dipolar_self_energy(params, shift_off); - energy += g[index] * U2 * d_op_off.norm2(); + energy += double(g[index]) * U2 * d_op_off.norm2(); } ++index; }, diff --git a/src/core/p3m/send_mesh.cpp b/src/core/p3m/send_mesh.cpp index 9bb377a7688..47079361960 100644 --- a/src/core/p3m/send_mesh.cpp +++ b/src/core/p3m/send_mesh.cpp @@ -31,13 +31,25 @@ #include #include +#include #include +#include #include #include #include +template +static void mesh_sendrecv(T const *const sendbuf, int scount, int dest, + T *const recvbuf, int rcount, int source, + boost::mpi::communicator const &comm, int tag) { + auto const type = boost::mpi::get_mpi_datatype(*sendbuf); + MPI_Sendrecv(reinterpret_cast(sendbuf), scount, type, dest, tag, + reinterpret_cast(recvbuf), rcount, type, source, tag, + comm, MPI_STATUS_IGNORE); +} + /** Add values of a 3d-grid input block (size[3]) to values of 3d-grid * output array with dimension dim[3] at start position start[3]. 
* @@ -47,8 +59,10 @@ * \param size Dimensions of the block * \param dim Dimensions of the output grid. */ -static void p3m_add_block(double const *in, double *out, int const start[3], - int const size[3], int const dim[3]) { +template +static void p3m_add_block(FloatType const *in, FloatType *out, + int const start[3], int const size[3], + int const dim[3]) { /* fast, mid and slow changing indices */ int f, m, s; /* linear index of in grid, linear index of out grid */ @@ -71,8 +85,9 @@ static void p3m_add_block(double const *in, double *out, int const start[3], } } -void p3m_send_mesh::resize(boost::mpi::communicator const &comm, - P3MLocalMesh const &local_mesh) { +template +void p3m_send_mesh::resize(boost::mpi::communicator const &comm, + P3MLocalMesh const &local_mesh) { int done[3] = {0, 0, 0}; /* send grids */ for (int i = 0; i < 3; i++) { @@ -101,8 +116,7 @@ void p3m_send_mesh::resize(boost::mpi::communicator const &comm, s_dim[i][j] = s_ur[i][j] - s_ld[i][j]; s_size[i] *= s_dim[i][j]; } - if (s_size[i] > max) - max = s_size[i]; + max = std::max(max, s_size[i]); } /* communication */ auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm); @@ -112,15 +126,14 @@ void p3m_send_mesh::resize(boost::mpi::communicator const &comm, auto const j = (i % 2 == 0) ? i + 1 : i - 1; if (node_neighbors[i] != comm.rank()) { - MPI_Sendrecv(&(local_mesh.margin[i]), 1, MPI_INT, node_neighbors[i], - REQ_P3M_INIT, &(r_margin[j]), 1, MPI_INT, node_neighbors[j], - REQ_P3M_INIT, comm, MPI_STATUS_IGNORE); + mesh_sendrecv(&(local_mesh.margin[i]), 1, node_neighbors[i], + &(r_margin[j]), 1, node_neighbors[j], comm, REQ_P3M_INIT); } else { r_margin[j] = local_mesh.margin[i]; } } /* recv grids */ - for (int i = 0; i < 3; i++) + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { if (j == i) { r_ld[i * 2][j] = s_ld[i * 2][j] + local_mesh.margin[2 * j]; @@ -135,20 +148,21 @@ void p3m_send_mesh::resize(boost::mpi::communicator const &comm, r_ur[(i * 2) + 1][j] = s_ur[(i * 2) + 1][j]; } } + } for (int i = 0; i < 6; i++) { r_size[i] = 1; for (int j = 0; j < 3; j++) { r_dim[i][j] = r_ur[i][j] - r_ld[i][j]; r_size[i] *= r_dim[i][j]; } - if (r_size[i] > max) - max = r_size[i]; + max = std::max(max, r_size[i]); } } -void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, - std::span meshes, - Utils::Vector3i const &dim) { +template +void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, + std::span meshes, + Utils::Vector3i const &dim) { auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm); send_grid.resize(max * meshes.size()); recv_grid.resize(max * meshes.size()); @@ -158,19 +172,20 @@ void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, auto const r_dir = (s_dir % 2 == 0) ? 
s_dir + 1 : s_dir - 1; /* pack send block */ - if (s_size[s_dir] > 0) + if (s_size[s_dir] > 0) { for (std::size_t i = 0; i < meshes.size(); i++) { fft::fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir], s_ld[s_dir], s_dim[s_dir], dim.data(), 1); } + } /* communication */ if (node_neighbors[s_dir] != comm.rank()) { - MPI_Sendrecv( - send_grid.data(), static_cast(meshes.size()) * s_size[s_dir], - MPI_DOUBLE, node_neighbors[s_dir], REQ_P3M_GATHER, recv_grid.data(), - static_cast(meshes.size()) * r_size[r_dir], MPI_DOUBLE, - node_neighbors[r_dir], REQ_P3M_GATHER, comm, MPI_STATUS_IGNORE); + auto const send_size = static_cast(meshes.size()) * s_size[s_dir]; + auto const recv_size = static_cast(meshes.size()) * r_size[r_dir]; + mesh_sendrecv(send_grid.data(), send_size, node_neighbors[s_dir], + recv_grid.data(), recv_size, node_neighbors[r_dir], comm, + REQ_P3M_GATHER); } else { std::swap(send_grid, recv_grid); } @@ -184,9 +199,10 @@ void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm, } } -void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, - std::span meshes, - Utils::Vector3i const &dim) { +template +void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, + std::span meshes, + Utils::Vector3i const &dim) { auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm); send_grid.resize(max * meshes.size()); recv_grid.resize(max * meshes.size()); @@ -196,22 +212,23 @@ void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, auto const r_dir = (s_dir % 2 == 0) ? s_dir + 1 : s_dir - 1; /* pack send block */ - if (r_size[r_dir] > 0) + if (r_size[r_dir] > 0) { for (std::size_t i = 0; i < meshes.size(); i++) { fft::fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir], r_ld[r_dir], r_dim[r_dir], dim.data(), 1); } + } /* communication */ if (node_neighbors[r_dir] != comm.rank()) { - MPI_Sendrecv( - send_grid.data(), r_size[r_dir] * static_cast(meshes.size()), - MPI_DOUBLE, node_neighbors[r_dir], REQ_P3M_SPREAD, recv_grid.data(), - s_size[s_dir] * static_cast(meshes.size()), MPI_DOUBLE, - node_neighbors[s_dir], REQ_P3M_SPREAD, comm, MPI_STATUS_IGNORE); + auto const send_size = static_cast(meshes.size()) * r_size[r_dir]; + auto const recv_size = static_cast(meshes.size()) * s_size[s_dir]; + mesh_sendrecv(send_grid.data(), send_size, node_neighbors[r_dir], + recv_grid.data(), recv_size, node_neighbors[s_dir], comm, + REQ_P3M_SPREAD); } else { std::swap(send_grid, recv_grid); } - /* un pack recv block */ + /* unpack recv block */ if (s_size[s_dir] > 0) { for (std::size_t i = 0; i < meshes.size(); i++) { fft::fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i], @@ -221,4 +238,7 @@ void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm, } } +template class p3m_send_mesh; +template class p3m_send_mesh; + #endif // defined(P3M) or defined(DP3M) diff --git a/src/core/p3m/send_mesh.hpp b/src/core/p3m/send_mesh.hpp index 8fa73435880..fbbb1126579 100644 --- a/src/core/p3m/send_mesh.hpp +++ b/src/core/p3m/send_mesh.hpp @@ -34,8 +34,8 @@ #include #include -/** Structure for send/recv meshes. */ -class p3m_send_mesh { +/** @brief P3M halo communicator. */ +template class p3m_send_mesh { enum Requests { REQ_P3M_INIT = 200, REQ_P3M_GATHER = 201, @@ -43,17 +43,17 @@ class p3m_send_mesh { }; /** dimension of sub meshes to send. */ int s_dim[6][3]; - /** left down corners of sub meshes to send. */ + /** lower left corners of sub meshes to send. */ int s_ld[6][3]; - /** up right corners of sub meshes to send. 
*/ + /** upper right corners of sub meshes to send. */ int s_ur[6][3]; /** sizes for send buffers. */ int s_size[6]; /** dimension of sub meshes to recv. */ int r_dim[6][3]; - /** left down corners of sub meshes to recv. */ + /** lower left corners of sub meshes to recv. */ int r_ld[6][3]; - /** up right corners of sub meshes to recv. */ + /** upper right corners of sub meshes to recv. */ int r_ur[6][3]; /** sizes for recv buffers. */ int r_size[6]; @@ -61,22 +61,22 @@ class p3m_send_mesh { int max; /** vector to store grid points to send. */ - std::vector send_grid; + std::vector send_grid; /** vector to store grid points to recv */ - std::vector recv_grid; + std::vector recv_grid; public: void resize(boost::mpi::communicator const &comm, P3MLocalMesh const &local_mesh); void gather_grid(boost::mpi::communicator const &comm, - std::span meshes, Utils::Vector3i const &dim); - void gather_grid(boost::mpi::communicator const &comm, double *mesh, + std::span meshes, Utils::Vector3i const &dim); + void gather_grid(boost::mpi::communicator const &comm, FloatType *mesh, Utils::Vector3i const &dim) { gather_grid(comm, std::span(&mesh, 1u), dim); } void spread_grid(boost::mpi::communicator const &comm, - std::span meshes, Utils::Vector3i const &dim); - void spread_grid(boost::mpi::communicator const &comm, double *mesh, + std::span meshes, Utils::Vector3i const &dim); + void spread_grid(boost::mpi::communicator const &comm, FloatType *mesh, Utils::Vector3i const &dim) { spread_grid(comm, std::span(&mesh, 1u), dim); } diff --git a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp index a5f22987bd8..b14ecd19bb7 100644 --- a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp +++ b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp @@ -41,10 +41,13 @@ namespace utf = boost::unit_test; #include "cuda/utils.hpp" #include "electrostatics/coulomb.hpp" #include "electrostatics/p3m.hpp" +#include "electrostatics/p3m.impl.hpp" #include "galilei/Galilei.hpp" #include "integrate.hpp" #include "integrators/Propagation.hpp" #include "magnetostatics/dipoles.hpp" +#include "magnetostatics/dp3m.hpp" +#include "magnetostatics/dp3m.impl.hpp" #include "nonbonded_interactions/lj.hpp" #include "nonbonded_interactions/nonbonded_interaction_data.hpp" #include "observables/ParticleVelocities.hpp" @@ -299,8 +302,8 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) { #ifdef P3M { // add charges - set_particle_property(pid1, &Particle::q, +1.); - set_particle_property(pid2, &Particle::q, -1.); + set_particle_property(pid1, &Particle::q, +0.5); + set_particle_property(pid2, &Particle::q, -0.5); // set up P3M auto const prefactor = 2.; @@ -312,13 +315,12 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) { 5, 0.615, 1e-3}; - auto solver = - std::make_shared(std::move(p3m), prefactor, 1, false, true); - solver->p3m.make_fft_instance(false); + auto solver = new_p3m_handle( + std::move(p3m), prefactor, 1, false, true); add_actor(comm, espresso::system, system.coulomb.impl->solver, solver, [&system]() { system.on_coulomb_change(); }); - // measure energies + // measure energies and forces auto const step = 0.02; auto const pos1 = start_positions.at(pid1); Utils::Vector3d pos2{box_center, box_center - 0.1, 1.0}; @@ -327,19 +329,95 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) { pos2[0] += step; set_particle_pos(pid2, pos2); auto const r = (pos2 - pos1).norm(); - // check P3M energy auto const 
obs_energy = system.calculate_energy(); + system.integrate(0, INTEG_REUSE_FORCES_CONDITIONALLY); if (rank == 0) { // at very short distances, the real-space contribution to // the energy is much larger than the k-space contribution - auto const energy_ref = -prefactor / r; + auto const energy_ref = -prefactor * 0.25 / r; auto const energy_p3m = obs_energy->coulomb[0] + obs_energy->coulomb[1]; BOOST_CHECK_CLOSE(energy_p3m, energy_ref, 0.002); + auto pf = get_particle_property(pid1, &Particle::force_and_torque); + BOOST_REQUIRE(pf.has_value()); + BOOST_CHECK_GE(std::abs(pf->f[0u]), 0.1); + BOOST_CHECK_LE(std::abs(pf->f[1u]), 1e-12); + BOOST_CHECK_LE(std::abs(pf->f[2u]), 1e-12); + BOOST_CHECK_EQUAL(pf->torque.norm(), 0.); } } + + // deactivate actor + solver->detach_system(espresso::system); + system.coulomb.impl->solver = std::nullopt; + system.on_coulomb_change(); + auto const obs_energy = system.calculate_energy(); + if (rank == 0) { + auto const energy_p3m = obs_energy->coulomb[0] + obs_energy->coulomb[1]; + BOOST_CHECK_EQUAL(energy_p3m, 0.); + } } #endif // P3M + // check magnetostatics +#ifdef DP3M + { + // add dipoles + set_particle_property(pid1, &Particle::dipm, +0.5); + set_particle_property(pid2, &Particle::dipm, -0.5); + + // set up DP3M + auto const prefactor = 2.; + auto p3m = P3MParameters{false, + 0.0, + 3.5, + Utils::Vector3i::broadcast(12), + Utils::Vector3d::broadcast(0.5), + 5, + 0.615, + 1e-3}; + auto solver = new_dp3m_handle( + std::move(p3m), prefactor, 1, false); + add_actor(comm, espresso::system, system.dipoles.impl->solver, solver, + [&system]() { system.on_dipoles_change(); }); + + // measure energies and forces + auto const step = 0.02; + auto const pos1 = start_positions.at(pid1); + Utils::Vector3d pos2{box_center, box_center - 0.1, 1.0}; + for (int i = 0; i < 10; ++i) { + // move particle + pos2[0] += step; + set_particle_pos(pid2, pos2); + auto const r = (pos2 - pos1).norm(); + auto const obs_energy = system.calculate_energy(); + system.integrate(0, INTEG_REUSE_FORCES_CONDITIONALLY); + if (rank == 0) { + // at very short distances, the real-space contribution to + // the energy is much larger than the k-space contribution + auto const energy_ref = -prefactor * 0.25 / Utils::int_pow<3>(r); + auto const energy_p3m = obs_energy->dipolar[0] + obs_energy->dipolar[1]; + BOOST_CHECK_CLOSE(energy_p3m, energy_ref, 0.002); + auto pf = get_particle_property(pid1, &Particle::force_and_torque); + BOOST_REQUIRE(pf.has_value()); + BOOST_CHECK_GE(std::abs(pf->f[0u]), 0.1); + BOOST_CHECK_LE(std::abs(pf->f[1u]), 1e-12); + BOOST_CHECK_LE(std::abs(pf->f[2u]), 1e-12); + BOOST_CHECK_LE(pf->torque.norm(), 1e-12); + } + } + + // deactivate actor + solver->detach_system(espresso::system); + system.dipoles.impl->solver = std::nullopt; + system.on_dipoles_change(); + auto const obs_energy = system.calculate_energy(); + if (rank == 0) { + auto const energy_p3m = obs_energy->dipolar[0] + obs_energy->dipolar[1]; + BOOST_CHECK_EQUAL(energy_p3m, 0.); + } + } +#endif // DP3M + // check integration { // set up velocity-Verlet integrator diff --git a/src/core/unit_tests/ParticleFactory.hpp b/src/core/unit_tests/ParticleFactory.hpp index 71f2d14bf1b..861087b0137 100644 --- a/src/core/unit_tests/ParticleFactory.hpp +++ b/src/core/unit_tests/ParticleFactory.hpp @@ -26,6 +26,7 @@ #include +#include <optional> /** Fixture to create particles during a test and remove them at the end. 
*/ @@ -61,15 +62,26 @@ struct ParticleFactory { } template <class T> - void set_particle_property(int p_id, T &(Particle::*setter)(), + void set_particle_property(int p_id, T &(Particle::*getter)(), T const &value) const { auto &system = System::get_system(); auto p = system.cell_structure->get_local_particle(p_id); if (p != nullptr) { - (p->*setter)() = value; + (p->*getter)() = value; } } + template <class T> + auto get_particle_property(int p_id, T &(Particle::*getter)()) const { + auto &system = System::get_system(); + auto p = system.cell_structure->get_local_particle(p_id); + std::optional<T> opt = std::nullopt; + if (p != nullptr) { + opt = (p->*getter)(); + } + return opt; + } + void clear_particles() { for (auto pid : particle_cache) { remove_particle(pid); diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp index 94cfc33f7fc..129e7fd9ea8 100644 --- a/src/core/unit_tests/fft_test.cpp +++ b/src/core/unit_tests/fft_test.cpp @@ -157,8 +157,8 @@ BOOST_AUTO_TEST_CASE(fft_map_grid) { BOOST_AUTO_TEST_CASE(fft_exceptions) { auto constexpr size_max = std::numeric_limits<std::size_t>::max(); - auto constexpr bad_size = size_max / sizeof(int) + 1ul; - fft::allocator allocator{}; + auto constexpr bad_size = size_max / sizeof(float) + 1ul; + fft::allocator<float> allocator{}; BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr); BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length); } diff --git a/src/python/espressomd/electrostatics.py b/src/python/espressomd/electrostatics.py index a197c56528f..87b23f0090c 100644 --- a/src/python/espressomd/electrostatics.py +++ b/src/python/espressomd/electrostatics.py @@ -219,12 +219,17 @@ class P3M(_P3MBase): check_complex_residuals: :obj:`bool`, optional Raise a warning if the backward Fourier transform has non-zero complex residuals when set to ``True`` (default). + single_precision : :obj:`bool` + Use single-precision floating-point arithmetic. """ _so_name = "Coulomb::CoulombP3M" _so_creation_policy = "GLOBAL" _so_features = ("P3M",) + def default_params(self): + return {"single_precision": False, **super().default_params()} + @script_interface_register class P3MGPU(_P3MBase): @@ -275,6 +280,9 @@ class P3MGPU(_P3MBase): _so_creation_policy = "GLOBAL" _so_features = ("P3M", "CUDA") + def default_params(self): + return {"single_precision": True, **super().default_params()} + @script_interface_register class ELC(ElectrostaticInteraction): diff --git a/src/python/espressomd/magnetostatics.py b/src/python/espressomd/magnetostatics.py index ca611d6b461..0132dd32a05 100644 --- a/src/python/espressomd/magnetostatics.py +++ b/src/python/espressomd/magnetostatics.py @@ -106,6 +106,8 @@ class DipolarP3M(MagnetostaticInteraction): (default is ``True``, i.e., activated). timings : :obj:`int` Number of force calculations during tuning. + single_precision : :obj:`bool` + Use single-precision floating-point arithmetic. 
""" _so_name = "Dipoles::DipolarP3M" @@ -152,6 +154,7 @@ def default_params(self): "epsilon": 0.0, "mesh_off": [0.5, 0.5, 0.5], "prefactor": 0., + "single_precision": False, "tune": True, "timings": 10, "verbose": True} diff --git a/src/script_interface/electrostatics/Actor_impl.hpp b/src/script_interface/electrostatics/Actor.impl.hpp similarity index 100% rename from src/script_interface/electrostatics/Actor_impl.hpp rename to src/script_interface/electrostatics/Actor.impl.hpp diff --git a/src/script_interface/electrostatics/CoulombP3M.hpp b/src/script_interface/electrostatics/CoulombP3M.hpp index 34ea03c20e1..78d1e4d8e24 100644 --- a/src/script_interface/electrostatics/CoulombP3M.hpp +++ b/src/script_interface/electrostatics/CoulombP3M.hpp @@ -26,6 +26,7 @@ #include "Actor.hpp" #include "core/electrostatics/p3m.hpp" +#include "core/electrostatics/p3m.impl.hpp" #include "core/p3m/FFTBackendLegacy.hpp" #include "script_interface/get_value.hpp" @@ -36,12 +37,26 @@ namespace ScriptInterface { namespace Coulomb { -class CoulombP3M : public Actor { +template +class CoulombP3M : public Actor, ::CoulombP3M> { bool m_tune; + bool m_single_precision; + +public: + using Base = Actor, ::CoulombP3M>; + using Base::actor; + using Base::add_parameters; + using Base::context; + +protected: + using Base::m_actor; + using Base::set_charge_neutrality_tolerance; public: CoulombP3M() { add_parameters({ + {"single_precision", AutoParameter::read_only, + [this]() { return m_single_precision; }}, {"alpha_L", AutoParameter::read_only, [this]() { return actor()->p3m.params.alpha_L; }}, {"r_cut_iL", AutoParameter::read_only, @@ -76,7 +91,12 @@ class CoulombP3M : public Actor { void do_construct(VariantMap const ¶ms) override { m_tune = get_value(params, "tune"); + m_single_precision = get_value(params, "single_precision"); context()->parallel_try_catch([&]() { + if (Architecture == Arch::GPU and not m_single_precision) { + throw std::invalid_argument( + "P3M GPU only implemented in single-precision mode"); + } auto p3m = P3MParameters{!get_value_or(params, "is_tuned", !m_tune), get_value(params, "epsilon"), get_value(params, "r_cut"), @@ -85,14 +105,26 @@ class CoulombP3M : public Actor { get_value(params, "cao"), get_value(params, "alpha"), get_value(params, "accuracy")}; - m_actor = std::make_shared( - std::move(p3m), get_value(params, "prefactor"), - get_value(params, "timings"), get_value(params, "verbose"), - get_value(params, "check_complex_residuals")); - m_actor->p3m.make_fft_instance(false); + make_handle(m_single_precision, std::move(p3m), + get_value(params, "prefactor"), + get_value(params, "timings"), + get_value(params, "verbose"), + get_value(params, "check_complex_residuals")); }); set_charge_neutrality_tolerance(params); } + +private: + template + void make_handle(bool single_precision, Args &&...args) { + if (single_precision) { + m_actor = new_p3m_handle( + std::forward(args)...); + } else { + m_actor = new_p3m_handle( + std::forward(args)...); + } + } }; } // namespace Coulomb diff --git a/src/script_interface/electrostatics/CoulombP3MGPU.hpp b/src/script_interface/electrostatics/CoulombP3MGPU.hpp deleted file mode 100644 index 187fde4308c..00000000000 --- a/src/script_interface/electrostatics/CoulombP3MGPU.hpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (C) 2022 The ESPResSo project - * - * This file is part of ESPResSo. 
- * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include "config/config.hpp" - -#ifdef P3M -#ifdef CUDA - -#include "Actor.hpp" - -#include "core/electrostatics/p3m_gpu.hpp" -#include "core/p3m/FFTBackendLegacy.hpp" - -#include "script_interface/get_value.hpp" - -#include -#include - -namespace ScriptInterface { -namespace Coulomb { - -class CoulombP3MGPU : public Actor { - bool m_tune; - -public: - CoulombP3MGPU() { - add_parameters({ - {"alpha_L", AutoParameter::read_only, - [this]() { return actor()->p3m.params.alpha_L; }}, - {"r_cut_iL", AutoParameter::read_only, - [this]() { return actor()->p3m.params.r_cut_iL; }}, - {"mesh", AutoParameter::read_only, - [this]() { return actor()->p3m.params.mesh; }}, - {"mesh_off", AutoParameter::read_only, - [this]() { return actor()->p3m.params.mesh_off; }}, - {"cao", AutoParameter::read_only, - [this]() { return actor()->p3m.params.cao; }}, - {"accuracy", AutoParameter::read_only, - [this]() { return actor()->p3m.params.accuracy; }}, - {"epsilon", AutoParameter::read_only, - [this]() { return actor()->p3m.params.epsilon; }}, - {"a", AutoParameter::read_only, - [this]() { return actor()->p3m.params.a; }}, - {"alpha", AutoParameter::read_only, - [this]() { return actor()->p3m.params.alpha; }}, - {"r_cut", AutoParameter::read_only, - [this]() { return actor()->p3m.params.r_cut; }}, - {"is_tuned", AutoParameter::read_only, - [this]() { return actor()->is_tuned(); }}, - {"verbose", AutoParameter::read_only, - [this]() { return actor()->tune_verbose; }}, - {"timings", AutoParameter::read_only, - [this]() { return actor()->tune_timings; }}, - {"tune", AutoParameter::read_only, [this]() { return m_tune; }}, - {"check_complex_residuals", AutoParameter::read_only, - [this]() { return actor()->check_complex_residuals; }}, - }); - } - - void do_construct(VariantMap const ¶ms) override { - m_tune = get_value(params, "tune"); - context()->parallel_try_catch([&]() { - auto p3m = P3MParameters{!get_value_or(params, "is_tuned", !m_tune), - get_value(params, "epsilon"), - get_value(params, "r_cut"), - get_value(params, "mesh"), - get_value(params, "mesh_off"), - get_value(params, "cao"), - get_value(params, "alpha"), - get_value(params, "accuracy")}; - m_actor = std::make_shared( - std::move(p3m), get_value(params, "prefactor"), - get_value(params, "timings"), get_value(params, "verbose"), - get_value(params, "check_complex_residuals")); - m_actor->p3m.make_fft_instance(false); // for CPU part - }); - set_charge_neutrality_tolerance(params); - } -}; - -} // namespace Coulomb -} // namespace ScriptInterface - -#endif // CUDA -#endif // P3M diff --git a/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp b/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp index b009a5bf204..b31ff09d45e 100644 --- a/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp +++ b/src/script_interface/electrostatics/ElectrostaticLayerCorrection.hpp @@ 
-45,9 +45,9 @@ class ElectrostaticLayerCorrection using BaseSolver = boost::variant< #ifdef CUDA - std::shared_ptr, + std::shared_ptr>, #endif // CUDA - std::shared_ptr>; + std::shared_ptr>>; BaseSolver m_solver; void on_bind_system(::System::System &system) override { @@ -88,19 +88,20 @@ class ElectrostaticLayerCorrection auto so_ptr = get_value(params, "actor"); context()->parallel_try_catch([&]() { #ifdef CUDA - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else + if (auto so = std::dynamic_pointer_cast>(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } #endif // CUDA - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else { - throw std::invalid_argument("Parameter 'actor' of type " + - so_ptr->name().to_string() + - " isn't supported by ELC"); - } + if (auto so = std::dynamic_pointer_cast>(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } + throw std::invalid_argument("Parameter 'actor' of type " + + so_ptr->name().to_string() + + " isn't supported by ELC"); }); context()->parallel_try_catch([&]() { auto elc = elc_data{get_value(params, "maxPWerror"), diff --git a/src/script_interface/electrostatics/initialize.cpp b/src/script_interface/electrostatics/initialize.cpp index 476ea4ca9c5..b34c086c2a3 100644 --- a/src/script_interface/electrostatics/initialize.cpp +++ b/src/script_interface/electrostatics/initialize.cpp @@ -23,12 +23,11 @@ #ifdef ELECTROSTATICS -#include "Actor_impl.hpp" +#include "Actor.impl.hpp" #include "Container.hpp" #include "CoulombMMM1D.hpp" #include "CoulombP3M.hpp" -#include "CoulombP3MGPU.hpp" #include "CoulombScafacos.hpp" #include "DebyeHueckel.hpp" #include "ElectrostaticLayerCorrection.hpp" @@ -50,9 +49,9 @@ void initialize(Utils::Factory *om) { #ifdef ELECTROSTATICS om->register_new("Coulomb::DebyeHueckel"); #ifdef P3M - om->register_new("Coulomb::CoulombP3M"); + om->register_new>("Coulomb::CoulombP3M"); #ifdef CUDA - om->register_new("Coulomb::CoulombP3MGPU"); + om->register_new>("Coulomb::CoulombP3MGPU"); #endif om->register_new( "Coulomb::ElectrostaticLayerCorrection"); diff --git a/src/script_interface/magnetostatics/Actor_impl.hpp b/src/script_interface/magnetostatics/Actor.impl.hpp similarity index 100% rename from src/script_interface/magnetostatics/Actor_impl.hpp rename to src/script_interface/magnetostatics/Actor.impl.hpp diff --git a/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp b/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp index 3a32a80bc44..55cce422e52 100644 --- a/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp +++ b/src/script_interface/magnetostatics/DipolarLayerCorrection.hpp @@ -45,7 +45,7 @@ class DipolarLayerCorrection using DipolarDSR = DipolarDirectSum; using BaseSolver = boost::variant< #ifdef DP3M - std::shared_ptr, + std::shared_ptr>, #endif std::shared_ptr>; BaseSolver m_solver; @@ -78,19 +78,20 @@ class DipolarLayerCorrection auto so_ptr = get_value(params, "actor"); context()->parallel_try_catch([&]() { #ifdef DP3M - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else + if (auto so = std::dynamic_pointer_cast>(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } #endif // DP3M - if (auto so_solver = std::dynamic_pointer_cast(so_ptr)) { - solver = so_solver->actor(); - m_solver = so_solver; - } else { - throw 
std::invalid_argument("Parameter 'actor' of type " + - so_ptr->name().to_string() + - " isn't supported by DLC"); - } + if (auto so = std::dynamic_pointer_cast(so_ptr)) { + solver = so->actor(); + m_solver = so; + return; + } + throw std::invalid_argument("Parameter 'actor' of type " + + so_ptr->name().to_string() + + " isn't supported by DLC"); }); context()->parallel_try_catch([&]() { auto dlc = dlc_data(get_value(params, "maxPWerror"), diff --git a/src/script_interface/magnetostatics/DipolarP3M.hpp b/src/script_interface/magnetostatics/DipolarP3M.hpp index 7474345e9d4..20dff5968ee 100644 --- a/src/script_interface/magnetostatics/DipolarP3M.hpp +++ b/src/script_interface/magnetostatics/DipolarP3M.hpp @@ -26,6 +26,7 @@ #include "Actor.hpp" #include "core/magnetostatics/dp3m.hpp" +#include "core/magnetostatics/dp3m.impl.hpp" #include "core/p3m/FFTBackendLegacy.hpp" #include "script_interface/get_value.hpp" @@ -35,12 +36,25 @@ namespace ScriptInterface { namespace Dipoles { -class DipolarP3M : public Actor { +template +class DipolarP3M : public Actor, ::DipolarP3M> { bool m_tune; + bool m_single_precision; + +public: + using Base = Actor, ::DipolarP3M>; + using Base::actor; + using Base::add_parameters; + using Base::context; + +protected: + using Base::m_actor; public: DipolarP3M() { add_parameters({ + {"single_precision", AutoParameter::read_only, + [this]() { return m_single_precision; }}, {"alpha_L", AutoParameter::read_only, [this]() { return actor()->dp3m.params.alpha_L; }}, {"r_cut_iL", AutoParameter::read_only, @@ -73,6 +87,8 @@ class DipolarP3M : public Actor { void do_construct(VariantMap const ¶ms) override { m_tune = get_value(params, "tune"); + m_single_precision = get_value(params, "single_precision"); + static_assert(Architecture == Arch::CPU, "GPU not implemented"); context()->parallel_try_catch([&]() { auto p3m = P3MParameters{!get_value_or(params, "is_tuned", !m_tune), get_value(params, "epsilon"), @@ -82,13 +98,24 @@ class DipolarP3M : public Actor { get_value(params, "cao"), get_value(params, "alpha"), get_value(params, "accuracy")}; - m_actor = std::make_shared( - std::move(p3m), get_value(params, "prefactor"), - get_value(params, "timings"), - get_value(params, "verbose")); - m_actor->dp3m.make_fft_instance(true); + make_handle(m_single_precision, std::move(p3m), + get_value(params, "prefactor"), + get_value(params, "timings"), + get_value(params, "verbose")); }); } + +private: + template + void make_handle(bool single_precision, Args &&...args) { + if (single_precision) { + m_actor = new_dp3m_handle( + std::forward(args)...); + } else { + m_actor = new_dp3m_handle( + std::forward(args)...); + } + } }; } // namespace Dipoles diff --git a/src/script_interface/magnetostatics/initialize.cpp b/src/script_interface/magnetostatics/initialize.cpp index 04b22818ed5..26265863b05 100644 --- a/src/script_interface/magnetostatics/initialize.cpp +++ b/src/script_interface/magnetostatics/initialize.cpp @@ -16,13 +16,14 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ + #include "initialize.hpp" #include "config/config.hpp" #ifdef DIPOLES -#include "Actor_impl.hpp" +#include "Actor.impl.hpp" #include "Container.hpp" #include "DipolarDirectSum.hpp" @@ -49,7 +50,7 @@ void initialize(Utils::Factory *om) { om->register_new("Dipoles::DipolarDirectSumGpu"); #endif #ifdef DP3M - om->register_new("Dipoles::DipolarP3M"); + om->register_new>("Dipoles::DipolarP3M"); #endif #ifdef SCAFACOS_DIPOLES om->register_new("Dipoles::DipolarScafacos"); diff --git a/src/script_interface/tests/Actors_test.cpp b/src/script_interface/tests/Actors_test.cpp index 591864587f8..e02364c8e22 100644 --- a/src/script_interface/tests/Actors_test.cpp +++ b/src/script_interface/tests/Actors_test.cpp @@ -28,9 +28,9 @@ #include #include "script_interface/electrostatics/Actor.hpp" -#include "script_interface/electrostatics/Actor_impl.hpp" +#include "script_interface/electrostatics/Actor.impl.hpp" #include "script_interface/magnetostatics/Actor.hpp" -#include "script_interface/magnetostatics/Actor_impl.hpp" +#include "script_interface/magnetostatics/Actor.impl.hpp" #include "script_interface/scafacos/scafacos.hpp" #include "core/actor/visitors.hpp" diff --git a/testsuite/python/coulomb_interface.py b/testsuite/python/coulomb_interface.py index fc40860e4a2..55b89100b60 100644 --- a/testsuite/python/coulomb_interface.py +++ b/testsuite/python/coulomb_interface.py @@ -159,6 +159,7 @@ def test_mmm1d_cpu_tuning_exceptions(self): def test_elc_p3m_exceptions(self): P3M = espressomd.electrostatics.P3M ELC = espressomd.electrostatics.ELC + P3MGPU = espressomd.electrostatics.P3MGPU # create valid solvers dh = espressomd.electrostatics.DH(prefactor=1.2, kappa=0.8, r_cut=2.0) p3m_params = dict(prefactor=1., epsilon=0.1, accuracy=1e-2, @@ -173,6 +174,9 @@ def test_elc_p3m_exceptions(self): P3M(**{**p3m_params, 'timings': -2}) with self.assertRaisesRegex(ValueError, "Parameter 'mesh' has to be an integer or integer list of length 3"): P3M(**{**p3m_params, 'mesh': [8, 8]}) + if espressomd.has_features(["CUDA"]) and espressomd.gpu_available(): + with self.assertRaisesRegex(ValueError, "P3M GPU only implemented in single-precision mode"): + P3MGPU(single_precision=False, **p3m_params) with self.assertRaisesRegex(ValueError, "Parameter 'actor' of type Coulomb::ElectrostaticLayerCorrection isn't supported by ELC"): ELC(gap_size=2., maxPWerror=1., actor=elc) with self.assertRaisesRegex(ValueError, "Parameter 'actor' of type Coulomb::DebyeHueckel isn't supported by ELC"): diff --git a/testsuite/python/p3m_madelung.py b/testsuite/python/p3m_madelung.py index f2a01e08bd6..af98348672d 100644 --- a/testsuite/python/p3m_madelung.py +++ b/testsuite/python/p3m_madelung.py @@ -98,13 +98,16 @@ def check(): energy_per_dip = self.get_normalized_obs_per_dipole(dipm, spacing) np.testing.assert_allclose(energy_per_dip, mc, atol=0., rtol=2e-9) - dp3m = espressomd.magnetostatics.DipolarP3M(**dp3m_params) - self.system.magnetostatics.solver = dp3m - check() - self.system.magnetostatics.clear() - mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) - self.system.magnetostatics.solver = mdlc - check() + for single_precision in [False, True]: + with self.subTest(msg=f"DP3M CPU {single_precision=}"): + dp3m = espressomd.magnetostatics.DipolarP3M( + single_precision=single_precision, **dp3m_params) + self.system.magnetostatics.solver = dp3m + check() + self.system.magnetostatics.clear() + mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) + self.system.magnetostatics.solver = mdlc + check() 
@utx.skipIfMissingFeatures(["DP3M"]) def test_infinite_magnetic_sheet(self): @@ -142,13 +145,16 @@ def check(): energy_per_dip = self.get_normalized_obs_per_dipole(dipm, spacing) np.testing.assert_allclose(energy_per_dip, mc, atol=0., rtol=4e-6) - dp3m = espressomd.magnetostatics.DipolarP3M(**dp3m_params) - self.system.magnetostatics.solver = dp3m - check() - self.system.magnetostatics.clear() - mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) - self.system.magnetostatics.solver = mdlc - check() + for single_precision in [False, True]: + with self.subTest(msg=f"DP3M CPU {single_precision=}"): + dp3m = espressomd.magnetostatics.DipolarP3M( + single_precision=single_precision, **dp3m_params) + self.system.magnetostatics.solver = dp3m + check() + self.system.magnetostatics.clear() + mdlc = espressomd.magnetostatics.DLC(actor=dp3m, **mdlc_params) + self.system.magnetostatics.solver = mdlc + check() @utx.skipIfMissingFeatures(["DP3M"]) def test_infinite_magnetic_cube(self): @@ -186,9 +192,12 @@ def check(): energy_per_dip = self.get_normalized_obs_per_dipole(dipm, spacing) np.testing.assert_allclose(energy_per_dip, mc, atol=0., rtol=3e-4) - dp3m = espressomd.magnetostatics.DipolarP3M(**dp3m_params) - self.system.magnetostatics.solver = dp3m - check() + for single_precision in [False, True]: + with self.subTest(msg=f"DP3M CPU {single_precision=}"): + dp3m = espressomd.magnetostatics.DipolarP3M( + single_precision=single_precision, **dp3m_params) + self.system.magnetostatics.solver = dp3m + check() @utx.skipIfMissingFeatures(["P3M"]) def test_infinite_ionic_wire(self): @@ -212,13 +221,17 @@ def check(): np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, rtol=1e-6) - p3m = espressomd.electrostatics.P3M(**p3m_params) - self.system.electrostatics.solver = p3m - check() + for single_precision in [False, True]: + with self.subTest(msg=f"P3M CPU {single_precision=}"): + p3m = espressomd.electrostatics.P3M( + single_precision=single_precision, **p3m_params) + self.system.electrostatics.solver = p3m + check() if espressomd.has_features("CUDA") and espressomd.gpu_available(): - p3m = espressomd.electrostatics.P3MGPU(**p3m_params) - self.system.electrostatics.solver = p3m - check() + with self.subTest(msg=f"P3M GPU single_precision=True"): + p3m = espressomd.electrostatics.P3MGPU(**p3m_params) + self.system.electrostatics.solver = p3m + check() @utx.skipIfMissingFeatures(["P3M"]) def test_infinite_ionic_sheet(self): @@ -245,19 +258,23 @@ def check(pressure=True): np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, rtol=5e-7) - p3m = espressomd.electrostatics.P3M(**p3m_params) - self.system.electrostatics.solver = p3m - check(pressure=True) - elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) - self.system.electrostatics.solver = elc - check(pressure=False) + for single_precision in [False, True]: + with self.subTest(msg=f"P3M CPU {single_precision=}"): + p3m = espressomd.electrostatics.P3M( + single_precision=single_precision, **p3m_params) + self.system.electrostatics.solver = p3m + check(pressure=True) + elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) + self.system.electrostatics.solver = elc + check(pressure=False) if espressomd.has_features("CUDA") and espressomd.gpu_available(): - p3m = espressomd.electrostatics.P3MGPU(**p3m_params) - self.system.electrostatics.solver = p3m - check(pressure=True) - elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) - self.system.electrostatics.solver = elc - check(pressure=False) + with 
self.subTest(msg=f"P3M GPU single_precision=True"): + p3m = espressomd.electrostatics.P3MGPU(**p3m_params) + self.system.electrostatics.solver = p3m + check(pressure=True) + elc = espressomd.electrostatics.ELC(actor=p3m, **elc_params) + self.system.electrostatics.solver = elc + check(pressure=False) @utx.skipIfMissingFeatures(["P3M"]) def test_infinite_ionic_cube(self): @@ -269,26 +286,30 @@ def test_infinite_ionic_cube(self): mc = -1.74756459463318219 base = [1., 1., 1.] ref_energy, ref_pressure = self.get_reference_obs_per_ion(mc, base) - p3m_params = dict(prefactor=1., accuracy=3e-7, mesh=52, r_cut=4.4, cao=7, - alpha=0.8895166, tune=False) + p3m_params = dict(prefactor=1., accuracy=1e-9, mesh=72, r_cut=5.6, + cao=7, alpha=0.8163977, tune=False) def check(): energy, p_scalar, p_tensor = self.get_normalized_obs_per_ion() np.testing.assert_allclose(energy, ref_energy, atol=0., rtol=1e-6) np.testing.assert_allclose(p_scalar, np.trace(ref_pressure) / 3., atol=1e-12, rtol=5e-6) - np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, + np.testing.assert_allclose(p_tensor, ref_pressure, atol=5e-9, rtol=5e-6) - p3m = espressomd.electrostatics.P3M(**p3m_params) - self.system.electrostatics.solver = p3m - check() + for single_precision in [False, True]: + with self.subTest(msg=f"P3M CPU {single_precision=}"): + p3m = espressomd.electrostatics.P3M( + single_precision=single_precision, **p3m_params) + self.system.electrostatics.solver = p3m + check() if espressomd.has_features("CUDA") and espressomd.gpu_available(): - p3m_params = dict(prefactor=1., accuracy=3e-7, mesh=42, r_cut=5.5, - cao=7, alpha=0.709017, tune=False) - p3m = espressomd.electrostatics.P3MGPU(**p3m_params) - self.system.electrostatics.solver = p3m - check() + with self.subTest(msg=f"P3M GPU single_precision=True"): + p3m_params = dict(prefactor=1., accuracy=3e-7, mesh=42, + r_cut=5.5, cao=7, alpha=0.709017, tune=False) + p3m = espressomd.electrostatics.P3MGPU(**p3m_params) + self.system.electrostatics.solver = p3m + check() if __name__ == "__main__":
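For reference, a minimal usage sketch of the new single_precision flag exposed by this patch. The system setup below (box size, particle placement, accuracy target) is illustrative only and not taken from the test suite; the P3M/P3MGPU constructors, the single_precision keyword and the solver assignment are the interfaces added or exercised above.

import espressomd
import espressomd.electrostatics

system = espressomd.System(box_l=[10., 10., 10.])
system.time_step = 0.01
system.cell_system.skin = 0.4
# two opposite charges, so the charge-neutrality check passes
system.part.add(pos=[[1., 1., 1.], [6., 6., 6.]], q=[1., -1.])

# CPU solver: double precision by default, single precision on request
p3m = espressomd.electrostatics.P3M(prefactor=1., accuracy=1e-3,
                                    single_precision=True)
system.electrostatics.solver = p3m

# GPU solver: single_precision defaults to True; passing False raises
# "P3M GPU only implemented in single-precision mode"
# p3m_gpu = espressomd.electrostatics.P3MGPU(prefactor=1., accuracy=1e-3)
# system.electrostatics.solver = p3m_gpu

DipolarP3M accepts the same flag and, like the CPU Coulomb solver, defaults to double precision (see the default_params() additions above).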