From 938a703503aa2309b424deded9671ceaca72fb3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Wed, 3 Jul 2024 23:22:07 +0200
Subject: [PATCH 1/2] Extract historic FFT algorithm

Encapsulate the FFT algorithm and move its sources to a new folder.
Reconcile electrostatics and magnetostatics FFT operations. Hide
FFTW3 with type erasure. Alternative FFT backends can now be used.
---
 src/core/CMakeLists.txt                       |   4 +
 src/core/electrostatics/elc.cpp               |   2 +-
 src/core/electrostatics/p3m.cpp               | 148 ++++-----
 src/core/electrostatics/p3m.hpp               |  44 +--
 src/core/fft/CMakeLists.txt                   |  24 ++
 src/core/{p3m => fft}/fft.cpp                 | 311 +++++++++---------
 src/core/{p3m => fft}/fft.hpp                 | 194 +++++------
 src/core/fft/vector.hpp                       |  68 ++++
 src/core/magnetostatics/dp3m.cpp              | 249 ++++++--------
 src/core/magnetostatics/dp3m.hpp              |  50 +--
 src/core/p3m/CMakeLists.txt                   |  14 +-
 src/core/p3m/FFTBackendLegacy.cpp             | 112 +++++++
 src/core/p3m/FFTBackendLegacy.hpp             |  78 +++++
 src/core/p3m/common.cpp                       |   6 +-
 src/core/p3m/common.hpp                       |  45 +--
 src/core/p3m/data_struct.hpp                  |  82 ++++-
 src/core/p3m/for_each_3d.hpp                  |   1 +
 src/core/p3m/influence_function.hpp           |  37 ++-
 src/core/p3m/interpolation.hpp                |   6 +-
 src/core/p3m/send_mesh.cpp                    |  37 ++-
 src/core/p3m/send_mesh.hpp                    |  38 +--
 .../EspressoSystemStandAlone_test.cpp         |   2 +
 src/core/unit_tests/fft_test.cpp              |  19 +-
 .../electrostatics/CoulombP3M.hpp             |   2 +
 .../electrostatics/CoulombP3MGPU.hpp          |   2 +
 .../magnetostatics/DipolarP3M.hpp             |   3 +
 26 files changed, 937 insertions(+), 641 deletions(-)
 create mode 100644 src/core/fft/CMakeLists.txt
 rename src/core/{p3m => fft}/fft.cpp (75%)
 rename src/core/{p3m => fft}/fft.hpp (54%)
 create mode 100644 src/core/fft/vector.hpp
 create mode 100644 src/core/p3m/FFTBackendLegacy.cpp
 create mode 100644 src/core/p3m/FFTBackendLegacy.hpp

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 6153849cc7a..0e7b82024d1 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -95,6 +95,10 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
             $<$<BOOL:${ESPRESSO_BUILD_WITH_CUDA}>:espresso::walberla_cuda>)
 endif()
 
+if(ESPRESSO_BUILD_WITH_FFTW)
+  add_subdirectory(fft)
+endif()
+
 add_subdirectory(accumulators)
 add_subdirectory(analysis)
 add_subdirectory(bond_breakage)
diff --git a/src/core/electrostatics/elc.cpp b/src/core/electrostatics/elc.cpp
index d7832e5f9ee..6c2c3f05313 100644
--- a/src/core/electrostatics/elc.cpp
+++ b/src/core/electrostatics/elc.cpp
@@ -1128,7 +1128,7 @@ void charge_assign(elc_data const &elc, CoulombP3M &solver,
   }
   /* prepare local FFT mesh */
   for (int i = 0; i < solver.p3m.local_mesh.size; i++)
-    solver.p3m.rs_mesh[i] = 0.;
+    solver.p3m.mesh.rs_scalar[i] = 0.;
 
   for (auto zipped : p_q_pos_range) {
     auto const p_q = boost::get<0>(zipped);
diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
index b16cb96f141..5e604a4fd51 100644
--- a/src/core/electrostatics/p3m.cpp
+++ b/src/core/electrostatics/p3m.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -33,9 +33,9 @@
 #include "electrostatics/p3m_gpu.hpp"
 #include "electrostatics/p3m_gpu_error.hpp"
 
+#include "fft/fft.hpp"
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/for_each_3d.hpp"
 #include "p3m/influence_function.hpp"
 #include "p3m/math.hpp"
@@ -106,30 +106,27 @@ void CoulombP3M::count_charged_particles() {
 /** Calculate the optimal influence function of @cite hockney88a.
  *  (optimised for force calculations)
  *
- *  Each node calculates only the values for its domain in k-space
- *  (see fft.plan[3].mesh and fft.plan[3].start).
+ *  Each node calculates only the values for its domain in k-space.
  *
  *  See also: @cite hockney88a eq. 8-22 (p. 275). Note the somewhat
  *  different convention for the prefactors, which is described in
  *  @cite deserno98a @cite deserno98b.
  */
 void CoulombP3M::calc_influence_function_force() {
-  auto const start = Utils::Vector3i{p3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
-
-  p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size,
-                                           get_system().box_geo->length_inv());
+  auto const [KX, KY, KZ] = p3m.fft->get_permutations();
+  p3m.g_force =
+      grid_influence_function<1>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX,
+                                 KY, KZ, get_system().box_geo->length_inv());
 }
 
 /** Calculate the influence function optimized for the energy and the
  *  self energy correction.
  */
 void CoulombP3M::calc_influence_function_energy() {
-  auto const start = Utils::Vector3i{p3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
-
-  p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size,
-                                            get_system().box_geo->length_inv());
+  auto const [KX, KY, KZ] = p3m.fft->get_permutations();
+  p3m.g_energy =
+      grid_influence_function<0>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX,
+                                 KY, KZ, get_system().box_geo->length_inv());
 }
 
 /** Aliasing sum used by @ref p3m_k_space_error. */
@@ -138,8 +135,8 @@ static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift,
                                    Utils::Vector3d const &mesh_i, int cao,
                                    double alpha_L_i) {
 
-  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
+  auto constexpr mesh_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr mesh_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
   auto alias1 = 0.;
   auto alias2 = 0.;
@@ -148,7 +145,7 @@ static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift,
   Utils::Vector3i nm{};
   Utils::Vector3d fnm{};
   for_each_3d(
-      m_start, m_stop, indices,
+      mesh_start, mesh_stop, indices,
       [&]() {
         auto const norm_sq = nm.norm2();
         auto const ex = exp(-factor1 * norm_sq);
@@ -202,14 +199,14 @@ static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh,
   auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
   auto const mesh_i = 1. / Utils::Vector3d(mesh);
   auto const alpha_L_i = 1. / alpha_L;
-  auto const m_stop = mesh / 2;
-  auto const m_start = -m_stop;
+  auto const mesh_stop = mesh / 2;
+  auto const mesh_start = -mesh_stop;
   auto indices = Utils::Vector3i{};
   auto values = Utils::Vector3d{};
   auto he_q = 0.;
 
   for_each_3d(
-      m_start, m_stop, indices,
+      mesh_start, mesh_stop, indices,
       [&]() {
         if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
           auto const n2 = indices.norm2();
@@ -253,18 +250,9 @@ void CoulombP3M::init() {
     elc_layer = actor->elc.space_layer;
   }
 
+  assert(p3m.fft);
   p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo, skin, elc_layer);
-  p3m.sm.resize(comm_cart, p3m.local_mesh);
-
-  int ca_mesh_size = fft_init(p3m.local_mesh.dim, p3m.local_mesh.margin,
-                              p3m.params.mesh, p3m.params.mesh_off, p3m.ks_pnum,
-                              p3m.fft, ::communicator.node_grid, comm_cart);
-  p3m.rs_mesh.resize(ca_mesh_size);
-
-  for (auto &e : p3m.E_mesh) {
-    e.resize(ca_mesh_size);
-  }
-
+  p3m.fft->init_fft();
   p3m.calc_differential_operator();
 
   /* fix box length dependent constants */
@@ -290,7 +278,7 @@ CoulombP3M::CoulombP3M(P3MParameters &&parameters, double prefactor,
 
 namespace {
 template <int cao> struct AssignCharge {
-  void operator()(p3m_data_struct &p3m, double q,
+  void operator()(decltype(CoulombP3M::p3m) &p3m, double q,
                   Utils::Vector3d const &real_pos,
                   p3m_interpolation_cache &inter_weights) {
     auto const w = p3m_calculate_interpolation_weights<cao>(
@@ -299,21 +287,22 @@ template <int cao> struct AssignCharge {
     inter_weights.store(w);
 
     p3m_interpolate(p3m.local_mesh, w, [q, &p3m](int ind, double w) {
-      p3m.rs_mesh[ind] += w * q;
+      p3m.mesh.rs_scalar[ind] += w * q;
     });
   }
 
-  void operator()(p3m_data_struct &p3m, double q,
+  void operator()(decltype(CoulombP3M::p3m) &p3m, double q,
                   Utils::Vector3d const &real_pos) {
     p3m_interpolate(
         p3m.local_mesh,
         p3m_calculate_interpolation_weights<cao>(real_pos, p3m.params.ai,
                                                  p3m.local_mesh),
-        [q, &p3m](int ind, double w) { p3m.rs_mesh[ind] += w * q; });
+        [q, &p3m](int ind, double w) { p3m.mesh.rs_scalar[ind] += w * q; });
   }
 
   template <typename combined_ranges>
-  void operator()(p3m_data_struct &p3m, combined_ranges const &p_q_pos_range) {
+  void operator()(decltype(CoulombP3M::p3m) &p3m,
+                  combined_ranges const &p_q_pos_range) {
     for (auto zipped : p_q_pos_range) {
       auto const p_q = boost::get<0>(zipped);
       auto const &p_pos = boost::get<1>(zipped);
@@ -330,7 +319,7 @@ void CoulombP3M::charge_assign(ParticleRange const &particles) {
 
   /* prepare local FFT mesh */
   for (int i = 0; i < p3m.local_mesh.size; i++)
-    p3m.rs_mesh[i] = 0.0;
+    p3m.mesh.rs_scalar[i] = 0.0;
 
   auto p_q_range = ParticlePropertyRange::charge_range(particles);
   auto p_pos_range = ParticlePropertyRange::pos_range(particles);
@@ -352,7 +341,7 @@ void CoulombP3M::assign_charge(double q, Utils::Vector3d const &real_pos) {
 
 template <int cao> struct AssignForces {
   template <typename combined_ranges>
-  void operator()(p3m_data_struct &p3m, double force_prefac,
+  void operator()(decltype(CoulombP3M::p3m) &p3m, double force_prefac,
                   combined_ranges const &p_q_force_range) const {
 
     assert(cao == p3m.inter_weights.cao());
@@ -369,8 +358,9 @@ template <int cao> struct AssignForces {
 
         Utils::Vector3d force{};
         p3m_interpolate(p3m.local_mesh, w, [&force, &p3m](int ind, double w) {
-          force += w * Utils::Vector3d{p3m.E_mesh[0][ind], p3m.E_mesh[1][ind],
-                                       p3m.E_mesh[2][ind]};
+          force += w * Utils::Vector3d{p3m.mesh.rs_fields[0u][ind],
+                                       p3m.mesh.rs_fields[1u][ind],
+                                       p3m.mesh.rs_fields[2u][ind]};
         });
 
         p_force -= pref * force;
@@ -400,36 +390,35 @@ static auto calc_dipole_moment(boost::mpi::communicator const &comm,
  */
 Utils::Vector9d
 CoulombP3M::long_range_pressure(ParticleRange const &particles) {
-  using namespace detail::FFT_indexing;
-
   auto const &box_geo = *get_system().box_geo;
-
   Utils::Vector9d node_k_space_pressure_tensor{};
 
   if (p3m.sum_q2 > 0.) {
     charge_assign(particles);
-    p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
-    fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
+    p3m.fft->perform_fwd_fft();
 
-    auto constexpr m_start = Utils::Vector3i::broadcast(0);
+    auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+    auto const &mesh_stop = p3m.mesh.size;
+    auto const &offset = p3m.mesh.start;
     auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
-    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
-    auto const offset = std::span(p3m.fft.plan[3].start);
     auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto const [KX, KY, KZ] = p3m.fft->get_permutations();
     auto indices = Utils::Vector3i{};
     auto index = std::size_t(0u);
     auto diagonal = 0.;
 
-    for_each_3d(m_start, m_stop, indices, [&]() {
-      auto const kx = p3m.d_op[RX][indices[KX] + offset[KX]] * wavevector[RX];
-      auto const ky = p3m.d_op[RY][indices[KY] + offset[KY]] * wavevector[RY];
-      auto const kz = p3m.d_op[RZ][indices[KZ] + offset[KZ]] * wavevector[RZ];
+    for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+      auto const shift = indices + offset;
+      auto const kx = p3m.d_op[0u][shift[KX]] * wavevector[0u];
+      auto const ky = p3m.d_op[1u][shift[KY]] * wavevector[1u];
+      auto const kz = p3m.d_op[2u][shift[KZ]] * wavevector[2u];
       auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
 
       if (norm_sq != 0.) {
         auto const node_k_space_energy =
-            p3m.g_energy[index] * (Utils::sqr(p3m.rs_mesh[2u * index + 0u]) +
-                                   Utils::sqr(p3m.rs_mesh[2u * index + 1u]));
+            p3m.g_energy[index] *
+            (Utils::sqr(p3m.mesh.rs_scalar[2u * index + 0u]) +
+             Utils::sqr(p3m.mesh.rs_scalar[2u * index + 1u]));
         auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq);
         auto const pref = node_k_space_energy * vterm;
         node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */
@@ -470,10 +459,7 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
             system.coulomb.impl->solver)) {
       charge_assign(particles);
     }
-    /* Gather information for FFT grid inside the nodes domain (inner local
-     * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
-    fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
+    p3m.fft->perform_fwd_fft();
   }
 
   auto p_q_range = ParticlePropertyRange::charge_range(particles);
@@ -496,45 +482,41 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
   /* === k-space force calculation  === */
   if (force_flag) {
     /* i*k differentiation */
-    auto constexpr m_start = Utils::Vector3i::broadcast(0);
-    auto const m_stop = std::span(p3m.fft.plan[3].new_mesh);
-    auto const offset = std::span(p3m.fft.plan[3].start);
+    auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+    auto const &mesh_stop = p3m.mesh.size;
+    auto const &offset = p3m.mesh.start;
     auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
     auto indices = Utils::Vector3i{};
     auto index = std::size_t(0u);
 
     /* compute electric field */
     // Eq. (3.49) @cite deserno00b
-    for_each_3d(m_start, m_stop, indices, [&]() {
-      auto const rho_hat = std::complex<double>(p3m.rs_mesh[2u * index + 0u],
-                                                p3m.rs_mesh[2u * index + 1u]);
+    for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+      auto const rho_hat =
+          std::complex<double>(p3m.mesh.rs_scalar[2u * index + 0u],
+                               p3m.mesh.rs_scalar[2u * index + 1u]);
       auto const phi_hat = p3m.g_force[index] * rho_hat;
 
       for (int d = 0; d < 3; d++) {
         /* direction in r-space: */
-        int d_rs = (d + p3m.ks_pnum) % 3;
+        int d_rs = (d + p3m.mesh.ks_pnum) % 3;
         /* directions */
         auto const k =
             p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs];
 
         /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
-        p3m.E_mesh[d_rs][2u * index + 0u] = -k * phi_hat.imag();
-        p3m.E_mesh[d_rs][2u * index + 1u] = +k * phi_hat.real();
+        p3m.mesh.rs_fields[d_rs][2u * index + 0u] = -k * phi_hat.imag();
+        p3m.mesh.rs_fields[d_rs][2u * index + 1u] = +k * phi_hat.real();
       }
 
       ++index;
     });
 
-    /* Back FFT force component mesh */
-    auto const check_complex = !p3m.params.tuning and check_complex_residuals;
-    for (int d = 0; d < 3; d++) {
-      fft_perform_back(p3m.E_mesh[d].data(), check_complex, p3m.fft, comm_cart);
-    }
-
-    /* redistribute force component mesh */
-    std::array<double *, 3> E_fields = {
-        {p3m.E_mesh[0].data(), p3m.E_mesh[1].data(), p3m.E_mesh[2].data()}};
-    p3m.sm.spread_grid(E_fields, comm_cart, p3m.local_mesh.dim);
+    auto const check_residuals =
+        not p3m.params.tuning and check_complex_residuals;
+    p3m.fft->check_complex_residuals = check_residuals;
+    p3m.fft->perform_field_back_fft();
+    p3m.fft->check_complex_residuals = false;
 
     auto const force_prefac = prefactor / volume;
     Utils::integral_parameter<int, AssignForces, 1, 7>(
@@ -556,11 +538,13 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
   /* === k-space energy calculation  === */
   if (energy_flag or npt_flag) {
     auto node_energy = 0.;
-    for (int i = 0; i < p3m.fft.plan[3].new_size; i++) {
+    auto const mesh_length = Utils::product(p3m.mesh.size);
+    for (int i = 0; i < mesh_length; i++) {
       // Use the energy optimized influence function for energy!
       // Eq. (3.40) @cite deserno00b
-      node_energy += p3m.g_energy[i] * (Utils::sqr(p3m.rs_mesh[2 * i]) +
-                                        Utils::sqr(p3m.rs_mesh[2 * i + 1]));
+      node_energy +=
+          p3m.g_energy[i] * (Utils::sqr(p3m.mesh.rs_scalar[2 * i]) +
+                             Utils::sqr(p3m.mesh.rs_scalar[2 * i + 1]));
     }
     node_energy /= 2. * volume;
 
@@ -596,7 +580,7 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
 }
 
 class CoulombTuningAlgorithm : public TuningAlgorithm {
-  p3m_data_struct &p3m;
+  decltype(CoulombP3M::p3m) &p3m;
   double m_mesh_density_min = -1., m_mesh_density_max = -1.;
   // indicates if mesh should be tuned
   bool m_tune_mesh = false;
@@ -605,7 +589,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
   P3MParameters &get_params() override { return p3m.params; }
 
 public:
-  CoulombTuningAlgorithm(System::System &system, p3m_data_struct &input_p3m,
+  CoulombTuningAlgorithm(System::System &system, decltype(p3m) &input_p3m,
                          double prefactor, int timings)
       : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m} {}
 
diff --git a/src/core/electrostatics/p3m.hpp b/src/core/electrostatics/p3m.hpp
index 7f485dca30a..de7bbdee9fa 100644
--- a/src/core/electrostatics/p3m.hpp
+++ b/src/core/electrostatics/p3m.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -42,7 +42,6 @@
 
 #include "p3m/common.hpp"
 #include "p3m/data_struct.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/interpolation.hpp"
 #include "p3m/send_mesh.hpp"
 
@@ -54,37 +53,26 @@
 #include <array>
 #include <cmath>
 #include <numbers>
+#include <utility>
 
-struct p3m_data_struct : public p3m_data_struct_base {
-  explicit p3m_data_struct(P3MParameters &&parameters)
-      : p3m_data_struct_base{std::move(parameters)} {}
-
-  /** local mesh. */
-  P3MLocalMesh local_mesh;
-  /** real space mesh (local) for CA/FFT. */
-  fft_vector<double> rs_mesh;
-  /** mesh (local) for the electric field. */
-  std::array<fft_vector<double>, 3> E_mesh;
-
-  /** number of charged particles (only on head node). */
-  int sum_qpart = 0;
-  /** Sum of square of charges (only on head node). */
-  double sum_q2 = 0.;
-  /** square of sum of charges (only on head node). */
-  double square_sum_q = 0.;
-
-  p3m_interpolation_cache inter_weights;
+/** @brief P3M solver. */
+struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
+  struct p3m_data_struct_impl : public p3m_data_struct {
+    explicit p3m_data_struct_impl(P3MParameters &&parameters)
+        : p3m_data_struct{std::move(parameters)} {}
 
-  /** send/recv mesh sizes */
-  p3m_send_mesh sm;
+    /** number of charged particles (only on head node). */
+    int sum_qpart = 0;
+    /** Sum of square of charges (only on head node). */
+    double sum_q2 = 0.;
+    /** square of sum of charges (only on head node). */
+    double square_sum_q = 0.;
 
-  fft_data_struct fft;
-};
+    p3m_interpolation_cache inter_weights;
+  };
 
-/** @brief P3M solver. */
-struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
   /** P3M parameters. */
-  p3m_data_struct p3m;
+  p3m_data_struct_impl p3m;
 
   int tune_timings;
   bool tune_verbose;
diff --git a/src/core/fft/CMakeLists.txt b/src/core/fft/CMakeLists.txt
new file mode 100644
index 00000000000..359cc857ac8
--- /dev/null
+++ b/src/core/fft/CMakeLists.txt
@@ -0,0 +1,24 @@
+#
+# Copyright (C) 2018-2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+if(FFTW3_FOUND)
+  target_link_libraries(espresso_core PUBLIC FFTW3::FFTW3)
+endif()
+
+target_sources(espresso_core PRIVATE fft.cpp)
diff --git a/src/core/p3m/fft.cpp b/src/core/fft/fft.cpp
similarity index 75%
rename from src/core/p3m/fft.cpp
rename to src/core/fft/fft.cpp
index bd6ad27e38c..29997d3c897 100644
--- a/src/core/p3m/fft.cpp
+++ b/src/core/fft/fft.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -25,19 +25,19 @@
  *
  */
 
-#include "config/config.hpp"
-
-#if defined(P3M) || defined(DP3M)
-
-#include "p3m/fft.hpp"
+#include "fft.hpp"
+#include "vector.hpp"
 
 #include <utils/Vector.hpp>
 #include <utils/index.hpp>
 #include <utils/math/permute_ifield.hpp>
 
+#include <boost/mpi/communicator.hpp>
+
 #include <fftw3.h>
 #include <mpi.h>
 
+#include <algorithm>
 #include <cmath>
 #include <cstddef>
 #include <cstdio>
@@ -59,6 +59,8 @@ using Utils::permute_ifield;
 #define REQ_FFT_BACK 302
 /**@}*/
 
+namespace fft {
+
 /** This ugly function does the bookkeeping: which nodes have to
  *  communicate to each other, when you change the node grid.
  *  Changing the regular decomposition requires communication. This
@@ -183,7 +185,7 @@ namespace {
  * \param[in]  n_pos    Position of the node in @p n_grid.
  * \param[in]  n_grid   node grid.
  * \param[in]  mesh     global mesh dimensions.
- * \param[in]  mesh_off global mesh offset (see \ref p3m_data_struct).
+ * \param[in]  mesh_off global mesh offset.
  * \param[out] loc_mesh local mesh dimension.
  * \param[out] start    first point of local mesh in global mesh.
  * \return Number of mesh points in local mesh.
@@ -228,7 +230,7 @@ int calc_local_mesh(const int *n_pos, const int *n_grid, const int *mesh,
  *  \param[in]  pos2     Position of recv node in @p grid2.
  *  \param[in]  grid2    node grid 2.
  *  \param[in]  mesh     global mesh dimensions.
- *  \param[in]  mesh_off global mesh offset (see \ref p3m_data_struct).
+ *  \param[in]  mesh_off global mesh offset.
  *  \param[out] block    send block specification.
  *  \return Size of the send block.
  */
@@ -349,69 +351,69 @@ void pack_block_permute2(double const *const in, double *const out,
   }
 }
 
+} // namespace
+
 /** Communicate the grid data according to the given forward FFT plan.
+ *  \param comm   MPI communicator.
  *  \param plan   FFT communication plan.
  *  \param in     input mesh.
  *  \param out    output mesh.
- *  \param fft    FFT communication plan.
- *  \param comm   MPI communicator.
  */
-void forw_grid_comm(fft_forw_plan plan, const double *in, double *out,
-                    fft_data_struct &fft,
-                    const boost::mpi::communicator &comm) {
+void fft_data_struct::forw_grid_comm(boost::mpi::communicator const &comm,
+                                     fft_forw_plan const &plan,
+                                     double const *in, double *out) {
   for (std::size_t i = 0ul; i < plan.group.size(); i++) {
-    plan.pack_function(in, fft.send_buf.data(), &(plan.send_block[6ul * i]),
+    plan.pack_function(in, send_buf.data(), &(plan.send_block[6ul * i]),
                        &(plan.send_block[6ul * i + 3ul]), plan.old_mesh,
                        plan.element);
 
     if (plan.group[i] != comm.rank()) {
-      MPI_Sendrecv(fft.send_buf.data(), plan.send_size[i], MPI_DOUBLE,
-                   plan.group[i], REQ_FFT_FORW, fft.recv_buf.data(),
+      MPI_Sendrecv(send_buf.data(), plan.send_size[i], MPI_DOUBLE,
+                   plan.group[i], REQ_FFT_FORW, recv_buf.data(),
                    plan.recv_size[i], MPI_DOUBLE, plan.group[i], REQ_FFT_FORW,
                    comm, MPI_STATUS_IGNORE);
     } else { /* Self communication... */
-      std::swap(fft.send_buf, fft.recv_buf);
+      std::swap(send_buf, recv_buf);
     }
-    fft_unpack_block(fft.recv_buf.data(), out, &(plan.recv_block[6ul * i]),
+    fft_unpack_block(recv_buf.data(), out, &(plan.recv_block[6ul * i]),
                      &(plan.recv_block[6ul * i + 3ul]), plan.new_mesh,
                      plan.element);
   }
 }
 
 /** Communicate the grid data according to the given backward FFT plan.
+ *  \param comm   MPI communicator.
  *  \param plan_f Forward FFT plan.
  *  \param plan_b Backward FFT plan.
  *  \param in     input mesh.
  *  \param out    output mesh.
- *  \param fft    FFT communication plan.
- *  \param comm   MPI communicator.
  */
-void back_grid_comm(fft_forw_plan plan_f, fft_back_plan plan_b,
-                    const double *in, double *out, fft_data_struct &fft,
-                    const boost::mpi::communicator &comm) {
+void fft_data_struct::back_grid_comm(boost::mpi::communicator const &comm,
+                                     fft_forw_plan const &plan_f,
+                                     fft_back_plan const &plan_b,
+                                     double const *in, double *out) {
   /* Back means: Use the send/receive stuff from the forward plan but
      replace the receive blocks by the send blocks and vice
      versa. Attention then also new_mesh and old_mesh are exchanged */
 
   for (std::size_t i = 0ul; i < plan_f.group.size(); i++) {
-    plan_b.pack_function(in, fft.send_buf.data(), &(plan_f.recv_block[6ul * i]),
+    plan_b.pack_function(in, send_buf.data(), &(plan_f.recv_block[6ul * i]),
                          &(plan_f.recv_block[6ul * i + 3ul]), plan_f.new_mesh,
                          plan_f.element);
 
     if (plan_f.group[i] != comm.rank()) { /* send first, receive second */
-      MPI_Sendrecv(fft.send_buf.data(), plan_f.recv_size[i], MPI_DOUBLE,
-                   plan_f.group[i], REQ_FFT_BACK, fft.recv_buf.data(),
+      MPI_Sendrecv(send_buf.data(), plan_f.recv_size[i], MPI_DOUBLE,
+                   plan_f.group[i], REQ_FFT_BACK, recv_buf.data(),
                    plan_f.send_size[i], MPI_DOUBLE, plan_f.group[i],
                    REQ_FFT_BACK, comm, MPI_STATUS_IGNORE);
     } else { /* Self communication... */
-      std::swap(fft.send_buf, fft.recv_buf);
+      std::swap(send_buf, recv_buf);
     }
-    fft_unpack_block(fft.recv_buf.data(), out, &(plan_f.send_block[6ul * i]),
+    fft_unpack_block(recv_buf.data(), out, &(plan_f.send_block[6ul * i]),
                      &(plan_f.send_block[6ul * i + 3ul]), plan_f.old_mesh,
                      plan_f.element);
   }
 }
-} // namespace
 
 /** Calculate 'best' mapping between a 2D and 3D grid.
  *  Required for the communication from 3D regular domain
@@ -478,11 +480,12 @@ static void calc_2d_grid(int n, int grid[3]) {
   }
 }
 
-int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
-             Utils::Vector3i const &global_mesh_dim,
-             Utils::Vector3d const &global_mesh_off, int &ks_pnum,
-             fft_data_struct &fft, Utils::Vector3i const &grid,
-             boost::mpi::communicator const &comm) {
+int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm,
+                                    Utils::Vector3i const &ca_mesh_dim,
+                                    int const *ca_mesh_margin,
+                                    Utils::Vector3i const &global_mesh_dim,
+                                    Utils::Vector3d const &global_mesh_off,
+                                    int &ks_pnum, Utils::Vector3i const &grid) {
 
   int n_grid[4][3];         /* The four node grids. */
   int my_pos[4][3];         /* The position of comm.rank() in the node grids. */
@@ -493,8 +496,8 @@ int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
   int node_pos[3];
   MPI_Cart_coords(comm, rank, 3, node_pos);
 
-  fft.max_comm_size = 0;
-  fft.max_mesh_size = 0;
+  max_comm_size = 0;
+  max_mesh_size = 0;
   for (int i = 0; i < 4; i++) {
     n_id[i].resize(1 * comm.size());
     n_pos[i].resize(3 * comm.size());
@@ -517,21 +520,21 @@ int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
   /* FFT node grids (n_grid[1 - 3]) */
   calc_2d_grid(comm.size(), n_grid[1]);
   /* resort n_grid[1] dimensions if necessary */
-  fft.plan[1].row_dir = map_3don2d_grid(n_grid[0], n_grid[1]);
-  fft.plan[0].n_permute = 0;
+  forw[1].row_dir = map_3don2d_grid(n_grid[0], n_grid[1]);
+  forw[0].n_permute = 0;
   for (int i = 1; i < 4; i++)
-    fft.plan[i].n_permute = (fft.plan[1].row_dir + i) % 3;
+    forw[i].n_permute = (forw[1].row_dir + i) % 3;
   for (int i = 0; i < 3; i++) {
     n_grid[2][i] = n_grid[1][(i + 1) % 3];
     n_grid[3][i] = n_grid[1][(i + 2) % 3];
   }
-  fft.plan[2].row_dir = (fft.plan[1].row_dir - 1) % 3;
-  fft.plan[3].row_dir = (fft.plan[1].row_dir - 2) % 3;
+  forw[2].row_dir = (forw[1].row_dir - 1) % 3;
+  forw[3].row_dir = (forw[1].row_dir - 2) % 3;
 
   /* === communication groups === */
   /* copy local mesh off real space charge assignment grid */
   for (int i = 0; i < 3; i++)
-    fft.plan[0].new_mesh[i] = ca_mesh_dim[i];
+    forw[0].new_mesh[i] = ca_mesh_dim[i];
 
   for (int i = 1; i < 4; i++) {
     auto group = find_comm_groups(
@@ -540,8 +543,8 @@ int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
         std::span(n_id[i]), std::span(n_pos[i]), my_pos[i], rank);
     if (not group) {
       /* try permutation */
-      std::swap(n_grid[i][(fft.plan[i].row_dir + 1) % 3],
-                n_grid[i][(fft.plan[i].row_dir + 2) % 3]);
+      std::swap(n_grid[i][(forw[i].row_dir + 1) % 3],
+                n_grid[i][(forw[i].row_dir + 2) % 3]);
 
       group = find_comm_groups(
           {n_grid[i - 1][0], n_grid[i - 1][1], n_grid[i - 1][2]},
@@ -553,188 +556,185 @@ int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
       }
     }
 
-    fft.plan[i].group = group.value();
+    forw[i].group = group.value();
 
-    fft.plan[i].send_block.resize(6 * fft.plan[i].group.size());
-    fft.plan[i].send_size.resize(fft.plan[i].group.size());
-    fft.plan[i].recv_block.resize(6 * fft.plan[i].group.size());
-    fft.plan[i].recv_size.resize(fft.plan[i].group.size());
+    forw[i].send_block.resize(6 * forw[i].group.size());
+    forw[i].send_size.resize(forw[i].group.size());
+    forw[i].recv_block.resize(6 * forw[i].group.size());
+    forw[i].recv_size.resize(forw[i].group.size());
 
-    fft.plan[i].new_size = calc_local_mesh(
+    forw[i].new_size = calc_local_mesh(
         my_pos[i], n_grid[i], global_mesh_dim.data(), global_mesh_off.data(),
-        fft.plan[i].new_mesh, fft.plan[i].start);
-    permute_ifield(fft.plan[i].new_mesh, 3, -(fft.plan[i].n_permute));
-    permute_ifield(fft.plan[i].start, 3, -(fft.plan[i].n_permute));
-    fft.plan[i].n_ffts = fft.plan[i].new_mesh[0] * fft.plan[i].new_mesh[1];
+        forw[i].new_mesh, forw[i].start);
+    permute_ifield(forw[i].new_mesh, 3, -(forw[i].n_permute));
+    permute_ifield(forw[i].start, 3, -(forw[i].n_permute));
+    forw[i].n_ffts = forw[i].new_mesh[0] * forw[i].new_mesh[1];
 
     /* === send/recv block specifications === */
-    for (std::size_t j = 0ul; j < fft.plan[i].group.size(); j++) {
+    for (std::size_t j = 0ul; j < forw[i].group.size(); j++) {
       /* send block: comm.rank() to comm-group-node i (identity: node) */
-      int node = fft.plan[i].group[j];
-      fft.plan[i].send_size[j] = calc_send_block(
+      int node = forw[i].group[j];
+      forw[i].send_size[j] = calc_send_block(
           my_pos[i - 1], n_grid[i - 1], &(n_pos[i][3 * node]), n_grid[i],
           global_mesh_dim.data(), global_mesh_off.data(),
-          &(fft.plan[i].send_block[6ul * j]));
-      permute_ifield(&(fft.plan[i].send_block[6ul * j]), 3,
-                     -(fft.plan[i - 1].n_permute));
-      permute_ifield(&(fft.plan[i].send_block[6ul * j + 3ul]), 3,
-                     -(fft.plan[i - 1].n_permute));
-      if (fft.plan[i].send_size[j] > fft.max_comm_size)
-        fft.max_comm_size = fft.plan[i].send_size[j];
+          &(forw[i].send_block[6ul * j]));
+      permute_ifield(&(forw[i].send_block[6ul * j]), 3,
+                     -(forw[i - 1].n_permute));
+      permute_ifield(&(forw[i].send_block[6ul * j + 3ul]), 3,
+                     -(forw[i - 1].n_permute));
+      if (forw[i].send_size[j] > max_comm_size)
+        max_comm_size = forw[i].send_size[j];
       /* First plan send blocks have to be adjusted, since the CA grid
          may have an additional margin outside the actual domain of the
          node */
       if (i == 1) {
         for (std::size_t k = 0ul; k < 3ul; k++)
-          fft.plan[1].send_block[6ul * j + k] += ca_mesh_margin[2ul * k];
+          forw[1].send_block[6ul * j + k] += ca_mesh_margin[2ul * k];
       }
       /* recv block: comm.rank() from comm-group-node i (identity: node) */
-      fft.plan[i].recv_size[j] = calc_send_block(
+      forw[i].recv_size[j] = calc_send_block(
           my_pos[i], n_grid[i], &(n_pos[i - 1][3 * node]), n_grid[i - 1],
           global_mesh_dim.data(), global_mesh_off.data(),
-          &(fft.plan[i].recv_block[6ul * j]));
-      permute_ifield(&(fft.plan[i].recv_block[6ul * j]), 3,
-                     -(fft.plan[i].n_permute));
-      permute_ifield(&(fft.plan[i].recv_block[6ul * j + 3ul]), 3,
-                     -(fft.plan[i].n_permute));
-      if (fft.plan[i].recv_size[j] > fft.max_comm_size)
-        fft.max_comm_size = fft.plan[i].recv_size[j];
+          &(forw[i].recv_block[6ul * j]));
+      permute_ifield(&(forw[i].recv_block[6ul * j]), 3, -(forw[i].n_permute));
+      permute_ifield(&(forw[i].recv_block[6ul * j + 3ul]), 3,
+                     -(forw[i].n_permute));
+      if (forw[i].recv_size[j] > max_comm_size)
+        max_comm_size = forw[i].recv_size[j];
     }
 
     for (std::size_t j = 0ul; j < 3ul; j++)
-      fft.plan[i].old_mesh[j] = fft.plan[i - 1].new_mesh[j];
+      forw[i].old_mesh[j] = forw[i - 1].new_mesh[j];
     if (i == 1) {
-      fft.plan[i].element = 1;
+      forw[i].element = 1;
     } else {
-      fft.plan[i].element = 2;
-      for (std::size_t j = 0ul; j < fft.plan[i].group.size(); j++) {
-        fft.plan[i].send_size[j] *= 2;
-        fft.plan[i].recv_size[j] *= 2;
+      forw[i].element = 2;
+      for (std::size_t j = 0ul; j < forw[i].group.size(); j++) {
+        forw[i].send_size[j] *= 2;
+        forw[i].recv_size[j] *= 2;
       }
     }
   }
 
   /* Factor 2 for complex fields */
-  fft.max_comm_size *= 2;
-  fft.max_mesh_size = Utils::product(ca_mesh_dim);
+  max_comm_size *= 2;
+  max_mesh_size = Utils::product(ca_mesh_dim);
   for (int i = 1; i < 4; i++)
-    if (2 * fft.plan[i].new_size > fft.max_mesh_size)
-      fft.max_mesh_size = 2 * fft.plan[i].new_size;
+    if (2 * forw[i].new_size > max_mesh_size)
+      max_mesh_size = 2 * forw[i].new_size;
 
   /* === pack function === */
   for (int i = 1; i < 4; i++) {
-    fft.plan[i].pack_function = pack_block_permute2;
+    forw[i].pack_function = pack_block_permute2;
   }
   ks_pnum = 6;
-  if (fft.plan[1].row_dir == 2) {
-    fft.plan[1].pack_function = fft_pack_block;
+  if (forw[1].row_dir == 2) {
+    forw[1].pack_function = fft_pack_block;
     ks_pnum = 4;
-  } else if (fft.plan[1].row_dir == 1) {
-    fft.plan[1].pack_function = pack_block_permute1;
+  } else if (forw[1].row_dir == 1) {
+    forw[1].pack_function = pack_block_permute1;
     ks_pnum = 5;
   }
 
-  fft.send_buf.resize(fft.max_comm_size);
-  fft.recv_buf.resize(fft.max_comm_size);
-  fft.data_buf.resize(fft.max_mesh_size);
-  auto *c_data = (fftw_complex *)(fft.data_buf.data());
+  send_buf.resize(max_comm_size);
+  recv_buf.resize(max_comm_size);
+  data_buf.resize(max_mesh_size);
+  auto *c_data = (fftw_complex *)(data_buf.data());
 
   /* === FFT Routines (Using FFTW / RFFTW package)=== */
   for (int i = 1; i < 4; i++) {
-    fft.plan[i].dir = FFTW_FORWARD;
-    /* FFT plan creation.*/
-
-    if (fft.init_tag)
-      fftw_destroy_plan(fft.plan[i].our_fftw_plan);
-    fft.plan[i].our_fftw_plan = fftw_plan_many_dft(
-        1, &fft.plan[i].new_mesh[2], fft.plan[i].n_ffts, c_data, nullptr, 1,
-        fft.plan[i].new_mesh[2], c_data, nullptr, 1, fft.plan[i].new_mesh[2],
-        fft.plan[i].dir, FFTW_PATIENT);
+    if (init_tag) {
+      forw[i].destroy_plan();
+    }
+    forw[i].dir = FFTW_FORWARD;
+    forw[i].plan_handle =
+        fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data,
+                           nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1,
+                           forw[i].new_mesh[2], forw[i].dir, FFTW_PATIENT);
+    assert(forw[i].plan_handle);
   }
 
   /* === The BACK Direction === */
   /* this is needed because slightly different functions are used */
   for (int i = 1; i < 4; i++) {
-    fft.back[i].dir = FFTW_BACKWARD;
-
-    if (fft.init_tag)
-      fftw_destroy_plan(fft.back[i].our_fftw_plan);
-    fft.back[i].our_fftw_plan = fftw_plan_many_dft(
-        1, &fft.plan[i].new_mesh[2], fft.plan[i].n_ffts, c_data, nullptr, 1,
-        fft.plan[i].new_mesh[2], c_data, nullptr, 1, fft.plan[i].new_mesh[2],
-        fft.back[i].dir, FFTW_PATIENT);
-
-    fft.back[i].pack_function = pack_block_permute1;
+    if (init_tag) {
+      back[i].destroy_plan();
+    }
+    back[i].dir = FFTW_BACKWARD;
+    back[i].plan_handle =
+        fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data,
+                           nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1,
+                           forw[i].new_mesh[2], back[i].dir, FFTW_PATIENT);
+    back[i].pack_function = pack_block_permute1;
+    assert(back[i].plan_handle);
   }
-  if (fft.plan[1].row_dir == 2) {
-    fft.back[1].pack_function = fft_pack_block;
-  } else if (fft.plan[1].row_dir == 1) {
-    fft.back[1].pack_function = pack_block_permute2;
+  if (forw[1].row_dir == 2) {
+    back[1].pack_function = fft_pack_block;
+  } else if (forw[1].row_dir == 1) {
+    back[1].pack_function = pack_block_permute2;
   }
 
-  fft.init_tag = true;
+  init_tag = true;
 
-  return fft.max_mesh_size;
+  return max_mesh_size;
 }
 
-void fft_perform_forw(double *data, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm) {
+void fft_data_struct::forward_fft(boost::mpi::communicator const &comm,
+                                  double *data) {
   /* ===== first direction  ===== */
 
   auto *c_data = (fftw_complex *)data;
-  auto *c_data_buf = (fftw_complex *)fft.data_buf.data();
+  auto *c_data_buf = (fftw_complex *)data_buf.data();
 
   /* communication to current dir row format (in is data) */
-  forw_grid_comm(fft.plan[1], data, fft.data_buf.data(), fft, comm);
+  forw_grid_comm(comm, forw[1], data, data_buf.data());
 
-  /* complexify the real data array (in is fft.data_buf) */
-  for (int i = 0; i < fft.plan[1].new_size; i++) {
-    data[2 * i + 0] = fft.data_buf[i]; /* real value */
-    data[2 * i + 1] = 0;               /* complex value */
+  /* complexify the real data array (in is data_buf) */
+  for (int i = 0; i < forw[1].new_size; i++) {
+    data[2 * i + 0] = data_buf[i]; /* real value */
+    data[2 * i + 1] = 0;           /* complex value */
   }
   /* perform FFT (in/out is data)*/
-  fftw_execute_dft(fft.plan[1].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(forw[1].plan_handle, c_data, c_data);
   /* ===== second direction ===== */
   /* communication to current dir row format (in is data) */
-  forw_grid_comm(fft.plan[2], data, fft.data_buf.data(), fft, comm);
-  /* perform FFT (in/out is fft.data_buf) */
-  fftw_execute_dft(fft.plan[2].our_fftw_plan, c_data_buf, c_data_buf);
+  forw_grid_comm(comm, forw[2], data, data_buf.data());
+  /* perform FFT (in/out is data_buf) */
+  fftw_execute_dft(forw[2].plan_handle, c_data_buf, c_data_buf);
   /* ===== third direction  ===== */
-  /* communication to current dir row format (in is fft.data_buf) */
-  forw_grid_comm(fft.plan[3], fft.data_buf.data(), data, fft, comm);
+  /* communication to current dir row format (in is data_buf) */
+  forw_grid_comm(comm, forw[3], data_buf.data(), data);
   /* perform FFT (in/out is data)*/
-  fftw_execute_dft(fft.plan[3].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(forw[3].plan_handle, c_data, c_data);
 
   /* REMARK: Result has to be in data. */
 }
 
-void fft_perform_back(double *data, bool check_complex, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm) {
+void fft_data_struct::backward_fft(boost::mpi::communicator const &comm,
+                                   double *data, bool check_complex) {
 
   auto *c_data = (fftw_complex *)data;
-  auto *c_data_buf = (fftw_complex *)fft.data_buf.data();
+  auto *c_data_buf = (fftw_complex *)data_buf.data();
 
   /* ===== third direction  ===== */
 
   /* perform FFT (in is data) */
-  fftw_execute_dft(fft.back[3].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(back[3].plan_handle, c_data, c_data);
   /* communicate (in is data)*/
-  back_grid_comm(fft.plan[3], fft.back[3], data, fft.data_buf.data(), fft,
-                 comm);
+  back_grid_comm(comm, forw[3], back[3], data, data_buf.data());
 
   /* ===== second direction ===== */
-  /* perform FFT (in is fft.data_buf) */
-  fftw_execute_dft(fft.back[2].our_fftw_plan, c_data_buf, c_data_buf);
-  /* communicate (in is fft.data_buf) */
-  back_grid_comm(fft.plan[2], fft.back[2], fft.data_buf.data(), data, fft,
-                 comm);
+  /* perform FFT (in is data_buf) */
+  fftw_execute_dft(back[2].plan_handle, c_data_buf, c_data_buf);
+  /* communicate (in is data_buf) */
+  back_grid_comm(comm, forw[2], back[2], data_buf.data(), data);
 
   /* ===== first direction  ===== */
   /* perform FFT (in is data) */
-  fftw_execute_dft(fft.back[1].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(back[1].plan_handle, c_data, c_data);
   /* throw away the (hopefully) empty complex component (in is data) */
-  for (int i = 0; i < fft.plan[1].new_size; i++) {
-    fft.data_buf[i] = data[2 * i]; /* real value */
+  for (int i = 0; i < forw[1].new_size; i++) {
+    data_buf[i] = data[2 * i]; /* real value */
     // Vincent:
     if (check_complex and std::abs(data[2 * i + 1]) > 1e-5) {
       printf("Complex value is not zero (i=%d,data=%g)!!!\n", i,
@@ -743,9 +743,8 @@ void fft_perform_back(double *data, bool check_complex, fft_data_struct &fft,
         throw std::runtime_error("Complex value is not zero");
     }
   }
-  /* communicate (in is fft.data_buf) */
-  back_grid_comm(fft.plan[1], fft.back[1], fft.data_buf.data(), data, fft,
-                 comm);
+  /* communicate (in is data_buf) */
+  back_grid_comm(comm, forw[1], back[1], data_buf.data(), data);
 
   /* REMARK: Result has to be in data. */
 }
@@ -797,4 +796,16 @@ void fft_unpack_block(double const *const in, double *const out,
     li_out += s_out_offset;
   }
 }
-#endif
+
+void fft_plan::destroy_plan() {
+  if (plan_handle) {
+    fftw_destroy_plan(plan_handle);
+    plan_handle = nullptr;
+  }
+}
+
+namespace detail {
+void fft_free(void *const p) { ::fftw_free(p); }
+void *fft_malloc(std::size_t length) { return ::fftw_malloc(length); }
+} // namespace detail
+} // namespace fft
diff --git a/src/core/p3m/fft.hpp b/src/core/fft/fft.hpp
similarity index 54%
rename from src/core/p3m/fft.hpp
rename to src/core/fft/fft.hpp
index 03dd4a0de6a..8747448a371 100644
--- a/src/core/p3m/fft.hpp
+++ b/src/core/fft/fft.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -26,71 +26,49 @@
  *  Routines, row decomposition, data structures and communication for the
  *  3D-FFT.
  *
- *  The 3D-FFT is split into 3 ond dimensional FFTs. The data is
+ *  The 3D-FFT is split into three 1D-FFTs. The data is
  *  distributed in such a way, that for the actual direction of the
  *  FFT each node has a certain number of rows for which it performs a
  *  1D-FFT. After performing the FFT on that direction the data is
  *  redistributed.
  *
- *  For simplicity at the moment I have implemented a full complex to
- *  complex FFT (even though a real to complex FFT would be sufficient).
- *
- *  For more information about FFT usage, see \ref fft.cpp "fft.cpp".
+ *  For simplicity, a full complex-to-complex FFT is implemented,
+ *  even though a real-to-complex FFT would be sufficient.
  */
 
-#include "config/config.hpp"
-
-#if defined(P3M) || defined(DP3M)
+#include "vector.hpp"
 
 #include <utils/Vector.hpp>
 
-#include <boost/mpi/communicator.hpp>
-
-#include <fftw3.h>
-
+#include <array>
 #include <cstddef>
-#include <limits>
-#include <new>
+#include <memory>
+#include <optional>
+#include <span>
+#include <utility>
 #include <vector>
 
-/** Aligned allocator for fft data. */
-template <class T> struct fft_allocator {
-  typedef T value_type;
-  fft_allocator() noexcept = default; // default ctor not required
-  template <class U> explicit fft_allocator(const fft_allocator<U> &) {}
-  template <class U> bool operator==(const fft_allocator<U> &) const {
-    return true;
-  }
-  template <class U> bool operator!=(const fft_allocator<U> &) const {
-    return false;
-  }
-
-  T *allocate(const std::size_t n) const {
-    if (n == 0) {
-      return nullptr;
-    }
-    if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) {
-      throw std::bad_array_new_length();
-    }
-    void *const pv = fftw_malloc(n * sizeof(T));
-    if (!pv) {
-      throw std::bad_alloc();
-    }
-    return static_cast<T *>(pv);
-  }
-  void deallocate(T *const p, std::size_t) const noexcept { fftw_free(p); }
-};
+struct fftw_plan_s;
+namespace boost::mpi {
+class environment;
+class communicator;
+} // namespace boost::mpi
 
-template <class T> using fft_vector = std::vector<T, fft_allocator<T>>;
+namespace fft {
 
 struct fft_plan {
+  using fftw_plan = fftw_plan_s *;
+
+  ~fft_plan() { destroy_plan(); }
+
   /** plan direction: forward or backward FFT (enum value from FFTW). */
   int dir;
   /** plan for the FFT. */
-  fftw_plan our_fftw_plan;
+  fftw_plan plan_handle = nullptr;
   /** packing function for send blocks. */
   void (*pack_function)(double const *const, double *const, int const *,
                         int const *, int const *, int);
+  void destroy_plan();
 };
 
 /** @brief Plan for a forward 1D FFT of a flattened 3D array. */
@@ -129,18 +107,27 @@ struct fft_forw_plan : public fft_plan {
 /** @brief Plan for a backward 1D FFT of a flattened 3D array. */
 struct fft_back_plan : public fft_plan {};
 
-/** Information about the three one dimensional FFTs and how the nodes
- *  have to communicate inbetween.
+/**
+ * @brief Information about the three one dimensional FFTs and how the nodes
+ * have to communicate inbetween.
  *
- *  @note FFT numbering starts with 1 for technical reasons (because we have 4
- *        node grids, the index 0 is used for the real space charge assignment
- *        grid).
+ * @note FFT numbering starts with 1 for technical reasons (because we have 4
+ * node grids, the index 0 is used for the real space charge assignment grid).
  */
-struct fft_data_struct { // NOLINT(bugprone-reserved-identifier)
+struct fft_data_struct {
+private:
+  /**
+   * @brief Handle to the MPI environment.
+   * Has to be the first member in the class definition, so that FFT plans
+   * are destroyed before the MPI environment expires (non-static class
+   * members are destroyed in the reverse order of their initialization).
+   */
+  std::shared_ptr<boost::mpi::environment> m_mpi_env;
+
   /** Information for forward FFTs. */
-  fft_forw_plan plan[4];
+  std::array<fft_forw_plan, 4u> forw;
   /** Information for backward FFTs. */
-  fft_back_plan back[4];
+  std::array<fft_back_plan, 4u> back;
 
   /** Whether FFT is initialized or not. */
   bool init_tag = false;
@@ -156,47 +143,64 @@ struct fft_data_struct { // NOLINT(bugprone-reserved-identifier)
   /** receive buffer. */
   std::vector<double> recv_buf;
   /** Buffer for receive data. */
-  fft_vector<double> data_buf;
+  fft::vector<double> data_buf;
+
+public:
+  explicit fft_data_struct(decltype(m_mpi_env) mpi_env)
+      : m_mpi_env{std::move(mpi_env)} {}
+
+  // disable copy construction: unsafe because we store raw pointers
+  // to FFT plans (avoids double-free and use-after-free)
+  fft_data_struct &operator=(fft_data_struct const &) = delete;
+  fft_data_struct(fft_data_struct const &) = delete;
+
+  /** Initialize everything connected to the 3D-FFT.
+   *
+   *  \param[in]  comm            MPI communicator.
+   *  \param[in]  ca_mesh_dim     Local CA mesh dimensions.
+   *  \param[in]  ca_mesh_margin  Local CA mesh margins.
+   *  \param[in]  global_mesh_dim Global CA mesh dimensions.
+   *  \param[in]  global_mesh_off Global CA mesh offset.
+   *  \param[out] ks_pnum         Number of permutations in k-space.
+   *  \param[in]  grid            Number of nodes in each spatial dimension.
+   *  \return Maximal size of local fft mesh (needed for allocation of ca_mesh).
+   */
+  int initialize_fft(boost::mpi::communicator const &comm,
+                     Utils::Vector3i const &ca_mesh_dim,
+                     int const *ca_mesh_margin,
+                     Utils::Vector3i const &global_mesh_dim,
+                     Utils::Vector3d const &global_mesh_off, int &ks_pnum,
+                     Utils::Vector3i const &grid);
+
+  /** Perform an in-place forward 3D FFT.
+   *  \warning The content of \a data is overwritten.
+   *  \param[in,out] data  Mesh.
+   *  \param[in]     comm  MPI communicator
+   */
+  void forward_fft(boost::mpi::communicator const &comm, double *data);
+
+  /** Perform an in-place backward 3D FFT.
+   *  \warning The content of \a data is overwritten.
+   *  \param[in,out] data           Mesh.
+   *  \param[in]     check_complex  Throw an error if the complex component is
+   *                                non-zero.
+   *  \param[in]     comm           MPI communicator.
+   */
+  void backward_fft(boost::mpi::communicator const &comm, double *data,
+                    bool check_complex);
+
+  auto get_mesh_size() const { return forw[3u].new_mesh; }
+
+  auto get_mesh_start() const { return forw[3u].start; }
+
+private:
+  void forw_grid_comm(boost::mpi::communicator const &comm,
+                      fft_forw_plan const &plan, double const *in, double *out);
+  void back_grid_comm(boost::mpi::communicator const &comm,
+                      fft_forw_plan const &plan_f, fft_back_plan const &plan_b,
+                      double const *in, double *out);
 };
 
-/** Initialize everything connected to the 3D-FFT.
- *
- *  \param[in]  ca_mesh_dim     Local CA mesh dimensions.
- *  \param[in]  ca_mesh_margin  Local CA mesh margins.
- *  \param[in]  global_mesh_dim Global CA mesh dimensions.
- *  \param[in]  global_mesh_off Global CA mesh offset.
- *  \param[out] ks_pnum         Number of permutations in k-space.
- *  \param[out] fft             FFT plan.
- *  \param[in]  grid            Number of nodes in each spatial dimension.
- *  \param[in]  comm            MPI communicator.
- *  \return Maximal size of local fft mesh (needed for allocation of ca_mesh).
- */
-int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
-             Utils::Vector3i const &global_mesh_dim,
-             Utils::Vector3d const &global_mesh_off, int &ks_pnum,
-             fft_data_struct &fft, Utils::Vector3i const &grid,
-             boost::mpi::communicator const &comm);
-
-/** Perform an in-place forward 3D FFT.
- *  \warning The content of \a data is overwritten.
- *  \param[in,out] data  Mesh.
- *  \param[in,out] fft   FFT plan.
- *  \param[in]     comm  MPI communicator
- */
-void fft_perform_forw(double *data, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm);
-
-/** Perform an in-place backward 3D FFT.
- *  \warning The content of \a data is overwritten.
- *  \param[in,out] data           Mesh.
- *  \param[in]     check_complex  Throw an error if the complex component is
- *                                non-zero.
- *  \param[in,out] fft            FFT plan.
- *  \param[in]     comm           MPI communicator.
- */
-void fft_perform_back(double *data, bool check_complex, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm);
-
 /** Pack a block (<tt>size[3]</tt> starting at <tt>start[3]</tt>) of an input
  *  3d-grid with dimension <tt>dim[3]</tt> into an output 3d-block with
  *  dimension <tt>size[3]</tt>.
@@ -234,4 +238,10 @@ void fft_unpack_block(double const *in, double *out, int const start[3],
 
 int map_3don2d_grid(int const g3d[3], int g2d[3]);
 
-#endif // defined(P3M) || defined(DP3M)
+std::optional<std::vector<int>> find_comm_groups(Utils::Vector3i const &,
+                                                 Utils::Vector3i const &,
+                                                 std::span<int const>,
+                                                 std::span<int>, std::span<int>,
+                                                 std::span<int>, int);
+
+} // namespace fft
diff --git a/src/core/fft/vector.hpp b/src/core/fft/vector.hpp
new file mode 100644
index 00000000000..e74df7a3de0
--- /dev/null
+++ b/src/core/fft/vector.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace fft {
+namespace detail {
+void fft_free(void *p);
+void *fft_malloc(std::size_t length);
+} // namespace detail
+
+/** @brief Aligned allocator for FFT data. */
+template <class T> struct allocator {
+  typedef T value_type;
+  allocator() noexcept = default; // default ctor not required
+  template <class U> explicit allocator(const allocator<U> &) {}
+  template <class U> bool operator==(const allocator<U> &) const {
+    return true;
+  }
+  template <class U> bool operator!=(const allocator<U> &) const {
+    return false;
+  }
+
+  T *allocate(const std::size_t n) const {
+    if (n == 0) {
+      return nullptr;
+    }
+    if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) {
+      throw std::bad_array_new_length();
+    }
+    void *const pv = detail::fft_malloc(n * sizeof(T));
+    if (!pv) {
+      throw std::bad_alloc();
+    }
+    return static_cast<T *>(pv);
+  }
+
+  void deallocate(T *const p, std::size_t) const noexcept {
+    detail::fft_free(static_cast<void *>(p));
+  }
+};
+
+template <class T> using vector = std::vector<T, allocator<T>>;
+
+} // namespace fft
diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
index 4214900fb52..b3ab8871fab 100644
--- a/src/core/magnetostatics/dp3m.cpp
+++ b/src/core/magnetostatics/dp3m.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -31,10 +31,10 @@
 
 #include "magnetostatics/dp3m.hpp"
 
+#include "fft/fft.hpp"
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
 #include "p3m/common.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/influence_function_dipolar.hpp"
 #include "p3m/interpolation.hpp"
 #include "p3m/math.hpp"
@@ -106,12 +106,9 @@ double dp3m_rtbisection(double box_size, double r_cut_iL, int n_c_part,
                         double tuned_accuracy);
 
 double DipolarP3M::calc_average_self_energy_k_space() const {
-  auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
-
   auto const &box_geo = *get_system().box_geo;
   auto const node_phi = grid_influence_function_self_energy(
-      dp3m.params, start, start + size, dp3m.g_energy);
+      dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
 
   double phi = 0.;
   boost::mpi::reduce(comm_cart, node_phi, phi, std::plus<>(), 0);
@@ -131,21 +128,10 @@ void DipolarP3M::init() {
 
   dp3m.params.cao3 = Utils::int_pow<3>(dp3m.params.cao);
   dp3m.params.recalc_a_ai_cao_cut(box_geo.length());
-  dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
-
-  dp3m.sm.resize(comm_cart, dp3m.local_mesh);
-
-  int ca_mesh_size =
-      fft_init(dp3m.local_mesh.dim, dp3m.local_mesh.margin, dp3m.params.mesh,
-               dp3m.params.mesh_off, dp3m.ks_pnum, dp3m.fft,
-               ::communicator.node_grid, comm_cart);
-  dp3m.rs_mesh.resize(ca_mesh_size);
-  dp3m.ks_mesh.resize(ca_mesh_size);
-
-  for (auto &val : dp3m.rs_mesh_dip) {
-    val.resize(ca_mesh_size);
-  }
 
+  assert(dp3m.fft);
+  dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
+  dp3m.fft->init_fft();
   dp3m.calc_differential_operator();
 
   /* fix box length dependent constants */
@@ -174,15 +160,16 @@ DipolarP3M::DipolarP3M(P3MParameters &&parameters, double prefactor,
 
 namespace {
 template <int cao> struct AssignDipole {
-  void operator()(dp3m_data_struct &dp3m, Utils::Vector3d const &real_pos,
+  void operator()(decltype(DipolarP3M::dp3m) &dp3m,
+                  Utils::Vector3d const &real_pos,
                   Utils::Vector3d const &dip) const {
     auto const weights = p3m_calculate_interpolation_weights<cao>(
         real_pos, dp3m.params.ai, dp3m.local_mesh);
     p3m_interpolate<cao>(dp3m.local_mesh, weights,
                          [&dip, &dp3m](int ind, double w) {
-                           dp3m.rs_mesh_dip[0][ind] += w * dip[0];
-                           dp3m.rs_mesh_dip[1][ind] += w * dip[1];
-                           dp3m.rs_mesh_dip[2][ind] += w * dip[2];
+                           dp3m.mesh.rs_fields[0][ind] += w * dip[0];
+                           dp3m.mesh.rs_fields[1][ind] += w * dip[1];
+                           dp3m.mesh.rs_fields[2][ind] += w * dip[2];
                          });
 
     dp3m.inter_weights.store<cao>(weights);
@@ -194,9 +181,9 @@ void DipolarP3M::dipole_assign(ParticleRange const &particles) {
   dp3m.inter_weights.reset(dp3m.params.cao);
 
   /* prepare local FFT mesh */
-  for (auto &i : dp3m.rs_mesh_dip)
+  for (auto &rs_mesh_field : dp3m.mesh.rs_fields)
     for (int j = 0; j < dp3m.local_mesh.size; j++)
-      i[j] = 0.;
+      rs_mesh_field[j] = 0.;
 
   for (auto const &p : particles) {
     if (p.dipm() != 0.) {
@@ -208,7 +195,7 @@ void DipolarP3M::dipole_assign(ParticleRange const &particles) {
 
 namespace {
 template <int cao> struct AssignTorques {
-  void operator()(dp3m_data_struct const &dp3m, double prefac, int d_rs,
+  void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs,
                   ParticleRange const &particles) const {
 
     /* magnetic particle index */
@@ -221,7 +208,7 @@ template <int cao> struct AssignTorques {
         Utils::Vector3d E{};
         p3m_interpolate(dp3m.local_mesh, w,
                         [&E, &dp3m, d_rs](int ind, double w) {
-                          E[d_rs] += w * dp3m.rs_mesh[ind];
+                          E[d_rs] += w * dp3m.mesh.rs_scalar[ind];
                         });
 
         p.torque() -= vector_product(p.calc_dip(), prefac * E);
@@ -232,7 +219,7 @@ template <int cao> struct AssignTorques {
 };
 
 template <int cao> struct AssignForces {
-  void operator()(dp3m_data_struct const &dp3m, double prefac, int d_rs,
+  void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs,
                   ParticleRange const &particles) const {
 
     /* magnetic particle index */
@@ -244,9 +231,9 @@ template <int cao> struct AssignForces {
 
         Utils::Vector3d E{};
         p3m_interpolate(dp3m.local_mesh, w, [&E, &dp3m](int ind, double w) {
-          E[0] += w * dp3m.rs_mesh_dip[0][ind];
-          E[1] += w * dp3m.rs_mesh_dip[1][ind];
-          E[2] += w * dp3m.rs_mesh_dip[2][ind];
+          E[0] += w * dp3m.mesh.rs_fields[0][ind];
+          E[1] += w * dp3m.mesh.rs_fields[1][ind];
+          E[2] += w * dp3m.mesh.rs_fields[2][ind];
         });
 
         p.force()[d_rs] += p.calc_dip() * prefac * E;
@@ -273,17 +260,7 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
 
   if (dp3m.sum_mu2 > 0.) {
     dipole_assign(particles);
-    /* Gather information for FFT grid inside the nodes domain (inner local
-     * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0u].data(),
-                                       dp3m.rs_mesh_dip[1u].data(),
-                                       dp3m.rs_mesh_dip[2u].data()}};
-    dp3m.sm.gather_grid(meshes, comm_cart, dp3m.local_mesh.dim);
-    for (auto i = 0u; i < 3u; ++i) {
-      fft_perform_forw(dp3m.rs_mesh_dip[i].data(), dp3m.fft, comm_cart);
-    }
-    // Note: after these calls, the grids are in the order yzx and not xyz
-    // anymore!!!
+    dp3m.fft->perform_fwd_fft();
   }
 
   /* === k-space energy calculation  === */
@@ -295,26 +272,26 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
       /* i*k differentiation for dipolar gradients:
        * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
 
-      auto constexpr m_start = Utils::Vector3i::broadcast(0);
-      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
-      auto const offset = std::span(dp3m.fft.plan[3].start);
+      auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+      auto const &offset = dp3m.mesh.start;
       auto const &d_op = dp3m.d_op[0u];
-      auto const &mesh_dip = dp3m.rs_mesh_dip;
+      auto const &mesh_dip = dp3m.mesh.rs_fields;
+      auto const [KX, KY, KZ] = dp3m.fft->get_permutations();
       auto indices = Utils::Vector3i{};
       auto index = std::size_t(0u);
       auto it_energy = dp3m.g_energy.begin();
       auto node_energy = 0.;
-      for_each_3d(m_start, m_stop, indices, [&]() {
-        using namespace detail::FFT_indexing;
+      for_each_3d(mesh_start, dp3m.mesh.size, indices, [&]() {
+        auto const shift = indices + offset;
         // Re(mu)*k
-        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        auto const re = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
         ++index;
         // Im(mu)*k
-        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        auto const im = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
         ++index;
         node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
         std::advance(it_energy, 1);
@@ -344,30 +321,31 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
      ****************************/
     if (dp3m.sum_mu2 > 0.) {
       auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
-      auto constexpr m_start = Utils::Vector3i::broadcast(0);
-      auto const m_stop = std::span(dp3m.fft.plan[3].new_mesh);
-      auto const offset = std::span(dp3m.fft.plan[3].start);
+      auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+      auto const &mesh_stop = dp3m.mesh.size;
+      auto const offset = dp3m.mesh.start;
       auto const &d_op = dp3m.d_op[0u];
-      auto const &mesh_dip = dp3m.rs_mesh_dip;
+      auto const [KX, KY, KZ] = dp3m.fft->get_permutations();
+      auto &mesh_dip = dp3m.mesh.rs_fields;
       auto indices = Utils::Vector3i{};
       auto index = std::size_t(0u);
 
-      /* fill in ks_mesh array for torque calculation */
+      /* fill in mesh.ks_scalar array for torque calculation */
       auto it_energy = dp3m.g_energy.begin();
       index = 0u;
-      for_each_3d(m_start, m_stop, indices, [&]() {
-        using namespace detail::FFT_indexing;
+      for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+        auto const shift = indices + offset;
         // Re(mu)*k
-        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        dp3m.ks_mesh[index] = *it_energy * re;
+        auto const re = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        dp3m.mesh.ks_scalar[index] = *it_energy * re;
         ++index;
         // Im(mu)*k
-        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
-        dp3m.ks_mesh[index] = *it_energy * im;
+        auto const im = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        dp3m.mesh.ks_scalar[index] = *it_energy * im;
         ++index;
         std::advance(it_energy, 1);
       });
@@ -375,21 +353,16 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
       /* Force component loop */
       for (int d = 0; d < 3; d++) {
         index = 0u;
-        for_each_3d(m_start, m_stop, indices, [&]() {
+        for_each_3d(mesh_start, mesh_stop, indices, [&]() {
           auto const pos = indices[d] + offset[d];
-          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
+          dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index];
           ++index;
-          dp3m.rs_mesh[index] = d_op[pos] * dp3m.ks_mesh[index];
+          dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index];
           ++index;
         });
-
-        /* Back FFT force component mesh */
-        fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart);
-        /* redistribute force component mesh */
-        dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart,
-                            dp3m.local_mesh.dim);
+        dp3m.fft->perform_space_back_fft();
         /* Assign force component from mesh to particle */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
+        auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignTorques, 1, 7>(
             dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles);
       }
@@ -398,57 +371,50 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
          DIPOLAR FORCES (k-space)
       ****************************/
       // Compute forces after torques because the algorithm below overwrites the
-      // grids dp3m.rs_mesh_dip !
+      // grids dp3m.mesh.rs_fields !
       // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
       // number to 6 !
-      /* fill in ks_mesh array for force calculation */
+      /* fill in mesh.ks_scalar array for force calculation */
       auto it_force = dp3m.g_force.begin();
       index = 0u;
-      for_each_3d(m_start, m_stop, indices, [&]() {
-        using namespace detail::FFT_indexing;
+      for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+        auto const shift = indices + offset;
         // Re(mu)*k
-        auto const re = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        auto const re = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
         ++index;
         // Im(mu)*k
-        auto const im = mesh_dip[0u][index] * d_op[indices[KX] + offset[KX]] +
-                        mesh_dip[1u][index] * d_op[indices[KY] + offset[KY]] +
-                        mesh_dip[2u][index] * d_op[indices[KZ] + offset[KZ]];
+        auto const im = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
         ++index;
-        dp3m.ks_mesh[index - 2] = *it_force * im;
-        dp3m.ks_mesh[index - 1] = *it_force * (-re);
+        dp3m.mesh.ks_scalar[index - 2] = *it_force * im;
+        dp3m.mesh.ks_scalar[index - 1] = *it_force * (-re);
         std::advance(it_force, 1);
       });
 
       /* Force component loop */
       for (int d = 0; d < 3; d++) {
-        auto &rs_mesh_dip = dp3m.rs_mesh_dip;
         index = 0u;
-        for_each_3d(m_start, m_stop, indices, [&]() {
-          using namespace detail::FFT_indexing;
-          auto const f1 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
-          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f1;
-          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f1;
-          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f1;
+        for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+          auto const shift = indices + offset;
+          auto const f1 =
+              d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index];
+          mesh_dip[0u][index] = d_op[shift[KX]] * f1;
+          mesh_dip[1u][index] = d_op[shift[KY]] * f1;
+          mesh_dip[2u][index] = d_op[shift[KZ]] * f1;
           ++index;
-          auto const f2 = d_op[indices[d] + offset[d]] * dp3m.ks_mesh[index];
-          rs_mesh_dip[0][index] = d_op[indices[KX] + offset[KX]] * f2;
-          rs_mesh_dip[1][index] = d_op[indices[KY] + offset[KY]] * f2;
-          rs_mesh_dip[2][index] = d_op[indices[KZ] + offset[KZ]] * f2;
+          auto const f2 =
+              d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index];
+          mesh_dip[0u][index] = d_op[shift[KX]] * f2;
+          mesh_dip[1u][index] = d_op[shift[KY]] * f2;
+          mesh_dip[2u][index] = d_op[shift[KZ]] * f2;
           ++index;
         });
-        /* Back FFT force component mesh */
-        for (auto i = 0u; i < 3u; ++i) {
-          fft_perform_back(rs_mesh_dip[i].data(), false, dp3m.fft, comm_cart);
-        }
-        /* redistribute force component mesh */
-        std::array<double *, 3> meshes = {{rs_mesh_dip[0].data(),
-                                           rs_mesh_dip[1].data(),
-                                           rs_mesh_dip[2].data()}};
-        dp3m.sm.spread_grid(meshes, comm_cart, dp3m.local_mesh.dim);
+        dp3m.fft->perform_field_back_fft();
         /* Assign force component from mesh to particle */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
+        auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignForces, 1, 7>(
             dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
             particles);
@@ -490,18 +456,18 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
   std::size_t ip = 0u;
   for (auto const &p : particles) {
     auto const dip = p.calc_dip();
-    mx[ip] = dip[0];
-    my[ip] = dip[1];
-    mz[ip] = dip[2];
+    mx[ip] = dip[0u];
+    my[ip] = dip[1u];
+    mz[ip] = dip[2u];
     ip++;
   }
 
   // we will need the sum of all dipolar momenta vectors
   auto local_dip = Utils::Vector3d{};
   for (std::size_t i = 0u; i < n_local_part; i++) {
-    local_dip[0] += mx[i];
-    local_dip[1] += my[i];
-    local_dip[2] += mz[i];
+    local_dip[0u] += mx[i];
+    local_dip[1u] += my[i];
+    local_dip[2u] += mz[i];
   }
   auto const box_dip =
       boost::mpi::all_reduce(comm_cart, local_dip, std::plus<>());
@@ -523,17 +489,17 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
     std::vector<double> sumiz(n_local_part);
 
     for (std::size_t i = 0u; i < n_local_part; i++) {
-      sumix[i] = my[i] * box_dip[2] - mz[i] * box_dip[1];
-      sumiy[i] = mz[i] * box_dip[0] - mx[i] * box_dip[2];
-      sumiz[i] = mx[i] * box_dip[1] - my[i] * box_dip[0];
+      sumix[i] = my[i] * box_dip[2u] - mz[i] * box_dip[1u];
+      sumiy[i] = mz[i] * box_dip[0u] - mx[i] * box_dip[2u];
+      sumiz[i] = mx[i] * box_dip[1u] - my[i] * box_dip[0u];
     }
 
     ip = 0u;
     for (auto &p : particles) {
       auto &torque = p.torque();
-      torque[0] -= pref * sumix[ip];
-      torque[1] -= pref * sumiy[ip];
-      torque[2] -= pref * sumiz[ip];
+      torque[0u] -= pref * sumix[ip];
+      torque[1u] -= pref * sumiy[ip];
+      torque[2u] -= pref * sumiz[ip];
       ip++;
     }
   }
@@ -542,27 +508,23 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
 }
 
 void DipolarP3M::calc_influence_function_force() {
-  auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
-
-  dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size,
-                                            get_system().box_geo->length_inv());
+  dp3m.g_force =
+      grid_influence_function<3>(dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
+                                 get_system().box_geo->length_inv());
 }
 
 void DipolarP3M::calc_influence_function_energy() {
-  auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
-
-  dp3m.g_energy = grid_influence_function<2>(
-      dp3m.params, start, start + size, get_system().box_geo->length_inv());
+  dp3m.g_energy =
+      grid_influence_function<2>(dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
+                                 get_system().box_geo->length_inv());
 }
 
 class DipolarTuningAlgorithm : public TuningAlgorithm {
-  dp3m_data_struct &dp3m;
+  decltype(DipolarP3M::dp3m) &dp3m;
   int m_mesh_max = -1, m_mesh_min = -1;
 
 public:
-  DipolarTuningAlgorithm(System::System &system, dp3m_data_struct &input_dp3m,
+  DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m,
                          double prefactor, int timings)
       : TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m} {}
 
@@ -708,8 +670,8 @@ void DipolarP3M::tune() {
 static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
                                     double mesh_i, int cao, double alpha_L_i) {
 
-  auto constexpr m_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
-  auto constexpr m_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
+  auto constexpr mesh_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr mesh_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
   auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
   auto alias1 = 0.;
   auto alias2 = 0.;
@@ -718,7 +680,7 @@ static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
   Utils::Vector3i nm{};
   Utils::Vector3d fnm{};
   for_each_3d(
-      m_start, m_stop, indices,
+      mesh_start, mesh_stop, indices,
       [&]() {
         auto const norm_sq = nm.norm2();
         auto const ex = std::exp(-factor1 * norm_sq);
@@ -741,14 +703,14 @@ static double dp3m_k_space_error(double box_size, int mesh, int cao,
   auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
   auto const mesh_i = 1. / static_cast<double>(mesh);
   auto const alpha_L_i = 1. / alpha_L;
-  auto const m_stop = Utils::Vector3i::broadcast(mesh / 2);
-  auto const m_start = -m_stop;
+  auto const mesh_stop = Utils::Vector3i::broadcast(mesh / 2);
+  auto const mesh_start = -mesh_stop;
   auto indices = Utils::Vector3i{};
   auto values = Utils::Vector3d{};
   auto he_q = 0.;
 
   for_each_3d(
-      m_start, m_stop, indices,
+      mesh_start, mesh_stop, indices,
       [&]() {
         if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
           auto const n2 = indices.norm2();
@@ -926,4 +888,5 @@ void npt_add_virial_magnetic_contribution(double energy) {
   npt_add_virial_contribution(energy);
 }
 #endif // NPT
+
 #endif // DP3M
diff --git a/src/core/magnetostatics/dp3m.hpp b/src/core/magnetostatics/dp3m.hpp
index cdbea9e8ac1..0bdf4f05ec2 100644
--- a/src/core/magnetostatics/dp3m.hpp
+++ b/src/core/magnetostatics/dp3m.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -39,7 +39,6 @@
 
 #include "p3m/common.hpp"
 #include "p3m/data_struct.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/interpolation.hpp"
 #include "p3m/send_mesh.hpp"
 
@@ -52,6 +51,7 @@
 #include <array>
 #include <cmath>
 #include <numbers>
+#include <utility>
 #include <vector>
 
 #ifdef NPT
@@ -59,42 +59,28 @@
 void npt_add_virial_magnetic_contribution(double energy);
 #endif
 
-struct dp3m_data_struct : public p3m_data_struct_base {
-  explicit dp3m_data_struct(P3MParameters &&parameters)
-      : p3m_data_struct_base{std::move(parameters)} {}
-
-  /** local mesh. */
-  P3MLocalMesh local_mesh;
-  /** real space mesh (local) for CA/FFT. */
-  fft_vector<double> rs_mesh;
-  /** real space mesh (local) for CA/FFT of the dipolar field. */
-  std::array<fft_vector<double>, 3> rs_mesh_dip;
-  /** k-space mesh (local) for k-space calculation and FFT. */
-  std::vector<double> ks_mesh;
-
-  /** number of dipolar particles (only on head node). */
-  int sum_dip_part = 0;
-  /** Sum of square of magnetic dipoles (only on head node). */
-  double sum_mu2 = 0.;
-
-  /** position shift for calculation of first assignment mesh point. */
-  double pos_shift = 0.;
+/** @brief Dipolar P3M solver. */
+struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
+  struct p3m_data_struct_impl : public p3m_data_struct {
+    explicit p3m_data_struct_impl(P3MParameters &&parameters)
+        : p3m_data_struct{std::move(parameters)} {}
 
-  p3m_interpolation_cache inter_weights;
+    /** number of dipolar particles (only on head node). */
+    int sum_dip_part = 0;
+    /** Sum of square of magnetic dipoles (only on head node). */
+    double sum_mu2 = 0.;
 
-  /** send/recv mesh sizes */
-  p3m_send_mesh sm;
+    /** position shift for calculation of first assignment mesh point. */
+    double pos_shift = 0.;
 
-  /** cached k-space self-energy correction */
-  double energy_correction = 0.;
+    p3m_interpolation_cache inter_weights;
 
-  fft_data_struct fft;
-};
+    /** cached k-space self-energy correction */
+    double energy_correction = 0.;
+  };
 
-/** @brief Dipolar P3M solver. */
-struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
   /** Dipolar P3M parameters. */
-  dp3m_data_struct dp3m;
+  p3m_data_struct_impl dp3m;
 
   /** Magnetostatics prefactor. */
   int tune_timings;
diff --git a/src/core/p3m/CMakeLists.txt b/src/core/p3m/CMakeLists.txt
index f05697c8378..2a6dd407baf 100644
--- a/src/core/p3m/CMakeLists.txt
+++ b/src/core/p3m/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2022 The ESPResSo project
+# Copyright (C) 2018-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,13 +17,5 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-if(FFTW3_FOUND)
-  target_link_libraries(espresso_core PUBLIC FFTW3::FFTW3)
-endif()
-
-target_sources(
-  espresso_core
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/TuningAlgorithm.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/send_mesh.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp)
+target_sources(espresso_core PRIVATE common.cpp send_mesh.cpp
+                                     TuningAlgorithm.cpp FFTBackendLegacy.cpp)
diff --git a/src/core/p3m/FFTBackendLegacy.cpp b/src/core/p3m/FFTBackendLegacy.cpp
new file mode 100644
index 00000000000..76fcc7f021d
--- /dev/null
+++ b/src/core/p3m/FFTBackendLegacy.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include "FFTBackendLegacy.hpp"
+
+#include "communication.hpp"
+
+#include "fft/fft.hpp"
+
+#include <utils/Vector.hpp>
+
+#include <array>
+#include <memory>
+#include <span>
+#include <utility>
+
+FFTBackendLegacy::FFTBackendLegacy(p3m_data_struct &obj)
+    : FFTBackend(obj),
+      fft{std::make_unique<fft::fft_data_struct>(
+          ::Communication::mpiCallbacksHandle()->share_mpi_env())} {}
+
+FFTBackendLegacy::~FFTBackendLegacy() = default;
+
+void FFTBackendLegacy::update_mesh_data() {
+  auto const mesh_size_ptr = fft->get_mesh_size();
+  auto const mesh_start_ptr = fft->get_mesh_start();
+  for (auto i = 0u; i < 3u; ++i) {
+    mesh.size[i] = mesh_size_ptr[i];
+    mesh.start[i] = mesh_start_ptr[i];
+  }
+  mesh.stop = mesh.start + mesh.size;
+  mesh.ks_scalar = std::span(ks_mesh);
+  mesh.rs_scalar = std::span(rs_mesh);
+  for (auto i = 0u; i < 3u; ++i) {
+    mesh.rs_fields[i] = std::span(rs_mesh_fields[i]);
+  }
+}
+
+void FFTBackendLegacy::init_fft() {
+  mesh_comm.resize(::comm_cart, local_mesh);
+  auto ca_mesh_size = fft->initialize_fft(
+      ::comm_cart, local_mesh.dim, local_mesh.margin, params.mesh,
+      params.mesh_off, mesh.ks_pnum, ::communicator.node_grid);
+  rs_mesh.resize(ca_mesh_size);
+  if (dipolar) {
+    ks_mesh.resize(ca_mesh_size);
+  }
+  for (auto &rs_mesh_field : rs_mesh_fields) {
+    rs_mesh_field.resize(ca_mesh_size);
+  }
+  update_mesh_data();
+}
+
+void FFTBackendLegacy::perform_field_back_fft() {
+  /* Back FFT force component mesh */
+  for (auto &rs_mesh_field : rs_mesh_fields) {
+    fft->backward_fft(::comm_cart, rs_mesh_field.data(),
+                      check_complex_residuals);
+  }
+  /* redistribute force component mesh */
+  std::array<double *, 3u> meshes = {{rs_mesh_fields[0u].data(),
+                                      rs_mesh_fields[1u].data(),
+                                      rs_mesh_fields[2u].data()}};
+  mesh_comm.spread_grid(::comm_cart, meshes, local_mesh.dim);
+}
+
+void FFTBackendLegacy::perform_fwd_fft() {
+  if (dipolar) {
+    std::array<double *, 3u> meshes = {{rs_mesh_fields[0u].data(),
+                                        rs_mesh_fields[1u].data(),
+                                        rs_mesh_fields[2u].data()}};
+    mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim);
+    for (auto &rs_mesh_field : rs_mesh_fields) {
+      fft->forward_fft(::comm_cart, rs_mesh_field.data());
+    }
+  } else {
+    mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+    fft->forward_fft(::comm_cart, rs_mesh.data());
+  }
+  update_mesh_data();
+}
+
+void FFTBackendLegacy::perform_space_back_fft() {
+  /* Back FFT force component mesh */
+  fft->backward_fft(::comm_cart, rs_mesh.data(), check_complex_residuals);
+  /* redistribute force component mesh */
+  mesh_comm.spread_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+}
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/FFTBackendLegacy.hpp b/src/core/p3m/FFTBackendLegacy.hpp
new file mode 100644
index 00000000000..3b4ecb41c7d
--- /dev/null
+++ b/src/core/p3m/FFTBackendLegacy.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include "common.hpp"
+#include "data_struct.hpp"
+#include "send_mesh.hpp"
+
+#include "fft/vector.hpp"
+
+#include <array>
+#include <memory>
+#include <tuple>
+
+namespace fft {
+struct fft_data_struct;
+} // namespace fft
+
+/**
+ * @brief Historic FFT backend based on FFTW3.
+ * The 3D FFT is split into three 1D FFTs.
+ */
+class FFTBackendLegacy : public FFTBackend {
+  std::unique_ptr<fft::fft_data_struct> fft;
+  /** @brief k-space mesh (local) for k-space calculations. */
+  std::vector<double> ks_mesh;
+  /** @brief real-space mesh (local) for CA/FFT. */
+  fft::vector<double> rs_mesh;
+  /** @brief real-space mesh (local) for the electric or dipolar field. */
+  std::array<fft::vector<double>, 3> rs_mesh_fields;
+  p3m_send_mesh mesh_comm;
+
+public:
+  explicit FFTBackendLegacy(p3m_data_struct &obj);
+  ~FFTBackendLegacy() override;
+  void init_fft() override;
+  void perform_fwd_fft() override;
+  void perform_field_back_fft() override;
+  void perform_space_back_fft() override;
+  void update_mesh_data();
+
+  /**
+   * @brief Index helpers for reciprocal space.
+   * After the FFT the data is in order YZX, which
+   * means that Y is the slowest changing index.
+   */
+  std::tuple<int, int, int> get_permutations() const override {
+    constexpr static int KX = 2;
+    constexpr static int KY = 0;
+    constexpr static int KZ = 1;
+    return {KX, KY, KZ};
+  }
+};
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp
index fb8023eba74..6290c46d6ab 100644
--- a/src/core/p3m/common.cpp
+++ b/src/core/p3m/common.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -21,7 +21,7 @@
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "common.hpp"
 
@@ -105,4 +105,4 @@ void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const &params,
   q_21_off = dim[2] * (dim[1] - params.cao);
 }
 
-#endif /* defined(P3M) || defined(DP3M) */
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp
index 6bc30828ee5..af9a9af5959 100644
--- a/src/core/p3m/common.hpp
+++ b/src/core/p3m/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -45,27 +45,15 @@
 /** This value indicates metallic boundary conditions. */
 auto constexpr P3M_EPSILON_METALLIC = 0.0;
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "LocalBox.hpp"
 
-#include <array>
 #include <cstddef>
+#include <span>
 #include <stdexcept>
-#include <vector>
-
-namespace detail {
-/** @brief Index helpers for direct and reciprocal space.
- *  After the FFT the data is in order YZX, which
- *  means that Y is the slowest changing index.
- */
-namespace FFT_indexing {
-enum FFT_REAL_VECTOR : int { RX = 0, RY = 1, RZ = 2 };
-enum FFT_WAVE_VECTOR : int { KY = 0, KZ = 1, KX = 2 };
-} // namespace FFT_indexing
-} // namespace detail
 
-/** Structure to hold P3M parameters and some dependent variables. */
+/** @brief Structure to hold P3M parameters and some dependent variables. */
 struct P3MParameters {
   /** tuning or production? */
   bool tuning;
@@ -181,9 +169,8 @@ struct P3MParameters {
   }
 };
 
-/** Structure for local mesh parameters. */
+/** @brief Properties of the local mesh. */
 struct P3MLocalMesh {
-  /* local mesh characterization. */
   /** dimension (size) of local mesh. */
   Utils::Vector3i dim;
   /** number of local mesh points. */
@@ -228,7 +215,27 @@ struct P3MLocalMesh {
                           double space_layer);
 };
 
-#endif /* P3M || DP3M */
+/** @brief Local mesh FFT buffers. */
+struct P3MFFTMesh {
+  /** @brief k-space scalar mesh for k-space calculations. */
+  std::span<double> ks_scalar;
+  /** @brief real-space scalar mesh for charge assignment and FFT. */
+  std::span<double> rs_scalar;
+  /** @brief real-space 3D meshes for the electric or dipolar field. */
+  std::array<std::span<double>, 3> rs_fields;
+
+  /** @brief Indices of the lower left corner of the local mesh grid. */
+  Utils::Vector3i start;
+  /** @brief Indices of the upper right corner of the local mesh grid. */
+  Utils::Vector3i stop;
+  /** @brief Extents of the local mesh grid. */
+  Utils::Vector3i size;
+
+  /** @brief number of permutations in k_space */
+  int ks_pnum = 0;
+};
+
+#endif // defined(P3M) or defined(DP3M)
 
 namespace detail {
 /** Calculate indices that shift @ref P3MParameters::mesh "mesh" by `mesh/2`.
diff --git a/src/core/p3m/data_struct.hpp b/src/core/p3m/data_struct.hpp
index 410cffbbf94..3e0a9a1614a 100644
--- a/src/core/p3m/data_struct.hpp
+++ b/src/core/p3m/data_struct.hpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
- * Copyright (C) 2002-2010
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
  * This file is part of ESPResSo.
@@ -23,20 +23,38 @@
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "common.hpp"
 
 #include <array>
+#include <cassert>
+#include <memory>
+#include <tuple>
+#include <utility>
 #include <vector>
 
-struct p3m_data_struct_base {
-  explicit p3m_data_struct_base(P3MParameters &&parameters)
-      : params{std::move(parameters)}, ks_pnum{0} {}
+class FFTBackend;
 
+/**
+ * @brief Base class for the electrostatics and magnetostatics P3M algorithms.
+ * Contains a handle to the FFT backend, information about the local mesh,
+ * the differential operator, and various buffers.
+ */
+struct p3m_data_struct {
+  explicit p3m_data_struct(P3MParameters &&parameters)
+      : params{std::move(parameters)} {}
+
+  /** @brief P3M base parameters. */
   P3MParameters params;
+  /** @brief Local mesh properties. */
+  P3MLocalMesh local_mesh;
+  /** @brief Local mesh FFT buffers. */
+  P3MFFTMesh mesh;
 
-  /** Spatial differential operator in k-space. We use an i*k differentiation.
+  /**
+   * @brief Spatial differential operator in k-space.
+   * We use an i*k differentiation.
    */
   std::array<std::vector<int>, 3> d_op;
   /** Force optimised influence function (k-space) */
@@ -44,8 +62,8 @@ struct p3m_data_struct_base {
   /** Energy optimised influence function (k-space) */
   std::vector<double> g_energy;
 
-  /** number of permutations in k_space */
-  int ks_pnum;
+  /** FFT backend. */
+  std::unique_ptr<FFTBackend> fft;
 
   /** Calculate the Fourier transformed differential operator.
    *  Remark: This is done on the level of n-vectors and not k-vectors,
@@ -54,6 +72,50 @@ struct p3m_data_struct_base {
   void calc_differential_operator() {
     d_op = detail::calc_meshift(params.mesh, true);
   }
+
+  template <typename T, class... Args> void make_fft_instance(Args... args) {
+    assert(fft == nullptr);
+    fft = std::make_unique<T>(*this, args...);
+  }
+
+  inline void set_dipolar_mode();
 };
 
-#endif
+/**
+ * @brief API for the FFT backend of the P3M algorithm.
+ * Any FFT backend must implement this interface.
+ * The backend can read some members of @ref p3m_data_struct
+ * but can only modify the FFT buffers in @ref P3MFFTMesh.
+ */
+class FFTBackend {
+protected:
+  P3MParameters const &params;
+  P3MLocalMesh const &local_mesh;
+  P3MFFTMesh &mesh;
+
+public:
+  bool dipolar = false;
+  bool check_complex_residuals = false;
+  explicit FFTBackend(p3m_data_struct &obj)
+      : params{obj.params}, local_mesh{obj.local_mesh}, mesh{obj.mesh} {}
+  virtual ~FFTBackend() = default;
+  /** @brief Initialize the FFT plans and buffers. */
+  virtual void init_fft() = 0;
+  /** @brief Carry out the forward FFT of the scalar or field meshes. */
+  virtual void perform_fwd_fft() = 0;
+  /** @brief Carry out the backward FFT of the scalar mesh. */
+  virtual void perform_field_back_fft() = 0;
+  /** @brief Carry out the backward FFT of the field meshes. */
+  virtual void perform_space_back_fft() = 0;
+  /** @brief Get indices of the k-space data layout. */
+  virtual std::tuple<int, int, int> get_permutations() const = 0;
+};
+
+void p3m_data_struct::set_dipolar_mode() {
+  assert(g_energy.empty() and g_force.empty());
+  assert(d_op[0u].empty() and d_op[1u].empty() and d_op[2u].empty());
+  assert(not fft->dipolar);
+  fft->dipolar = true;
+}
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/for_each_3d.hpp b/src/core/p3m/for_each_3d.hpp
index 9f44871bd54..cc4bb2553d0 100644
--- a/src/core/p3m/for_each_3d.hpp
+++ b/src/core/p3m/for_each_3d.hpp
@@ -20,6 +20,7 @@
 #pragma once
 
 #include <concepts>
+#include <cstddef>
 
 namespace detail {
 
diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp
index 24853d43e25..eb259d89d92 100644
--- a/src/core/p3m/influence_function.hpp
+++ b/src/core/p3m/influence_function.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 The ESPResSo project
+ * Copyright (C) 2019-2024 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -47,7 +47,7 @@
  *
  * @param cao Charge assignment order.
  * @param alpha Ewald splitting parameter.
- * @param k k Vector to evaluate the function for.
+ * @param k k-vector to evaluate the function for.
  * @param h Grid spacing.
  */
 template <std::size_t S, std::size_t m>
@@ -74,7 +74,6 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k,
   for_each_3d(
       m_start, m_stop, indices,
       [&]() {
-        using namespace detail::FFT_indexing;
         auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
         auto const km2 = km.norm2();
         auto const exponent = exponent_prefactor * km2;
@@ -102,17 +101,22 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k,
  *          1 for electric field...
  * @tparam m Number of aliasing terms to take into account.
  *
- * @param params P3M parameters
- * @param n_start Lower left corner of the grid
+ * @param params P3M parameters.
+ * @param n_start Lower left corner of the grid.
  * @param n_stop Upper right corner of the grid.
- * @param inv_box_l Inverse box length
+ * @param KX k-space x-axis index.
+ * @param KY k-space y-axis index.
+ * @param KZ k-space z-axis index.
+ * @param inv_box_l Inverse box length.
  * @return Values of G_opt at regular grid points.
  */
 template <std::size_t S, std::size_t m = 0>
-std::vector<double> grid_influence_function(const P3MParameters &params,
-                                            const Utils::Vector3i &n_start,
-                                            const Utils::Vector3i &n_stop,
-                                            const Utils::Vector3d &inv_box_l) {
+std::vector<double> grid_influence_function(P3MParameters const &params,
+                                            Utils::Vector3i const &n_start,
+                                            Utils::Vector3i const &n_stop,
+                                            int const KX, int const KY,
+                                            int const KZ,
+                                            Utils::Vector3d const &inv_box_l) {
 
   auto const shifts = detail::calc_meshift(params.mesh);
   auto const size = n_stop - n_start;
@@ -132,14 +136,13 @@ std::vector<double> grid_influence_function(const P3MParameters &params,
   auto index = std::size_t(0u);
 
   for_each_3d(n_start, n_stop, indices, [&]() {
-    using namespace detail::FFT_indexing;
-    if ((indices[KX] % half_mesh[RX] != 0) or
-        (indices[KY] % half_mesh[RY] != 0) or
-        (indices[KZ] % half_mesh[RZ] != 0)) {
+    if ((indices[KX] % half_mesh[0u] != 0) or
+        (indices[KY] % half_mesh[1u] != 0) or
+        (indices[KZ] % half_mesh[2u] != 0)) {
       auto const k =
-          Utils::Vector3d{{shifts[RX][indices[KX]] * wavevector[RX],
-                           shifts[RY][indices[KY]] * wavevector[RY],
-                           shifts[RZ][indices[KZ]] * wavevector[RZ]}};
+          Utils::Vector3d{{shifts[0u][indices[KX]] * wavevector[0u],
+                           shifts[1u][indices[KY]] * wavevector[1u],
+                           shifts[2u][indices[KZ]] * wavevector[2u]}};
       g[index] = G_opt<S, m>(params.cao, params.alpha, k, params.a);
     }
     ++index;
diff --git a/src/core/p3m/interpolation.hpp b/src/core/p3m/interpolation.hpp
index 355307b61f4..8aa6b76525e 100644
--- a/src/core/p3m/interpolation.hpp
+++ b/src/core/p3m/interpolation.hpp
@@ -18,8 +18,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_INTERPOLATION_HPP
-#define ESPRESSO_CORE_P3M_INTERPOLATION_HPP
+
+#pragma once
 
 #include <utils/index.hpp>
 #include <utils/math/bspline.hpp>
@@ -206,5 +206,3 @@ void p3m_interpolate(P3MLocalMesh const &local_mesh,
     q_ind += local_mesh.q_21_off;
   }
 }
-
-#endif
diff --git a/src/core/p3m/send_mesh.cpp b/src/core/p3m/send_mesh.cpp
index 5eea122b69a..9bb377a7688 100644
--- a/src/core/p3m/send_mesh.cpp
+++ b/src/core/p3m/send_mesh.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -18,12 +18,13 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
+#include "fft/fft.hpp"
 #include "p3m/common.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include <utils/Vector.hpp>
@@ -70,8 +71,8 @@ static void p3m_add_block(double const *in, double *out, int const start[3],
   }
 }
 
-void p3m_send_mesh::resize(const boost::mpi::communicator &comm,
-                           const P3MLocalMesh &local_mesh) {
+void p3m_send_mesh::resize(boost::mpi::communicator const &comm,
+                           P3MLocalMesh const &local_mesh) {
   int done[3] = {0, 0, 0};
   /* send grids */
   for (int i = 0; i < 3; i++) {
@@ -145,9 +146,9 @@ void p3m_send_mesh::resize(const boost::mpi::communicator &comm,
   }
 }
 
-void p3m_send_mesh::gather_grid(std::span<double *> meshes,
-                                const boost::mpi::communicator &comm,
-                                const Utils::Vector3i &dim) {
+void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm,
+                                std::span<double *> meshes,
+                                Utils::Vector3i const &dim) {
   auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm);
   send_grid.resize(max * meshes.size());
   recv_grid.resize(max * meshes.size());
@@ -159,8 +160,8 @@ void p3m_send_mesh::gather_grid(std::span<double *> meshes,
     /* pack send block */
     if (s_size[s_dir] > 0)
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir],
-                       s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
+        fft::fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir],
+                            s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
       }
 
     /* communication */
@@ -183,9 +184,9 @@ void p3m_send_mesh::gather_grid(std::span<double *> meshes,
   }
 }
 
-void p3m_send_mesh::spread_grid(std::span<double *> meshes,
-                                const boost::mpi::communicator &comm,
-                                const Utils::Vector3i &dim) {
+void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm,
+                                std::span<double *> meshes,
+                                Utils::Vector3i const &dim) {
   auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm);
   send_grid.resize(max * meshes.size());
   recv_grid.resize(max * meshes.size());
@@ -197,8 +198,8 @@ void p3m_send_mesh::spread_grid(std::span<double *> meshes,
     /* pack send block */
     if (r_size[r_dir] > 0)
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir],
-                       r_ld[r_dir], r_dim[r_dir], dim.data(), 1);
+        fft::fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir],
+                            r_ld[r_dir], r_dim[r_dir], dim.data(), 1);
       }
     /* communication */
     if (node_neighbors[r_dir] != comm.rank()) {
@@ -213,11 +214,11 @@ void p3m_send_mesh::spread_grid(std::span<double *> meshes,
     /* un pack recv block */
     if (s_size[s_dir] > 0) {
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i],
-                         s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
+        fft::fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i],
+                              s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
       }
     }
   }
 }
 
-#endif
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/send_mesh.hpp b/src/core/p3m/send_mesh.hpp
index 1ccffcd91dc..8fa73435880 100644
--- a/src/core/p3m/send_mesh.hpp
+++ b/src/core/p3m/send_mesh.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -18,12 +18,12 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_SEND_MESH_HPP
-#define ESPRESSO_CORE_P3M_SEND_MESH_HPP
+
+#pragma once
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "p3m/common.hpp"
 
@@ -66,22 +66,20 @@ class p3m_send_mesh {
   std::vector<double> recv_grid;
 
 public:
-  void resize(const boost::mpi::communicator &comm,
-              const P3MLocalMesh &local_mesh);
-  void gather_grid(std::span<double *> meshes,
-                   const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim);
-  void gather_grid(double *mesh, const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim) {
-    gather_grid(std::span(&mesh, 1u), comm, dim);
+  void resize(boost::mpi::communicator const &comm,
+              P3MLocalMesh const &local_mesh);
+  void gather_grid(boost::mpi::communicator const &comm,
+                   std::span<double *> meshes, Utils::Vector3i const &dim);
+  void gather_grid(boost::mpi::communicator const &comm, double *mesh,
+                   Utils::Vector3i const &dim) {
+    gather_grid(comm, std::span(&mesh, 1u), dim);
   }
-  void spread_grid(std::span<double *> meshes,
-                   const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim);
-  void spread_grid(double *mesh, const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim) {
-    spread_grid(std::span(&mesh, 1u), comm, dim);
+  void spread_grid(boost::mpi::communicator const &comm,
+                   std::span<double *> meshes, Utils::Vector3i const &dim);
+  void spread_grid(boost::mpi::communicator const &comm, double *mesh,
+                   Utils::Vector3i const &dim) {
+    spread_grid(comm, std::span(&mesh, 1u), dim);
   }
 };
-#endif
-#endif
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
index f917c8cc0fc..e0124cdc18e 100644
--- a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
+++ b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
@@ -49,6 +49,7 @@ namespace utf = boost::unit_test;
 #include "nonbonded_interactions/nonbonded_interaction_data.hpp"
 #include "observables/ParticleVelocities.hpp"
 #include "observables/PidObservable.hpp"
+#include "p3m/FFTBackendLegacy.hpp"
 #include "particle_node.hpp"
 #include "system/System.hpp"
 
@@ -313,6 +314,7 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) {
                              1e-3};
     auto solver =
         std::make_shared<CoulombP3M>(std::move(p3m), prefactor, 1, false, true);
+    solver->p3m.make_fft_instance<FFTBackendLegacy>();
     add_actor(comm, espresso::system, system.coulomb.impl->solver, solver,
               [&system]() { system.on_coulomb_change(); });
 
diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp
index 6e301a48da2..94cfc33f7fc 100644
--- a/src/core/unit_tests/fft_test.cpp
+++ b/src/core/unit_tests/fft_test.cpp
@@ -22,11 +22,12 @@
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include <boost/test/unit_test.hpp>
 
-#include "p3m/fft.hpp"
+#include "fft/fft.hpp"
+#include "fft/vector.hpp"
 #include "p3m/for_each_3d.hpp"
 
 #include <utils/Vector.hpp>
@@ -39,13 +40,8 @@
 #include <stdexcept>
 #include <vector>
 
-std::optional<std::vector<int>> find_comm_groups(Utils::Vector3i const &,
-                                                 Utils::Vector3i const &,
-                                                 std::span<int const>,
-                                                 std::span<int>, std::span<int>,
-                                                 std::span<int>, int);
-
 BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) {
+  using fft::find_comm_groups;
   int my_pos[3] = {0};
   int nodelist[4] = {0};
   int nodepos[12] = {0};
@@ -68,6 +64,7 @@ BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) {
 }
 
 BOOST_AUTO_TEST_CASE(fft_map_grid) {
+  using fft::map_3don2d_grid;
   {
     auto g3d = Utils::Vector3i{{3, 2, 1}};
     auto g2d = Utils::Vector3i{{3, 2, 1}};
@@ -161,7 +158,7 @@ BOOST_AUTO_TEST_CASE(fft_map_grid) {
 BOOST_AUTO_TEST_CASE(fft_exceptions) {
   auto constexpr size_max = std::numeric_limits<std::size_t>::max();
   auto constexpr bad_size = size_max / sizeof(int) + 1ul;
-  fft_allocator<int> allocator{};
+  fft::allocator<int> allocator{};
   BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr);
   BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length);
 }
@@ -203,6 +200,6 @@ BOOST_AUTO_TEST_CASE(for_each_3d_test) {
   }
 }
 
-#else  // defined(P3M) || defined(DP3M)
+#else  // defined(P3M) or defined(DP3M)
 int main(int argc, char **argv) {}
-#endif // defined(P3M) || defined(DP3M)
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/script_interface/electrostatics/CoulombP3M.hpp b/src/script_interface/electrostatics/CoulombP3M.hpp
index 38fc376835c..1d221502870 100644
--- a/src/script_interface/electrostatics/CoulombP3M.hpp
+++ b/src/script_interface/electrostatics/CoulombP3M.hpp
@@ -26,6 +26,7 @@
 #include "Actor.hpp"
 
 #include "core/electrostatics/p3m.hpp"
+#include "core/p3m/FFTBackendLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
@@ -88,6 +89,7 @@ class CoulombP3M : public Actor<CoulombP3M, ::CoulombP3M> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"), get_value<bool>(params, "verbose"),
           get_value<bool>(params, "check_complex_residuals"));
+      m_actor->p3m.make_fft_instance<FFTBackendLegacy>();
     });
     set_charge_neutrality_tolerance(params);
   }
diff --git a/src/script_interface/electrostatics/CoulombP3MGPU.hpp b/src/script_interface/electrostatics/CoulombP3MGPU.hpp
index 95ce143014c..fbb00e39c44 100644
--- a/src/script_interface/electrostatics/CoulombP3MGPU.hpp
+++ b/src/script_interface/electrostatics/CoulombP3MGPU.hpp
@@ -27,6 +27,7 @@
 #include "Actor.hpp"
 
 #include "core/electrostatics/p3m_gpu.hpp"
+#include "core/p3m/FFTBackendLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
@@ -89,6 +90,7 @@ class CoulombP3MGPU : public Actor<CoulombP3MGPU, ::CoulombP3MGPU> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"), get_value<bool>(params, "verbose"),
           get_value<bool>(params, "check_complex_residuals"));
+      m_actor->p3m.make_fft_instance<FFTBackendLegacy>(); // for CPU part
     });
     set_charge_neutrality_tolerance(params);
   }
diff --git a/src/script_interface/magnetostatics/DipolarP3M.hpp b/src/script_interface/magnetostatics/DipolarP3M.hpp
index d37733d75d6..77913690623 100644
--- a/src/script_interface/magnetostatics/DipolarP3M.hpp
+++ b/src/script_interface/magnetostatics/DipolarP3M.hpp
@@ -26,6 +26,7 @@
 #include "Actor.hpp"
 
 #include "core/magnetostatics/dp3m.hpp"
+#include "core/p3m/FFTBackendLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
@@ -85,6 +86,8 @@ class DipolarP3M : public Actor<DipolarP3M, ::DipolarP3M> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"),
           get_value<bool>(params, "verbose"));
+      m_actor->dp3m.make_fft_instance<FFTBackendLegacy>();
+      m_actor->dp3m.set_dipolar_mode();
     });
   }
 };

From 2adf306ac651703cb6bf1b6c3943297cbf86fbc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Thu, 4 Jul 2024 12:11:55 +0200
Subject: [PATCH 2/2] Hide FFT dipolar flag

---
 src/core/p3m/FFTBackendLegacy.cpp                     |  4 ++--
 src/core/p3m/FFTBackendLegacy.hpp                     |  3 ++-
 src/core/p3m/data_struct.hpp                          | 10 ----------
 src/core/unit_tests/EspressoSystemStandAlone_test.cpp |  2 +-
 src/script_interface/electrostatics/CoulombP3M.hpp    |  2 +-
 src/script_interface/electrostatics/CoulombP3MGPU.hpp |  2 +-
 src/script_interface/magnetostatics/DipolarP3M.hpp    |  3 +--
 7 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/src/core/p3m/FFTBackendLegacy.cpp b/src/core/p3m/FFTBackendLegacy.cpp
index 76fcc7f021d..146c241de19 100644
--- a/src/core/p3m/FFTBackendLegacy.cpp
+++ b/src/core/p3m/FFTBackendLegacy.cpp
@@ -36,8 +36,8 @@
 #include <span>
 #include <utility>
 
-FFTBackendLegacy::FFTBackendLegacy(p3m_data_struct &obj)
-    : FFTBackend(obj),
+FFTBackendLegacy::FFTBackendLegacy(p3m_data_struct &obj, bool dipolar)
+    : FFTBackend(obj), dipolar{dipolar},
       fft{std::make_unique<fft::fft_data_struct>(
           ::Communication::mpiCallbacksHandle()->share_mpi_env())} {}
 
diff --git a/src/core/p3m/FFTBackendLegacy.hpp b/src/core/p3m/FFTBackendLegacy.hpp
index 3b4ecb41c7d..2ae1a379aef 100644
--- a/src/core/p3m/FFTBackendLegacy.hpp
+++ b/src/core/p3m/FFTBackendLegacy.hpp
@@ -44,6 +44,7 @@ struct fft_data_struct;
  * The 3D FFT is split into three 1D FFTs.
  */
 class FFTBackendLegacy : public FFTBackend {
+  bool dipolar;
   std::unique_ptr<fft::fft_data_struct> fft;
   /** @brief k-space mesh (local) for k-space calculations. */
   std::vector<double> ks_mesh;
@@ -54,7 +55,7 @@ class FFTBackendLegacy : public FFTBackend {
   p3m_send_mesh mesh_comm;
 
 public:
-  explicit FFTBackendLegacy(p3m_data_struct &obj);
+  explicit FFTBackendLegacy(p3m_data_struct &obj, bool dipolar);
   ~FFTBackendLegacy() override;
   void init_fft() override;
   void perform_fwd_fft() override;
diff --git a/src/core/p3m/data_struct.hpp b/src/core/p3m/data_struct.hpp
index 3e0a9a1614a..9b607031991 100644
--- a/src/core/p3m/data_struct.hpp
+++ b/src/core/p3m/data_struct.hpp
@@ -77,8 +77,6 @@ struct p3m_data_struct {
     assert(fft == nullptr);
     fft = std::make_unique<T>(*this, args...);
   }
-
-  inline void set_dipolar_mode();
 };
 
 /**
@@ -94,7 +92,6 @@ class FFTBackend {
   P3MFFTMesh &mesh;
 
 public:
-  bool dipolar = false;
   bool check_complex_residuals = false;
   explicit FFTBackend(p3m_data_struct &obj)
       : params{obj.params}, local_mesh{obj.local_mesh}, mesh{obj.mesh} {}
@@ -111,11 +108,4 @@ class FFTBackend {
   virtual std::tuple<int, int, int> get_permutations() const = 0;
 };
 
-void p3m_data_struct::set_dipolar_mode() {
-  assert(g_energy.empty() and g_force.empty());
-  assert(d_op[0u].empty() and d_op[1u].empty() and d_op[2u].empty());
-  assert(not fft->dipolar);
-  fft->dipolar = true;
-}
-
 #endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
index e0124cdc18e..ce9258b733c 100644
--- a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
+++ b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
@@ -314,7 +314,7 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) {
                              1e-3};
     auto solver =
         std::make_shared<CoulombP3M>(std::move(p3m), prefactor, 1, false, true);
-    solver->p3m.make_fft_instance<FFTBackendLegacy>();
+    solver->p3m.make_fft_instance<FFTBackendLegacy>(false);
     add_actor(comm, espresso::system, system.coulomb.impl->solver, solver,
               [&system]() { system.on_coulomb_change(); });
 
diff --git a/src/script_interface/electrostatics/CoulombP3M.hpp b/src/script_interface/electrostatics/CoulombP3M.hpp
index 1d221502870..34ea03c20e1 100644
--- a/src/script_interface/electrostatics/CoulombP3M.hpp
+++ b/src/script_interface/electrostatics/CoulombP3M.hpp
@@ -89,7 +89,7 @@ class CoulombP3M : public Actor<CoulombP3M, ::CoulombP3M> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"), get_value<bool>(params, "verbose"),
           get_value<bool>(params, "check_complex_residuals"));
-      m_actor->p3m.make_fft_instance<FFTBackendLegacy>();
+      m_actor->p3m.make_fft_instance<FFTBackendLegacy>(false);
     });
     set_charge_neutrality_tolerance(params);
   }
diff --git a/src/script_interface/electrostatics/CoulombP3MGPU.hpp b/src/script_interface/electrostatics/CoulombP3MGPU.hpp
index fbb00e39c44..187fde4308c 100644
--- a/src/script_interface/electrostatics/CoulombP3MGPU.hpp
+++ b/src/script_interface/electrostatics/CoulombP3MGPU.hpp
@@ -90,7 +90,7 @@ class CoulombP3MGPU : public Actor<CoulombP3MGPU, ::CoulombP3MGPU> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"), get_value<bool>(params, "verbose"),
           get_value<bool>(params, "check_complex_residuals"));
-      m_actor->p3m.make_fft_instance<FFTBackendLegacy>(); // for CPU part
+      m_actor->p3m.make_fft_instance<FFTBackendLegacy>(false); // for CPU part
     });
     set_charge_neutrality_tolerance(params);
   }
diff --git a/src/script_interface/magnetostatics/DipolarP3M.hpp b/src/script_interface/magnetostatics/DipolarP3M.hpp
index 77913690623..7474345e9d4 100644
--- a/src/script_interface/magnetostatics/DipolarP3M.hpp
+++ b/src/script_interface/magnetostatics/DipolarP3M.hpp
@@ -86,8 +86,7 @@ class DipolarP3M : public Actor<DipolarP3M, ::DipolarP3M> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"),
           get_value<bool>(params, "verbose"));
-      m_actor->dp3m.make_fft_instance<FFTBackendLegacy>();
-      m_actor->dp3m.set_dipolar_mode();
+      m_actor->dp3m.make_fft_instance<FFTBackendLegacy>(true);
     });
   }
 };