From cbea0705038f917309fd2bc482ea085fdbdc2f0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?=
Date: Mon, 5 Jun 2023 19:28:15 +0200
Subject: [PATCH 1/5] core: Move CUDA files to subfolder

---
 src/core/CMakeLists.txt                                 | 6 +++---
 src/core/EspressoSystemInterface.cpp                    | 2 +-
 src/core/EspressoSystemInterface.hpp                    | 2 +-
 src/core/EspressoSystemInterface_cuda.cu                | 4 ++--
 src/core/{ => cuda}/CudaDeviceAllocator.hpp             | 5 ++---
 src/core/{ => cuda}/CudaHostAllocator.cu                | 0
 src/core/{ => cuda}/CudaHostAllocator.hpp               | 5 ++---
 src/core/{cuda_common_cuda.cu => cuda/common_cuda.cu}   | 6 +++---
 src/core/{cuda_init.cpp => cuda/init.cpp}               | 4 ++--
 src/core/{cuda_init.hpp => cuda/init.hpp}               | 5 ++---
 src/core/{cuda_init_cuda.cu => cuda/init_cuda.cu}       | 4 ++--
 src/core/{cuda_interface.cpp => cuda/interface.cpp}     | 2 +-
 src/core/{cuda_interface.hpp => cuda/interface.hpp}     | 5 ++---
 src/core/{cuda_utils.cuh => cuda/utils.cuh}             | 8 +++-----
 src/core/{cuda_utils.hpp => cuda/utils.hpp}             | 5 ++---
 src/core/electrostatics/mmm1d_gpu_cuda.cu               | 2 +-
 src/core/electrostatics/p3m_gpu_cuda.cu                 | 4 ++--
 src/core/electrostatics/p3m_gpu_error_cuda.cu           | 2 +-
 src/core/energy.cpp                                     | 2 +-
 src/core/event.cpp                                      | 6 +++---
 src/core/magnetostatics/barnes_hut_gpu.cpp              | 4 ++--
 src/core/magnetostatics/barnes_hut_gpu_cuda.cu          | 4 ++--
 src/core/magnetostatics/dipolar_direct_sum_gpu.cpp      | 2 +-
 src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu  | 2 +-
 src/core/serialization/CUDA_particle_data.hpp           | 2 +-
 src/core/unit_tests/EspressoSystemInterface_test.cpp    | 4 ++--
 src/script_interface/system/CudaInitHandle.cpp          | 4 ++--
 27 files changed, 47 insertions(+), 54 deletions(-)
 rename src/core/{ => cuda}/CudaDeviceAllocator.hpp (96%)
 rename src/core/{ => cuda}/CudaHostAllocator.cu (100%)
 rename src/core/{ => cuda}/CudaHostAllocator.hpp (95%)
 rename src/core/{cuda_common_cuda.cu => cuda/common_cuda.cu} (99%)
 rename src/core/{cuda_init.cpp => cuda/init.cpp} (98%)
 rename src/core/{cuda_init.hpp => cuda/init.hpp} (98%)
 rename src/core/{cuda_init_cuda.cu => cuda/init_cuda.cu} (98%)
 rename src/core/{cuda_interface.cpp => cuda/interface.cpp} (99%)
 rename src/core/{cuda_interface.hpp => cuda/interface.hpp} (98%)
 rename src/core/{cuda_utils.cuh => cuda/utils.cuh} (97%)
 rename src/core/{cuda_utils.hpp => cuda/utils.hpp} (93%)

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 72830217e30..7587c362b57 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -61,10 +61,10 @@ set_target_properties(espresso_core PROPERTIES CXX_CLANG_TIDY
                                                "${ESPRESSO_CXX_CLANG_TIDY}")
 
 if(ESPRESSO_BUILD_WITH_CUDA)
-  target_sources(espresso_core PRIVATE cuda_init.cpp cuda_interface.cpp)
+  target_sources(espresso_core PRIVATE cuda/init.cpp cuda/interface.cpp)
   espresso_add_gpu_library(
-    espresso_cuda SHARED cuda_common_cuda.cu cuda_init_cuda.cu
-    CudaHostAllocator.cu magnetostatics/barnes_hut_gpu_cuda.cu
+    espresso_cuda SHARED cuda/common_cuda.cu cuda/init_cuda.cu
+    cuda/CudaHostAllocator.cu magnetostatics/barnes_hut_gpu_cuda.cu
     magnetostatics/dipolar_direct_sum_gpu_cuda.cu
     electrostatics/mmm1d_gpu_cuda.cu electrostatics/p3m_gpu_cuda.cu
     electrostatics/p3m_gpu_error_cuda.cu EspressoSystemInterface_cuda.cu)
diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp
index f6930d7e4e5..2e7e9f87d36 100644
--- a/src/core/EspressoSystemInterface.cpp
+++ b/src/core/EspressoSystemInterface.cpp
@@ -20,7 +20,7 @@
 #include "Particle.hpp"
 #include "cells.hpp"
 #include "communication.hpp"
-#include "cuda_interface.hpp"
+#include "cuda/interface.hpp"
 #include "grid.hpp"
 
 #include
diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp
index 335390f52f7..2bc7562ac2a 100644
--- a/src/core/EspressoSystemInterface.hpp
+++ b/src/core/EspressoSystemInterface.hpp
@@ -21,7 +21,7 @@
 #include "SystemInterface.hpp"
 #include "config/config.hpp"
-#include "cuda_interface.hpp"
+#include "cuda/interface.hpp"
 
 #include
diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu
index b47c7a9c639..18b559d2f93 100644
--- a/src/core/EspressoSystemInterface_cuda.cu
+++ b/src/core/EspressoSystemInterface_cuda.cu
@@ -22,8 +22,8 @@
  */
 #include "EspressoSystemInterface.hpp"
-#include "cuda_interface.hpp"
-#include "cuda_utils.cuh"
+#include "cuda/interface.hpp"
+#include "cuda/utils.cuh"
 #include "errorhandling.hpp"
 
 #include
diff --git a/src/core/CudaDeviceAllocator.hpp b/src/core/cuda/CudaDeviceAllocator.hpp
similarity index 96%
rename from src/core/CudaDeviceAllocator.hpp
rename to src/core/cuda/CudaDeviceAllocator.hpp
index 50dd4df1800..4dd30d3fd20 100644
--- a/src/core/CudaDeviceAllocator.hpp
+++ b/src/core/cuda/CudaDeviceAllocator.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program. If not, see .
  */
-#ifndef ESPRESSO_CUDA_DEVICE_ALLOCATOR_HPP
-#define ESPRESSO_CUDA_DEVICE_ALLOCATOR_HPP
+
+#pragma once
 
 #include
 
@@ -71,4 +71,3 @@ template struct CudaDeviceAllocator {
     }
   }
 };
-#endif
diff --git a/src/core/CudaHostAllocator.cu b/src/core/cuda/CudaHostAllocator.cu
similarity index 100%
rename from src/core/CudaHostAllocator.cu
rename to src/core/cuda/CudaHostAllocator.cu
diff --git a/src/core/CudaHostAllocator.hpp b/src/core/cuda/CudaHostAllocator.hpp
similarity index 95%
rename from src/core/CudaHostAllocator.hpp
rename to src/core/cuda/CudaHostAllocator.hpp
index 4de86a4d2c8..ce9cd69f1ef 100644
--- a/src/core/CudaHostAllocator.hpp
+++ b/src/core/cuda/CudaHostAllocator.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
 * along with this program. If not, see .
*/ -#ifndef ESPRESSO_CUDA_HOST_ALLOCATOR_HPP -#define ESPRESSO_CUDA_HOST_ALLOCATOR_HPP + +#pragma once #include #include @@ -62,4 +62,3 @@ template struct CudaHostAllocator { }; template using pinned_vector = std::vector>; -#endif diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda/common_cuda.cu similarity index 99% rename from src/core/cuda_common_cuda.cu rename to src/core/cuda/common_cuda.cu index 40240add32f..51d683462f9 100644 --- a/src/core/cuda_common_cuda.cu +++ b/src/core/cuda/common_cuda.cu @@ -19,10 +19,10 @@ #include "config/config.hpp" #include "ParticleRange.hpp" -#include "cuda_init.hpp" -#include "cuda_interface.hpp" -#include "cuda_utils.cuh" #include "errorhandling.hpp" +#include "init.hpp" +#include "interface.hpp" +#include "utils.cuh" #include "CudaDeviceAllocator.hpp" #include "CudaHostAllocator.hpp" diff --git a/src/core/cuda_init.cpp b/src/core/cuda/init.cpp similarity index 98% rename from src/core/cuda_init.cpp rename to src/core/cuda/init.cpp index ff472ad04d8..b64d8ef4187 100644 --- a/src/core/cuda_init.cpp +++ b/src/core/cuda/init.cpp @@ -21,8 +21,8 @@ #ifdef CUDA -#include "cuda_init.hpp" -#include "cuda_utils.hpp" +#include "init.hpp" +#include "utils.hpp" #include "communication.hpp" #include "errorhandling.hpp" diff --git a/src/core/cuda_init.hpp b/src/core/cuda/init.hpp similarity index 98% rename from src/core/cuda_init.hpp rename to src/core/cuda/init.hpp index a663224db86..9ab0d65d3bc 100644 --- a/src/core/cuda_init.hpp +++ b/src/core/cuda/init.hpp @@ -16,8 +16,8 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#ifndef CORE_CUDA_INIT_H -#define CORE_CUDA_INIT_H + +#pragma once #include "config/config.hpp" @@ -110,4 +110,3 @@ std::vector cuda_gather_gpus(); EspressoGpuDevice cuda_get_device_props(int dev); #endif // ifdef CUDA -#endif diff --git a/src/core/cuda_init_cuda.cu b/src/core/cuda/init_cuda.cu similarity index 98% rename from src/core/cuda_init_cuda.cu rename to src/core/cuda/init_cuda.cu index f1d8f608fe6..048e62cf7ed 100644 --- a/src/core/cuda_init_cuda.cu +++ b/src/core/cuda/init_cuda.cu @@ -19,8 +19,8 @@ #include -#include "cuda_init.hpp" -#include "cuda_utils.cuh" +#include "init.hpp" +#include "utils.cuh" #include diff --git a/src/core/cuda_interface.cpp b/src/core/cuda/interface.cpp similarity index 99% rename from src/core/cuda_interface.cpp rename to src/core/cuda/interface.cpp index 9fe7ae4ec38..08ebec7a16f 100644 --- a/src/core/cuda_interface.cpp +++ b/src/core/cuda/interface.cpp @@ -17,7 +17,7 @@ * along with this program. If not, see . */ -#include "cuda_interface.hpp" +#include "interface.hpp" #ifdef CUDA diff --git a/src/core/cuda_interface.hpp b/src/core/cuda/interface.hpp similarity index 98% rename from src/core/cuda_interface.hpp rename to src/core/cuda/interface.hpp index f79cf46af9e..f4c05732a7b 100644 --- a/src/core/cuda_interface.hpp +++ b/src/core/cuda/interface.hpp @@ -16,8 +16,8 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -#ifndef CORE_CUDA_INTERFACE_HPP -#define CORE_CUDA_INTERFACE_HPP + +#pragma once #include "config/config.hpp" @@ -141,4 +141,3 @@ void cuda_mpi_send_forces(const ParticleRange &particles, void cuda_bcast_global_part_params(); #endif // CUDA -#endif diff --git a/src/core/cuda_utils.cuh b/src/core/cuda/utils.cuh similarity index 97% rename from src/core/cuda_utils.cuh rename to src/core/cuda/utils.cuh index 6bdafb96a6e..ca19fa0ad4c 100644 --- a/src/core/cuda_utils.cuh +++ b/src/core/cuda/utils.cuh @@ -16,14 +16,14 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#ifndef CORE_CUDA_UTILS_CUH -#define CORE_CUDA_UTILS_CUH + +#pragma once #if !defined(__CUDACC__) #error Do not include CUDA headers in normal C++-code!!! #endif -#include "cuda_utils.hpp" +#include "utils.hpp" #include @@ -80,5 +80,3 @@ void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, #define KERNELCALL(_function, _grid, _block, ...) \ KERNELCALL_shared(_function, _grid, _block, 0, ##__VA_ARGS__) - -#endif diff --git a/src/core/cuda_utils.hpp b/src/core/cuda/utils.hpp similarity index 93% rename from src/core/cuda_utils.hpp rename to src/core/cuda/utils.hpp index f65c92d1164..0c67abd7b31 100644 --- a/src/core/cuda_utils.hpp +++ b/src/core/cuda/utils.hpp @@ -16,8 +16,8 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#ifndef CORE_CUDA_UTILS_HPP -#define CORE_CUDA_UTILS_HPP + +#pragma once #include "config/config.hpp" @@ -32,4 +32,3 @@ class cuda_runtime_error : public std::runtime_error { }; #endif // CUDA -#endif diff --git a/src/core/electrostatics/mmm1d_gpu_cuda.cu b/src/core/electrostatics/mmm1d_gpu_cuda.cu index eba82a418da..82ca9bc87dc 100644 --- a/src/core/electrostatics/mmm1d_gpu_cuda.cu +++ b/src/core/electrostatics/mmm1d_gpu_cuda.cu @@ -31,7 +31,7 @@ #include "electrostatics/specfunc.cuh" #include "EspressoSystemInterface.hpp" -#include "cuda_utils.cuh" +#include "cuda/utils.cuh" #include #include diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index 3a7c005e047..68377e3d749 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -53,8 +53,8 @@ #include "electrostatics/p3m_gpu_cuda.cuh" #include "EspressoSystemInterface.hpp" -#include "cuda_interface.hpp" -#include "cuda_utils.cuh" +#include "cuda/interface.hpp" +#include "cuda/utils.cuh" #include #include diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu index 047efbe59ed..c279fe54625 100644 --- a/src/core/electrostatics/p3m_gpu_error_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu @@ -25,7 +25,7 @@ #include "p3m_gpu_error.hpp" -#include "cuda_utils.cuh" +#include "cuda/utils.cuh" #include #include diff --git a/src/core/energy.cpp b/src/core/energy.cpp index 6edd616a38d..af29f288d5b 100644 --- a/src/core/energy.cpp +++ b/src/core/energy.cpp @@ -26,7 +26,7 @@ #include "Observable_stat.hpp" #include "communication.hpp" #include "constraints.hpp" -#include "cuda_interface.hpp" +#include "cuda/interface.hpp" #include "energy_inline.hpp" #include "event.hpp" #include "forces.hpp" diff --git a/src/core/event.cpp b/src/core/event.cpp index 18dc0ef34bc..fa9c2b25400 100644 --- a/src/core/event.cpp +++ b/src/core/event.cpp @@ -31,9 +31,9 @@ #include "collision.hpp" #include "communication.hpp" #include "config/config.hpp" -#include "cuda_init.hpp" -#include 
"cuda_interface.hpp" -#include "cuda_utils.hpp" +#include "cuda/init.hpp" +#include "cuda/interface.hpp" +#include "cuda/utils.hpp" #include "electrostatics/coulomb.hpp" #include "electrostatics/icc.hpp" #include "errorhandling.hpp" diff --git a/src/core/magnetostatics/barnes_hut_gpu.cpp b/src/core/magnetostatics/barnes_hut_gpu.cpp index 09726700e0d..efc1d0dae02 100644 --- a/src/core/magnetostatics/barnes_hut_gpu.cpp +++ b/src/core/magnetostatics/barnes_hut_gpu.cpp @@ -26,8 +26,8 @@ #include "EspressoSystemInterface.hpp" #include "communication.hpp" -#include "cuda_interface.hpp" -#include "cuda_utils.hpp" +#include "cuda/interface.hpp" +#include "cuda/utils.hpp" #include "errorhandling.hpp" DipolarBarnesHutGpu::DipolarBarnesHutGpu(double prefactor, double epssq, diff --git a/src/core/magnetostatics/barnes_hut_gpu_cuda.cu b/src/core/magnetostatics/barnes_hut_gpu_cuda.cu index 2955dea44ba..e797079624f 100644 --- a/src/core/magnetostatics/barnes_hut_gpu_cuda.cu +++ b/src/core/magnetostatics/barnes_hut_gpu_cuda.cu @@ -28,8 +28,8 @@ #include "magnetostatics/barnes_hut_gpu_cuda.cuh" -#include "cuda_init.hpp" -#include "cuda_utils.cuh" +#include "cuda/init.hpp" +#include "cuda/utils.cuh" #include #include diff --git a/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp b/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp index 1ea6520b133..ef04ac0b633 100644 --- a/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp +++ b/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp @@ -26,7 +26,7 @@ #include "EspressoSystemInterface.hpp" #include "communication.hpp" -#include "cuda_interface.hpp" +#include "cuda/interface.hpp" #include "grid.hpp" static void get_simulation_box(float *box, int *per) { diff --git a/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu b/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu index b8818997fa8..3f6c58f94c8 100644 --- a/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu +++ b/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu @@ -23,7 +23,7 @@ #include "magnetostatics/dipolar_direct_sum_gpu_cuda.cuh" -#include "cuda_utils.cuh" +#include "cuda/utils.cuh" #include #include diff --git a/src/core/serialization/CUDA_particle_data.hpp b/src/core/serialization/CUDA_particle_data.hpp index fef5a94d980..1ea146f0734 100644 --- a/src/core/serialization/CUDA_particle_data.hpp +++ b/src/core/serialization/CUDA_particle_data.hpp @@ -19,7 +19,7 @@ #ifndef UTILS_SERIALIZATION_CUDA_PARTICLE_DATA_HPP #define UTILS_SERIALIZATION_CUDA_PARTICLE_DATA_HPP -#include "cuda_interface.hpp" +#include "cuda/interface.hpp" #include #include diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp index f2d97586dac..564ec771e52 100644 --- a/src/core/unit_tests/EspressoSystemInterface_test.cpp +++ b/src/core/unit_tests/EspressoSystemInterface_test.cpp @@ -53,8 +53,8 @@ inline void check_uninitialized_device_pointers() { #ifdef CUDA -#include "cuda_init.hpp" -#include "cuda_utils.hpp" +#include "cuda/init.hpp" +#include "cuda/utils.hpp" /* Decorator to run a unit test depending on GPU availability. 
*/ boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { diff --git a/src/script_interface/system/CudaInitHandle.cpp b/src/script_interface/system/CudaInitHandle.cpp index 2762e62b70c..98472a976d6 100644 --- a/src/script_interface/system/CudaInitHandle.cpp +++ b/src/script_interface/system/CudaInitHandle.cpp @@ -21,8 +21,8 @@ #include "config/config.hpp" -#include "core/cuda_init.hpp" -#include "core/cuda_utils.hpp" +#include "core/cuda/init.hpp" +#include "core/cuda/utils.hpp" #include #include From 53926ee3f7d8fb50a850c8018b6846cf900c8b0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Tue, 6 Jun 2023 23:36:12 +0200 Subject: [PATCH 2/5] core: Cleanup CUDA source files Remove unreachable code. Include what you use. --- src/core/cuda/CudaDeviceAllocator.hpp | 3 -- src/core/cuda/CudaHostAllocator.cu | 4 +- src/core/cuda/common_cuda.cu | 4 +- src/core/cuda/init.cpp | 2 - src/core/cuda/interface.cpp | 30 ------------- src/core/cuda/interface.hpp | 44 ------------------- src/core/cuda/utils.cuh | 4 -- .../ibm_cuda_particle_velocities_input.hpp | 42 ------------------ 8 files changed, 2 insertions(+), 131 deletions(-) delete mode 100644 src/core/serialization/ibm_cuda_particle_velocities_input.hpp diff --git a/src/core/cuda/CudaDeviceAllocator.hpp b/src/core/cuda/CudaDeviceAllocator.hpp index 4dd30d3fd20..2b1517e2734 100644 --- a/src/core/cuda/CudaDeviceAllocator.hpp +++ b/src/core/cuda/CudaDeviceAllocator.hpp @@ -19,8 +19,6 @@ #pragma once -#include - #ifndef __CUDACC__ #error "This header can only be used with a CUDA-capable compiler." #endif @@ -67,7 +65,6 @@ template struct CudaDeviceAllocator { try { thrust::device_free(p); } catch (thrust::system::system_error const &) { - ; } } }; diff --git a/src/core/cuda/CudaHostAllocator.cu b/src/core/cuda/CudaHostAllocator.cu index b45beeecce3..4ad8a8a42b6 100644 --- a/src/core/cuda/CudaHostAllocator.cu +++ b/src/core/cuda/CudaHostAllocator.cu @@ -18,10 +18,8 @@ */ #include "CudaHostAllocator.hpp" -#include - #include -#include +#include void cuda_malloc_host(void **p, std::size_t n) { cudaError_t error = cudaMallocHost(p, n); diff --git a/src/core/cuda/common_cuda.cu b/src/core/cuda/common_cuda.cu index 51d683462f9..b0ad028c4bf 100644 --- a/src/core/cuda/common_cuda.cu +++ b/src/core/cuda/common_cuda.cu @@ -27,12 +27,10 @@ #include "CudaDeviceAllocator.hpp" #include "CudaHostAllocator.hpp" -#include +#include #include -#include - #include #include diff --git a/src/core/cuda/init.cpp b/src/core/cuda/init.cpp index b64d8ef4187..5aca865c74d 100644 --- a/src/core/cuda/init.cpp +++ b/src/core/cuda/init.cpp @@ -25,11 +25,9 @@ #include "utils.hpp" #include "communication.hpp" -#include "errorhandling.hpp" #include -#include #include #include #include diff --git a/src/core/cuda/interface.cpp b/src/core/cuda/interface.cpp index 08ebec7a16f..b7df6482e6d 100644 --- a/src/core/cuda/interface.cpp +++ b/src/core/cuda/interface.cpp @@ -41,43 +41,13 @@ static void pack_particles(ParticleRange particles, int i = 0; for (auto const &part : particles) { buffer[i].p = static_cast(folded_position(part.pos(), box_geo)); - buffer[i].identity = part.id(); - buffer[i].v = static_cast(part.v()); -#ifdef VIRTUAL_SITES - buffer[i].is_virtual = part.is_virtual(); -#endif - #ifdef DIPOLES buffer[i].dip = static_cast(part.calc_dip()); #endif - -#ifdef LB_ELECTROHYDRODYNAMICS - buffer[i].mu_E = static_cast(part.mu_E()); -#endif - #ifdef ELECTROSTATICS buffer[i].q = static_cast(part.q()); #endif - -#ifdef MASS - buffer[i].mass = 
static_cast(part.mass()); -#endif - -#ifdef ROTATION - buffer[i].director = static_cast(part.calc_director()); -#endif - -#ifdef ENGINE - buffer[i].swim.v_swim = static_cast(part.swimming().v_swim); - buffer[i].swim.f_swim = static_cast(part.swimming().f_swim); - buffer[i].swim.director = buffer[i].director; - - buffer[i].swim.push_pull = part.swimming().push_pull; - buffer[i].swim.dipole_length = - static_cast(part.swimming().dipole_length); - buffer[i].swim.swimming = part.swimming().swimming; -#endif i++; } } diff --git a/src/core/cuda/interface.hpp b/src/core/cuda/interface.hpp index f4c05732a7b..cb6433dc678 100644 --- a/src/core/cuda/interface.hpp +++ b/src/core/cuda/interface.hpp @@ -29,59 +29,15 @@ #include #include -// Parameters for swimmers -#ifdef ENGINE -struct CUDA_ParticleParametersSwimming { - using Vector3f = Utils::Vector3f; - - // v_cs has to stay in the front for memmove reasons - float v_cs[6]; - float v_swim; - float f_swim; - Vector3f director; - int push_pull; - float dipole_length; - bool swimming; -}; -#endif - /** data structure which must be copied to the GPU at each step run on the GPU */ struct CUDA_particle_data { using Vector3f = Utils::Vector3f; -#ifdef ENGINE - // This has to stay in front of the struct for memmove reasons - CUDA_ParticleParametersSwimming swim; -#endif - Vector3f p; - int identity; - -#ifdef VIRTUAL_SITES - bool is_virtual; -#else - static constexpr const bool is_virtual = false; -#endif - - Vector3f v; - -#ifdef ROTATION - Vector3f director; -#endif - -#ifdef LB_ELECTROHYDRODYNAMICS - Vector3f mu_E; -#endif - #ifdef ELECTROSTATICS float q; #endif - -#ifdef MASS - float mass; -#endif - #ifdef DIPOLES Vector3f dip; #endif diff --git a/src/core/cuda/utils.cuh b/src/core/cuda/utils.cuh index ca19fa0ad4c..d699a6fe13c 100644 --- a/src/core/cuda/utils.cuh +++ b/src/core/cuda/utils.cuh @@ -74,9 +74,5 @@ void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, _function<<<_grid, _block, _stream, stream[0]>>>(__VA_ARGS__); \ cuda_check_errors_exit(_grid, _block, #_function, __FILE__, __LINE__); -#define KERNELCALL_stream(_function, _grid, _block, _stream, ...) \ - _function<<<_grid, _block, 0, _stream>>>(__VA_ARGS__); \ - cuda_check_errors_exit(_grid, _block, #_function, __FILE__, __LINE__); - #define KERNELCALL(_function, _grid, _block, ...) \ KERNELCALL_shared(_function, _grid, _block, 0, ##__VA_ARGS__) diff --git a/src/core/serialization/ibm_cuda_particle_velocities_input.hpp b/src/core/serialization/ibm_cuda_particle_velocities_input.hpp deleted file mode 100644 index 26d4ef6f685..00000000000 --- a/src/core/serialization/ibm_cuda_particle_velocities_input.hpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ -#ifndef CORE_UTILS_SERIALIZATION_IBM_CUDA_PARTICLE_DATA_INPUT_HPP -#define CORE_UTILS_SERIALIZATION_IBM_CUDA_PARTICLE_DATA_INPUT_HPP - -#include "virtual_sites/lb_inertialess_tracers_cuda_interface.hpp" - -namespace boost { -namespace serialization { -template -void serialize(Archive &ar, IBM_CUDA_ParticleDataInput &d, - const unsigned int /* version */) { - ar &d.pos; - ar &d.f; - ar &d.is_virtual; -} - -template -void serialize(Archive &ar, IBM_CUDA_ParticleDataOutput &d, - const unsigned int /* version */) { - ar &d.v; -} -} // namespace serialization -} // namespace boost - -#endif From 51e6c2775c8415e7b49f9585d8dda729d4e2c0cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Wed, 7 Jun 2023 21:12:09 +0200 Subject: [PATCH 3/5] core: Rewrite particle data management on the GPU --- src/core/CMakeLists.txt | 8 +- src/core/EspressoSystemInterface.cpp | 61 --- src/core/EspressoSystemInterface.hpp | 157 ------- src/core/EspressoSystemInterface_cuda.cu | 149 ------- src/core/SystemInterface.hpp | 106 ----- src/core/cuda/common_cuda.cu | 176 +------- src/core/cuda/interface.cpp | 133 ------ src/core/cuda/interface.hpp | 99 ----- src/core/electrostatics/mmm1d_gpu.cpp | 13 +- src/core/electrostatics/mmm1d_gpu.hpp | 4 - src/core/electrostatics/mmm1d_gpu_cuda.cu | 108 ++--- src/core/electrostatics/p3m_gpu.cpp | 9 +- src/core/electrostatics/p3m_gpu_cuda.cu | 151 +++---- src/core/energy.cpp | 19 +- src/core/event.cpp | 6 - src/core/forces.cpp | 11 +- src/core/magnetostatics/barnes_hut_gpu.cpp | 36 +- .../magnetostatics/dipolar_direct_sum_gpu.cpp | 43 +- src/core/serialization/CUDA_particle_data.hpp | 54 --- src/core/system/GpuParticleData.cpp | 174 ++++++++ src/core/system/GpuParticleData.hpp | 133 ++++++ src/core/system/GpuParticleData_cuda.cu | 399 ++++++++++++++++++ .../System.cpp} | 13 +- src/core/system/System.hpp | 47 +++ src/core/unit_tests/CMakeLists.txt | 5 +- .../EspressoSystemInterface_test.cpp | 186 -------- src/core/unit_tests/EspressoSystem_test.cpp | 160 +++++++ 27 files changed, 1140 insertions(+), 1320 deletions(-) delete mode 100644 src/core/EspressoSystemInterface.cpp delete mode 100644 src/core/EspressoSystemInterface.hpp delete mode 100644 src/core/EspressoSystemInterface_cuda.cu delete mode 100644 src/core/SystemInterface.hpp delete mode 100644 src/core/cuda/interface.cpp delete mode 100644 src/core/cuda/interface.hpp delete mode 100644 src/core/serialization/CUDA_particle_data.hpp create mode 100644 src/core/system/GpuParticleData.cpp create mode 100644 src/core/system/GpuParticleData.hpp create mode 100644 src/core/system/GpuParticleData_cuda.cu rename src/core/{SystemInterface.cpp => system/System.cpp} (75%) create mode 100644 src/core/system/System.hpp delete mode 100644 src/core/unit_tests/EspressoSystemInterface_test.cpp create mode 100644 src/core/unit_tests/EspressoSystem_test.cpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 7587c362b57..45d3942899d 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -30,7 +30,6 @@ add_library( dpd.cpp energy.cpp errorhandling.cpp - EspressoSystemInterface.cpp forcecap.cpp forces.cpp ghosts.cpp @@ -48,11 +47,12 @@ add_library( rotate_system.cpp rotation.cpp Observable_stat.cpp - SystemInterface.cpp thermostat.cpp tuning.cpp virtual_sites.cpp exclusions.cpp + system/GpuParticleData.cpp + system/System.cpp PartCfg.cpp EspressoSystemStandAlone.cpp TabulatedPotential.cpp) @@ -61,13 +61,13 @@ set_target_properties(espresso_core PROPERTIES CXX_CLANG_TIDY 
"${ESPRESSO_CXX_CLANG_TIDY}") if(ESPRESSO_BUILD_WITH_CUDA) - target_sources(espresso_core PRIVATE cuda/init.cpp cuda/interface.cpp) + target_sources(espresso_core PRIVATE cuda/init.cpp) espresso_add_gpu_library( espresso_cuda SHARED cuda/common_cuda.cu cuda/init_cuda.cu cuda/CudaHostAllocator.cu magnetostatics/barnes_hut_gpu_cuda.cu magnetostatics/dipolar_direct_sum_gpu_cuda.cu electrostatics/mmm1d_gpu_cuda.cu electrostatics/p3m_gpu_cuda.cu - electrostatics/p3m_gpu_error_cuda.cu EspressoSystemInterface_cuda.cu) + electrostatics/p3m_gpu_error_cuda.cu system/GpuParticleData_cuda.cu) add_library(espresso::cuda ALIAS espresso_cuda) target_link_libraries( espresso_cuda PRIVATE CUDA::cuda_driver CUDA::cudart CUDA::cufft diff --git a/src/core/EspressoSystemInterface.cpp b/src/core/EspressoSystemInterface.cpp deleted file mode 100644 index 2e7e9f87d36..00000000000 --- a/src/core/EspressoSystemInterface.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#include "EspressoSystemInterface.hpp" -#include "Particle.hpp" -#include "cells.hpp" -#include "communication.hpp" -#include "cuda/interface.hpp" -#include "grid.hpp" - -#include - -EspressoSystemInterface *EspressoSystemInterface::m_instance = nullptr; - -void EspressoSystemInterface::gatherParticles() { -#ifdef CUDA - if (m_gpu) { - if (gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - copy_part_data_to_gpu(cell_structure.local_particles(), this_node); - reallocDeviceMemory(gpu_get_particle_pointer().size()); - if (m_splitParticleStructGpu && (this_node == 0)) - split_particle_struct(); - } - } -#endif -} - -#ifdef CUDA -void EspressoSystemInterface::enableParticleCommunication() { - if (m_gpu) { - if (!gpu_get_global_particle_vars_pointer_host()->communication_enabled) { - gpu_init_particle_comm(this_node); - cuda_bcast_global_part_params(); - reallocDeviceMemory(gpu_get_particle_pointer().size()); - } - } -} -#endif - -void EspressoSystemInterface::init() { gatherParticles(); } - -void EspressoSystemInterface::update() { gatherParticles(); } - -Utils::Vector3d EspressoSystemInterface::box() const { - return box_geo.length(); -} diff --git a/src/core/EspressoSystemInterface.hpp b/src/core/EspressoSystemInterface.hpp deleted file mode 100644 index 2bc7562ac2a..00000000000 --- a/src/core/EspressoSystemInterface.hpp +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef ESPRESSOSYSTEMINTERFACE_H -#define ESPRESSOSYSTEMINTERFACE_H - -#include "SystemInterface.hpp" -#include "config/config.hpp" -#include "cuda/interface.hpp" - -#include - -#include - -/** - * @brief CUDA implementation of @ref SystemInterface. - * - * When data is synchronized between host and device memory, a subset - * of the @ref Particle struct is copied from each particle on the host - * to the corresponding @ref CUDA_particle_data struct on the device via - * @ref EspressoSystemInterface::gatherParticles(). Once the transfer is - * complete, the particle AoS on the device is copied (or "split") to - * a SoA via @ref EspressoSystemInterface::split_particle_struct(). - */ -class EspressoSystemInterface : public SystemInterface { -public: - EspressoSystemInterface() = default; - ~EspressoSystemInterface() override = default; - - static EspressoSystemInterface &Instance() { - if (!m_instance) - m_instance = new EspressoSystemInterface; - - return *m_instance; - } - - void init() override; - void update() override; - -#ifdef CUDA - float *rGpuBegin() override { return m_r_gpu_begin; } - bool hasRGpu() override { return true; } - void requestRGpu() override { - m_needsRGpu = hasRGpu(); - m_splitParticleStructGpu |= m_needsRGpu; - m_gpu |= m_needsRGpu; - enableParticleCommunication(); - } - -#ifdef DIPOLES - float *dipGpuBegin() override { return m_dip_gpu_begin; } - bool hasDipGpu() override { return true; } - void requestDipGpu() override { - m_needsDipGpu = hasDipGpu(); - m_splitParticleStructGpu |= m_needsRGpu; - m_gpu |= m_needsRGpu; - enableParticleCommunication(); - } -#endif - -#ifdef ELECTROSTATICS - float *qGpuBegin() override { return m_q_gpu_begin; } - bool hasQGpu() override { return true; } - void requestQGpu() override { - m_needsQGpu = hasQGpu(); - m_splitParticleStructGpu |= m_needsQGpu; - m_gpu |= m_needsQGpu; - enableParticleCommunication(); - } -#endif - - void requestParticleStructGpu() { - m_needsParticleStructGpu = true; - m_gpu |= m_needsParticleStructGpu; - enableParticleCommunication(); - } - - float *fGpuBegin() override { return gpu_get_particle_force_pointer(); } - bool hasFGpu() override { return true; } - void requestFGpu() override { - m_needsFGpu = hasFGpu(); - m_gpu |= m_needsFGpu; - enableParticleCommunication(); - } - -#ifdef ROTATION - float *torqueGpuBegin() override { return gpu_get_particle_torque_pointer(); } - bool hasTorqueGpu() override { return true; } - void requestTorqueGpu() override { - m_needsTorqueGpu = hasTorqueGpu(); - m_gpu |= m_needsTorqueGpu; - enableParticleCommunication(); - } -#endif - - float *eGpu() override { - // cast pointer from struct of floats to array of floats - // https://stackoverflow.com/a/29278260 - return reinterpret_cast(gpu_get_energy_pointer()); - } - -#endif // ifdef CUDA - - Utils::Vector3d box() const override; - - std::size_t npart_gpu() const override { -#ifdef CUDA - return gpu_get_particle_pointer().size(); -#else - return 0; -#endif - } - -protected: - static EspressoSystemInterface *m_instance; - - void gatherParticles(); - void split_particle_struct(); -#ifdef CUDA - void enableParticleCommunication(); - void 
reallocDeviceMemory(std::size_t n); - -private: - std::size_t m_gpu_npart = 0; - bool m_gpu = false; - - float *m_r_gpu_begin = nullptr; - float *m_dip_gpu_begin = nullptr; - float *m_q_gpu_begin = nullptr; - - bool m_needsParticleStructGpu = false; - bool m_splitParticleStructGpu = false; - - bool m_needsRGpu = false; - bool m_needsQGpu = false; - bool m_needsFGpu = false; - bool m_needsDipGpu = false; - bool m_needsTorqueGpu = false; -#endif -}; - -#endif diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu deleted file mode 100644 index 18b559d2f93..00000000000 --- a/src/core/EspressoSystemInterface_cuda.cu +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -/** - * @file - * CUDA kernels to convert the particles AoS to a SoA on the device. - */ - -#include "EspressoSystemInterface.hpp" -#include "cuda/interface.hpp" -#include "cuda/utils.cuh" -#include "errorhandling.hpp" - -#include - -#include - -#if defined(OMPI_MPI_H) || defined(_MPI_H) -#error CU-file includes mpi.h! This should not happen! -#endif - -// Position and charge -__global__ void split_kernel_rq(CUDA_particle_data *particles, float *r, - float *q, unsigned int n) { - auto const idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - r[3 * idx + 0] = p.p[0]; - r[3 * idx + 1] = p.p[1]; - r[3 * idx + 2] = p.p[2]; -#ifdef ELECTROSTATICS - q[idx] = p.q; -#endif -} - -// Charge only -__global__ void split_kernel_q(CUDA_particle_data *particles, float *q, - unsigned int n) { - auto const idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - -#ifdef ELECTROSTATICS - CUDA_particle_data p = particles[idx]; - - q[idx] = p.q; -#endif -} - -// Position only -__global__ void split_kernel_r(CUDA_particle_data *particles, float *r, - unsigned int n) { - auto idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - idx *= 3; - - r[idx + 0] = p.p[0]; - r[idx + 1] = p.p[1]; - r[idx + 2] = p.p[2]; -} - -#ifdef DIPOLES -// Dipole moment -__global__ void split_kernel_dip(CUDA_particle_data *particles, float *dip, - unsigned int n) { - auto idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx >= n) - return; - - CUDA_particle_data p = particles[idx]; - - idx *= 3; - - dip[idx + 0] = p.dip[0]; - dip[idx + 1] = p.dip[1]; - dip[idx + 2] = p.dip[2]; -} -#endif - -void EspressoSystemInterface::reallocDeviceMemory(std::size_t n) { - if (m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == nullptr))) { - if (m_r_gpu_begin != nullptr) - cuda_safe_mem(cudaFree(m_r_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_r_gpu_begin, 3 * n * sizeof(float))); - } -#ifdef DIPOLES - if (m_needsDipGpu && ((n != m_gpu_npart) || (m_dip_gpu_begin == nullptr))) { - if (m_dip_gpu_begin != nullptr) - 
cuda_safe_mem(cudaFree(m_dip_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_dip_gpu_begin, 3 * n * sizeof(float))); - } -#endif -#ifdef ELECTROSTATICS - if (m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == nullptr))) { - if (m_q_gpu_begin != nullptr) - cuda_safe_mem(cudaFree(m_q_gpu_begin)); - cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3 * n * sizeof(float))); - } -#endif - - m_gpu_npart = n; -} - -void EspressoSystemInterface::split_particle_struct() { - auto const device_particles = gpu_get_particle_pointer(); - auto const n = static_cast(device_particles.size()); - if (n == 0) - return; - - dim3 grid(n / 512 + 1, 1, 1); - dim3 block(512, 1, 1); - - if (m_needsQGpu && m_needsRGpu) - split_kernel_rq<<>>( - device_particles.data(), m_r_gpu_begin, m_q_gpu_begin, n); - else if (m_needsQGpu) - split_kernel_q<<>>( - device_particles.data(), m_q_gpu_begin, n); - else if (m_needsRGpu) - split_kernel_r<<>>( - device_particles.data(), m_r_gpu_begin, n); -#ifdef DIPOLES - if (m_needsDipGpu) - split_kernel_dip<<>>( - device_particles.data(), m_dip_gpu_begin, n); -#endif -} diff --git a/src/core/SystemInterface.hpp b/src/core/SystemInterface.hpp deleted file mode 100644 index 1b68444ba60..00000000000 --- a/src/core/SystemInterface.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef SYSTEMINTERFACE_H -#define SYSTEMINTERFACE_H - -#include "config/config.hpp" - -#include - -#include -#include -#include - -/** - * Public interface for the ESPResSo system and device memory management. - * - * This interface is responsible for device memory management and memory - * layout conversion (AoS to SoA conversion). On the CPU, particle data - * is stored in an Array of Structures (AoS). On the GPU, some algorithms - * have better performance with a Structure of Arrays (SoA). When data - * is synchronized between host and device memory, particle objects are - * copied to device memory in the form of an AoS. A few particle properties - * are further copied ("split") into a SoA by the split callback. - * - * This interface provides getters in the form xGpuBegin() and - * xGpuEnd() (with @c x a particle property, e.g. @c r for position) - * that return pointers to device memory for each array in the SoA. - * To register a new particle property in the split callback, call the - * corresponding requestXGpu() method, with @c X the uppercase - * version of the particle property @c x. It is currently impossible to - * undo this action. - * - * Since properties in the @ref Particle struct depend on the config file, - * it is necessary to provide a means to check if it is even possible to - * carry out the split. This is achieved by the hasXGpu() methods, - * whose return value must be checked at the call site to raise a runtime - * error. 
This is important in MPI callbacks, since they have limited support - * for standard exceptions. Methods requestXGpu() will throw an - * exception if the particle property cannot be split on device memory. - */ -class SystemInterface { -public: - SystemInterface() = default; - virtual ~SystemInterface() = default; - - virtual void init() = 0; - virtual void update() = 0; - - virtual float *rGpuBegin() { return nullptr; } - virtual bool hasRGpu() { return false; } - virtual void requestRGpu() { - throw std::runtime_error(error_message("positions")); - } - - virtual float *dipGpuBegin() { return nullptr; } - virtual bool hasDipGpu() { return false; } - virtual void requestDipGpu() { - throw std::runtime_error(error_message("dipoles")); - } - - virtual float *torqueGpuBegin() { return nullptr; } - virtual bool hasTorqueGpu() { return false; } - virtual void requestTorqueGpu() { - throw std::runtime_error(error_message("torques")); - } - - virtual float *qGpuBegin() { return nullptr; } - virtual bool hasQGpu() { return false; } - virtual void requestQGpu() { - throw std::runtime_error(error_message("charges")); - } - - virtual float *fGpuBegin() { return nullptr; } - virtual bool hasFGpu() { return false; } - virtual void requestFGpu() { - throw std::runtime_error(error_message("forces")); - } - - virtual float *eGpu() { return nullptr; } - - virtual std::size_t npart_gpu() const { return 0; } - virtual Utils::Vector3d box() const = 0; - -private: - std::string error_message(std::string const &property) const { - return "No GPU available or particle " + property + " not compiled in."; - } -}; - -#endif diff --git a/src/core/cuda/common_cuda.cu b/src/core/cuda/common_cuda.cu index b0ad028c4bf..3f7e136369a 100644 --- a/src/core/cuda/common_cuda.cu +++ b/src/core/cuda/common_cuda.cu @@ -16,54 +16,13 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#include "config/config.hpp" -#include "ParticleRange.hpp" #include "errorhandling.hpp" -#include "init.hpp" -#include "interface.hpp" -#include "utils.cuh" - -#include "CudaDeviceAllocator.hpp" -#include "CudaHostAllocator.hpp" -#include - -#include +#include "utils.cuh" -#include #include -template -using device_vector = thrust::device_vector>; - -static CUDA_global_part_vars global_part_vars_host = {}; - -template -T *raw_data_pointer(thrust::device_vector &vec) { - return thrust::raw_pointer_cast(vec.data()); -} - -template std::size_t byte_size(SpanLike const &v) { - return v.size() * sizeof(typename SpanLike::value_type); -} - -/** struct for particle force */ -static device_vector particle_forces_device; -#ifdef ROTATION -static device_vector particle_torques_device; -#endif - -/** struct for particle position and velocity */ -static device_vector particle_data_device; -/** struct for energies */ -static CUDA_energy *energy_device = nullptr; - -pinned_vector particle_data_host; -pinned_vector particle_forces_host; -pinned_vector particle_torques_host; -CUDA_energy energy_host; - cudaStream_t stream[1]; void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, @@ -80,139 +39,6 @@ void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, } } -/** - * @brief Resize a @ref device_vector. - * - * Due to a bug in thrust (https://github.com/thrust/thrust/issues/939), - * resizing or appending to default constructed containers causes undefined - * behavior by dereferencing a null-pointer for certain types. 
This - * function is used instead of the resize member function to side-step - * the problem. This is done by replacing the existing vector by a new - * one constructed with the desired size if resizing from capacity zero. - * Behaves as-if vec.resize(n) was called. - * - * @tparam T Type contained in the vector. - * @param vec Vector To resize. - * @param n Desired new size of the element. - */ -template -void resize_or_replace(device_vector &vec, std::size_t n) { - if (vec.capacity() == 0) { - vec = device_vector(n); - } else { - vec.resize(n); - } -} - -void resize_buffers(std::size_t number_of_particles) { - particle_data_host.resize(number_of_particles); - resize_or_replace(particle_data_device, number_of_particles); - - particle_forces_host.resize(3 * number_of_particles); - resize_or_replace(particle_forces_device, 3 * number_of_particles); - -#ifdef ROTATION - particle_torques_host.resize(3 * number_of_particles); - resize_or_replace(particle_torques_device, 3 * number_of_particles); -#endif -} - -/** - * @brief Setup and call particle reallocation from the host. - * Note that in addition to calling this function the parameters must be - * broadcast with either: - * 1. @ref cuda_bcast_global_part_params() (when just being executed on the - * head node) or - * 2. `MPI_Bcast(gpu_get_global_particle_vars_pointer_host(), - * sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart)` (when executed - * on all nodes) - */ -void gpu_init_particle_comm(int this_node) { - if (this_node == 0 && global_part_vars_host.communication_enabled == 0) { - try { - cuda_check_device(); - } catch (cuda_runtime_error const &err) { - fprintf(stderr, "ERROR: %s\n", err.what()); - errexit(); - } - } - global_part_vars_host.communication_enabled = 1; -} - -Utils::Span gpu_get_particle_pointer() { - return {raw_data_pointer(particle_data_device), particle_data_device.size()}; -} -CUDA_global_part_vars *gpu_get_global_particle_vars_pointer_host() { - return &global_part_vars_host; -} -float *gpu_get_particle_force_pointer() { - return raw_data_pointer(particle_forces_device); -} -#ifdef ROTATION -float *gpu_get_particle_torque_pointer() { - return raw_data_pointer(particle_torques_device); -} -#endif -CUDA_energy *gpu_get_energy_pointer() { return energy_device; } - -void copy_part_data_to_gpu(ParticleRange particles, int this_node) { - if (global_part_vars_host.communication_enabled == 1) { - cuda_mpi_get_particles(particles, particle_data_host); - - resize_buffers(particle_data_host.size()); - - /* get espressomd particle values */ - if (this_node == 0) { - cudaMemsetAsync(raw_data_pointer(particle_forces_device), 0x0, - byte_size(particle_forces_device), stream[0]); -#ifdef ROTATION - cudaMemsetAsync(raw_data_pointer(particle_torques_device), 0x0, - byte_size(particle_torques_device), stream[0]); -#endif - cudaMemcpyAsync(raw_data_pointer(particle_data_device), - particle_data_host.data(), byte_size(particle_data_host), - cudaMemcpyHostToDevice, stream[0]); - } - } -} - -/** setup and call kernel to copy particle forces to host - */ -void copy_forces_from_GPU(ParticleRange &particles, int this_node) { - if (global_part_vars_host.communication_enabled == 1) { - /* Copy result from device memory to host memory*/ - if (this_node == 0 && (not particle_forces_device.empty())) { - thrust::copy(particle_forces_device.begin(), particle_forces_device.end(), - particle_forces_host.begin()); -#ifdef ROTATION - thrust::copy(particle_torques_device.begin(), - particle_torques_device.end(), - 
particle_torques_host.begin()); -#endif - } - - cuda_mpi_send_forces( - particles, {particle_forces_host.data(), particle_forces_host.size()}, - {particle_torques_host.data(), particle_torques_host.size()}); - } -} - -void clear_energy_on_GPU() { - if (!global_part_vars_host.communication_enabled) - return; - if (energy_device == nullptr) - cuda_safe_mem(cudaMalloc((void **)&energy_device, sizeof(CUDA_energy))); - cuda_safe_mem(cudaMemset(energy_device, 0, sizeof(CUDA_energy))); -} - -CUDA_energy copy_energy_from_GPU() { - if (!global_part_vars_host.communication_enabled) - return {}; - cuda_safe_mem(cudaMemcpy(&energy_host, energy_device, sizeof(CUDA_energy), - cudaMemcpyDeviceToHost)); - return energy_host; -} - void cuda_safe_mem_exit(cudaError_t CU_err, const char *file, unsigned int line) { if (CU_err != cudaSuccess) { diff --git a/src/core/cuda/interface.cpp b/src/core/cuda/interface.cpp deleted file mode 100644 index b7df6482e6d..00000000000 --- a/src/core/cuda/interface.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (C) 2014-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include "interface.hpp" - -#ifdef CUDA - -#include "EspressoSystemInterface.hpp" -#include "communication.hpp" -#include "config/config.hpp" -#include "grid.hpp" -#include "nonbonded_interactions/nonbonded_interaction_data.hpp" -#include "serialization/CUDA_particle_data.hpp" - -#include -#include -#include - -#include - -static void pack_particles(ParticleRange particles, - CUDA_particle_data *buffer) { - using Utils::Vector3f; - - int i = 0; - for (auto const &part : particles) { - buffer[i].p = static_cast(folded_position(part.pos(), box_geo)); - buffer[i].identity = part.id(); -#ifdef DIPOLES - buffer[i].dip = static_cast(part.calc_dip()); -#endif -#ifdef ELECTROSTATICS - buffer[i].q = static_cast(part.q()); -#endif - i++; - } -} - -void cuda_mpi_get_particles( - const ParticleRange &particles, - pinned_vector &particle_data_host) { - auto const n_part = particles.size(); - - if (this_node > 0) { - static std::vector buffer; - buffer.resize(n_part); - /* pack local parts into buffer */ - pack_particles(particles, buffer.data()); - - Utils::Mpi::gather_buffer(buffer, comm_cart); - } else { - particle_data_host.resize(n_part); - - /* Pack own particles */ - pack_particles(particles, particle_data_host.data()); - - Utils::Mpi::gather_buffer(particle_data_host, comm_cart); - } -} - -/** - * @brief Add a flat force (and torque) array to a range of particles. - * - * @param particles The particles the forces (and torques should be added to) - * @param forces The forces as flat array of size 3 * particles.size() - * @param torques The torques as flat array of size 3 * particles.size(), - * this is only touched if ROTATION is active. 
- */ -static void add_forces_and_torques(ParticleRange particles, - Utils::Span forces, - Utils::Span torques) { - unsigned int i = 0; - for (auto &p : particles) { - for (unsigned int j = 0; j < 3; j++) { - p.force()[j] += static_cast(forces[3 * i + j]); -#ifdef ROTATION - p.torque()[j] += static_cast(torques[3 * i + j]); -#endif - } - i++; - } -} - -void cuda_mpi_send_forces(const ParticleRange &particles, - Utils::Span host_forces, - Utils::Span host_torques) { - - auto const size = 3 * particles.size(); - auto const n_elements = static_cast(size); - - if (this_node > 0) { - static std::vector buffer_forces; - static std::vector buffer_torques; - - buffer_forces.resize(size); - Utils::Mpi::scatter_buffer(buffer_forces.data(), n_elements, comm_cart); -#ifdef ROTATION - buffer_torques.resize(size); - Utils::Mpi::scatter_buffer(buffer_torques.data(), n_elements, comm_cart); -#endif - add_forces_and_torques(particles, buffer_forces, buffer_torques); - } else { - Utils::Mpi::scatter_buffer(host_forces.data(), n_elements, comm_cart); -#ifdef ROTATION - Utils::Mpi::scatter_buffer(host_torques.data(), n_elements, comm_cart); -#endif - add_forces_and_torques(particles, host_forces, host_torques); - } -} - -void cuda_bcast_global_part_params() { - MPI_Bcast(gpu_get_global_particle_vars_pointer_host(), - sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart); - EspressoSystemInterface::Instance().requestParticleStructGpu(); -} - -#endif // CUDA diff --git a/src/core/cuda/interface.hpp b/src/core/cuda/interface.hpp deleted file mode 100644 index cb6433dc678..00000000000 --- a/src/core/cuda/interface.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (C) 2013-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#include "config/config.hpp" - -#ifdef CUDA - -#include "CudaHostAllocator.hpp" -#include "ParticleRange.hpp" - -#include -#include - -/** data structure which must be copied to the GPU at each step run on the GPU - */ -struct CUDA_particle_data { - using Vector3f = Utils::Vector3f; - Vector3f p; - int identity; -#ifdef ELECTROSTATICS - float q; -#endif -#ifdef DIPOLES - Vector3f dip; -#endif -}; - -/** data structure for the different kinds of energies */ -struct CUDA_energy { - float coulomb, dipolar; -}; - -/** Global variables associated with all of the particles and not with - * one individual particle. 
- */ -struct CUDA_global_part_vars { - /** Boolean flag to indicate if particle info should be communicated - * between the cpu and gpu - */ - unsigned int communication_enabled; -}; - -void copy_forces_from_GPU(ParticleRange &particles, int this_node); -CUDA_energy copy_energy_from_GPU(); -void clear_energy_on_GPU(); - -CUDA_global_part_vars *gpu_get_global_particle_vars_pointer_host(); -Utils::Span gpu_get_particle_pointer(); -float *gpu_get_particle_force_pointer(); -#ifdef ROTATION -float *gpu_get_particle_torque_pointer(); -#endif - -CUDA_energy *gpu_get_energy_pointer(); -void gpu_init_particle_comm(int this_node); - -void cuda_mpi_get_particles( - const ParticleRange &particles, - pinned_vector &particle_data_host); -void copy_part_data_to_gpu(ParticleRange particles, int this_node); - -/** - * @brief Distribute forces to the worker nodes, and add them to the particles. - * - * @param particles The particles for which the forces (and torques) should - * be added to. - * @param host_forces The forces as flat array of size 3 * particles.size(), - * only relevant on the head node. - * @param host_torques The torques as flat array of size 3 * particles.size(), - * this is only touched if ROTATION is active. Only - * relevant on the head node. - * - * This is a collective call. - */ -void cuda_mpi_send_forces(const ParticleRange &particles, - Utils::Span host_forces, - Utils::Span host_torques); - -void cuda_bcast_global_part_params(); - -#endif // CUDA diff --git a/src/core/electrostatics/mmm1d_gpu.cpp b/src/core/electrostatics/mmm1d_gpu.cpp index ffe2047fd62..1764a4a2376 100644 --- a/src/core/electrostatics/mmm1d_gpu.cpp +++ b/src/core/electrostatics/mmm1d_gpu.cpp @@ -23,11 +23,12 @@ #include "electrostatics/mmm1d_gpu.hpp" -#include "EspressoSystemInterface.hpp" #include "cell_system/CellStructureType.hpp" #include "communication.hpp" #include "event.hpp" #include "grid.hpp" +#include "system/GpuParticleData.hpp" +#include "system/System.hpp" #include @@ -51,10 +52,10 @@ CoulombMMM1DGpu::CoulombMMM1DGpu(double prefactor, double maxPWerror, throw std::domain_error("Parameter 'bessel_cutoff' must be > 0"); } - auto &system = EspressoSystemInterface::Instance(); - system.requestFGpu(); - system.requestRGpu(); - system.requestQGpu(); + auto &gpu_particle_data = System::Instance().gpu; + gpu_particle_data.enable_property(GpuParticleData::prop::force); + gpu_particle_data.enable_property(GpuParticleData::prop::pos); + gpu_particle_data.enable_property(GpuParticleData::prop::q); if (this_node == 0) { modpsi_init(); } @@ -74,7 +75,7 @@ void CoulombMMM1DGpu::sanity_checks_cell_structure() const { } void CoulombMMM1DGpu::tune() { - EspressoSystemInterface::Instance().init(); + System::Instance().gpu.init(); if (this_node == 0) { setup(); tune(maxPWerror, far_switch_radius, bessel_cutoff); diff --git a/src/core/electrostatics/mmm1d_gpu.hpp b/src/core/electrostatics/mmm1d_gpu.hpp index e524c7f1b99..a72b8516af5 100644 --- a/src/core/electrostatics/mmm1d_gpu.hpp +++ b/src/core/electrostatics/mmm1d_gpu.hpp @@ -72,10 +72,6 @@ class CoulombMMM1DGpu : public Coulomb::Actor { private: bool m_is_tuned; - // CUDA parameters - unsigned int numThreads = 64u; - unsigned int numBlocks() const; - // the box length currently set on the GPU // Needed to make sure it hasn't been modified after inter coulomb was used. 
float host_boxz = 0.f; diff --git a/src/core/electrostatics/mmm1d_gpu_cuda.cu b/src/core/electrostatics/mmm1d_gpu_cuda.cu index 82ca9bc87dc..a7c6216b14d 100644 --- a/src/core/electrostatics/mmm1d_gpu_cuda.cu +++ b/src/core/electrostatics/mmm1d_gpu_cuda.cu @@ -30,8 +30,8 @@ #include "electrostatics/mmm1d_gpu.hpp" #include "electrostatics/specfunc.cuh" -#include "EspressoSystemInterface.hpp" #include "cuda/utils.cuh" +#include "system/System.hpp" #include #include @@ -49,7 +49,8 @@ #endif // the code is mostly multi-GPU capable, but ESPResSo is not yet -constexpr int deviceCount = 1; +static constexpr int deviceCount = 1; +static constexpr unsigned int numThreads = 64u; #undef cudaSetDevice #define cudaSetDevice(d) @@ -73,8 +74,6 @@ __constant__ unsigned int device_linModPsi_offsets[2 * modpsi_order]; __constant__ unsigned int device_linModPsi_lengths[2 * modpsi_order]; __constant__ float device_linModPsi[modpsi_constant_size]; -static EspressoSystemInterface *es_system = nullptr; - __device__ float dev_mod_psi_even(int n, float x) { return evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n]], @@ -132,10 +131,24 @@ void CoulombMMM1DGpu::modpsi_init() { } } +/** @brief Get number of blocks for a given number of operations. */ +static auto numBlocksOps(std::size_t n_ops) { + auto const n_ops_per_thread = n_ops / static_cast(numThreads); + return 1u + static_cast(n_ops_per_thread); +} + +/** @brief Get number of blocks for a given number of particles. */ +static auto numBlocks(std::size_t n_part) { + auto b = numBlocksOps(Utils::sqr(n_part)); // algorithm is N-square + if (b > 65535u) + b = 65535u; + return b; +} + void CoulombMMM1DGpu::setup() { - es_system = &EspressoSystemInterface::Instance(); - auto const box_z = static_cast(es_system->box()[2]); - auto const n_part = es_system->npart_gpu(); + auto &gpu_particle_data = System::Instance().gpu; + auto const box_z = static_cast(System::Instance().box()[2]); + auto const n_part = gpu_particle_data.n_particles(); if (not m_is_tuned and n_part != 0) { set_params(box_z, prefactor, maxPWerror, far_switch_radius, bessel_cutoff); tune(maxPWerror, far_switch_radius, bessel_cutoff); @@ -173,19 +186,11 @@ void CoulombMMM1DGpu::setup() { } if (dev_energyBlocks) cudaFree(dev_energyBlocks); - cuda_safe_mem( - cudaMalloc((void **)&dev_energyBlocks, numBlocks() * sizeof(float))); + cuda_safe_mem(cudaMalloc((void **)&dev_energyBlocks, + numBlocks(n_part) * sizeof(float))); host_npart = static_cast(n_part); } -unsigned int CoulombMMM1DGpu::numBlocks() const { - auto b = 1 + static_cast(Utils::sqr(es_system->npart_gpu()) / - static_cast(numThreads)); - if (b > 65535) - b = 65535; - return b; -} - CoulombMMM1DGpu::~CoulombMMM1DGpu() { cudaFree(dev_forcePairs); } __forceinline__ __device__ float sqpow(float x) { return x * x; } @@ -331,9 +336,9 @@ void CoulombMMM1DGpu::set_params(double boxz, double prefactor, m_is_tuned = false; } -__global__ void forcesKernel(const float *__restrict__ r, - const float *__restrict__ q, - float *__restrict__ force, std::size_t N, +__global__ void forcesKernel(float const *const __restrict__ r, + float const *const __restrict__ q, + float *const __restrict__ force, std::size_t N, int pairs) { constexpr auto c_2pif = 2.f * Utils::pi(); @@ -419,9 +424,9 @@ __global__ void forcesKernel(const float *__restrict__ r, } } -__global__ void energiesKernel(const float *__restrict__ r, - const float *__restrict__ q, - float *__restrict__ energy, std::size_t N, +__global__ void energiesKernel(float const *const 
__restrict__ r, + float const *const __restrict__ q, + float *const __restrict__ energy, std::size_t N, int pairs) { constexpr auto c_2pif = 2.f * Utils::pi(); @@ -513,25 +518,26 @@ void CoulombMMM1DGpu::add_long_range_forces() { throw std::runtime_error("MMM1D was not initialized correctly"); } + auto &gpu = System::Instance().gpu; + auto const positions_device = gpu.get_particle_positions_device(); + auto const charges_device = gpu.get_particle_charges_device(); + auto const forces_device = gpu.get_particle_forces_device(); + auto const n_part = gpu.n_particles(); if (pairs) { // if we calculate force pairs, we need to reduce them to forces - auto const blocksRed = - 1 + static_cast(es_system->npart_gpu() / - static_cast(numThreads)); - KERNELCALL(forcesKernel, numBlocks(), numThreads, es_system->rGpuBegin(), - es_system->qGpuBegin(), dev_forcePairs, es_system->npart_gpu(), - pairs) - KERNELCALL(vectorReductionKernel, blocksRed, numThreads, dev_forcePairs, - es_system->fGpuBegin(), es_system->npart_gpu()) + auto const numBlocksRed = numBlocksOps(n_part); + KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device, + charges_device, dev_forcePairs, n_part, pairs) + KERNELCALL(vectorReductionKernel, numBlocksRed, numThreads, dev_forcePairs, + forces_device, n_part) } else { - KERNELCALL(forcesKernel, numBlocks(), numThreads, es_system->rGpuBegin(), - es_system->qGpuBegin(), es_system->fGpuBegin(), - es_system->npart_gpu(), pairs) + KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device, + charges_device, forces_device, n_part, pairs) } } -__global__ void scaleAndAddKernel(float *dst, float const *src, std::size_t N, - float factor) { +__global__ void scaleAndAddKernel(float *const dst, float const *const src, + std::size_t N, float factor) { for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += blockDim.x * gridDim.x) { dst[tid] += src[tid] * factor; @@ -545,17 +551,21 @@ void CoulombMMM1DGpu::add_long_range_energy() { throw std::runtime_error("MMM1D was not initialized correctly"); } + auto &gpu = System::Instance().gpu; + auto const positions_device = gpu.get_particle_positions_device(); + auto const charges_device = gpu.get_particle_charges_device(); + auto const energy_device = gpu.get_energy_device(); + auto const n_part = gpu.n_particles(); auto const shared = numThreads * static_cast(sizeof(float)); - KERNELCALL_shared(energiesKernel, numBlocks(), numThreads, shared, - es_system->rGpuBegin(), es_system->qGpuBegin(), - dev_energyBlocks, es_system->npart_gpu(), 0); + KERNELCALL_shared(energiesKernel, numBlocks(n_part), numThreads, shared, + positions_device, charges_device, dev_energyBlocks, n_part, + 0); KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks, - numBlocks()); + numBlocks(n_part)); // we count every interaction twice, so halve the total energy auto constexpr factor = 0.5f; - KERNELCALL(scaleAndAddKernel, 1, 1, - &(reinterpret_cast(es_system->eGpu())->coulomb), - &dev_energyBlocks[0], 1, factor); + KERNELCALL(scaleAndAddKernel, 1, 1, &(energy_device->coulomb), + dev_energyBlocks, 1, factor); } float CoulombMMM1DGpu::force_benchmark() { @@ -563,13 +573,17 @@ float CoulombMMM1DGpu::force_benchmark() { float elapsedTime; float *dev_f_benchmark; - cuda_safe_mem(cudaMalloc((void **)&dev_f_benchmark, - 3ul * es_system->npart_gpu() * sizeof(float))); + auto &gpu = System::Instance().gpu; + auto const positions_device = gpu.get_particle_positions_device(); + auto const charges_device = 
gpu.get_particle_charges_device(); + auto const n_part = gpu.n_particles(); + cuda_safe_mem( + cudaMalloc((void **)&dev_f_benchmark, 3ul * n_part * sizeof(float))); cuda_safe_mem(cudaEventCreate(&eventStart)); cuda_safe_mem(cudaEventCreate(&eventStop)); cuda_safe_mem(cudaEventRecord(eventStart, stream[0])); - KERNELCALL(forcesKernel, numBlocks(), numThreads, es_system->rGpuBegin(), - es_system->qGpuBegin(), dev_f_benchmark, es_system->npart_gpu(), 0) + KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device, + charges_device, dev_f_benchmark, n_part, 0) cuda_safe_mem(cudaEventRecord(eventStop, stream[0])); cuda_safe_mem(cudaEventSynchronize(eventStop)); cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop)); diff --git a/src/core/electrostatics/p3m_gpu.cpp b/src/core/electrostatics/p3m_gpu.cpp index 5922644fcdd..ff5c9bfc9d9 100644 --- a/src/core/electrostatics/p3m_gpu.cpp +++ b/src/core/electrostatics/p3m_gpu.cpp @@ -28,8 +28,9 @@ #include "actor/visitors.hpp" #include "electrostatics/coulomb.hpp" -#include "EspressoSystemInterface.hpp" #include "ParticleRange.hpp" +#include "system/GpuParticleData.hpp" +#include "system/System.hpp" #include "communication.hpp" @@ -49,8 +50,10 @@ void CoulombP3MGPU::init() { void CoulombP3MGPU::init_cpu_kernels() { CoulombP3M::init(); } void CoulombP3MGPU::request_gpu() const { - auto &system = EspressoSystemInterface::Instance(); - system.requestParticleStructGpu(); + auto &gpu_particle_data = System::Instance().gpu; + gpu_particle_data.enable_property(GpuParticleData::prop::force); + gpu_particle_data.enable_property(GpuParticleData::prop::q); + gpu_particle_data.enable_property(GpuParticleData::prop::pos); } #endif // CUDA diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index 68377e3d749..f17bd96d99d 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -52,9 +52,8 @@ #include "electrostatics/p3m_gpu_cuda.cuh" -#include "EspressoSystemInterface.hpp" -#include "cuda/interface.hpp" #include "cuda/utils.cuh" +#include "system/System.hpp" #include #include @@ -271,28 +270,28 @@ __global__ void apply_influence_function(const P3MGpuData p) { } template -__global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, - const P3MGpuData par, - const int parts_per_block) { +__global__ void assign_charge_kernel(P3MGpuData const params, + float const *const __restrict__ part_pos, + float const *const __restrict__ part_q, + int const parts_per_block) { auto const part_in_block = static_cast(threadIdx.x) / cao; auto const cao_id_x = static_cast(threadIdx.x) - part_in_block * cao; /* id of the particle */ auto const id = parts_per_block * static_cast(blockIdx.x * gridDim.y + blockIdx.y) + part_in_block; - if (id >= par.n_part) + if (id >= params.n_part) return; /* position relative to the closest gird point */ REAL_TYPE m_pos[3]; /* index of the nearest mesh point */ int nmp_x, nmp_y, nmp_z; - const CUDA_particle_data p = pdata[id]; - auto *charge_mesh = (REAL_TYPE *)par.charge_mesh; + auto *charge_mesh = (REAL_TYPE *)params.charge_mesh; - m_pos[0] = p.p[0] * par.hi[0] - par.pos_shift; - m_pos[1] = p.p[1] * par.hi[1] - par.pos_shift; - m_pos[2] = p.p[2] * par.hi[2] - par.pos_shift; + m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift; + m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift; + m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift; nmp_x = static_cast(floorf(m_pos[0] + 
0.5f)); nmp_y = static_cast(floorf(m_pos[1] + 0.5f)); @@ -302,11 +301,11 @@ __global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, m_pos[1] -= static_cast(nmp_y); m_pos[2] -= static_cast(nmp_z); - nmp_x = wrap_index(nmp_x + cao_id_x, par.mesh[0]); - nmp_y = wrap_index(nmp_y + static_cast(threadIdx.y), par.mesh[1]); - nmp_z = wrap_index(nmp_z + static_cast(threadIdx.z), par.mesh[2]); + nmp_x = wrap_index(nmp_x + cao_id_x, params.mesh[0]); + nmp_y = wrap_index(nmp_y + static_cast(threadIdx.y), params.mesh[1]); + nmp_z = wrap_index(nmp_z + static_cast(threadIdx.z), params.mesh[2]); - const int ind = linear_index_r(par, nmp_x, nmp_y, nmp_z); + auto const ind = linear_index_r(params, nmp_x, nmp_y, nmp_z); extern __shared__ float weights[]; @@ -322,30 +321,32 @@ __global__ void assign_charge_kernel(const CUDA_particle_data *const pdata, auto const c = weights[3 * offset + 3 * static_cast(cao_id_x) + 0] * weights[3 * offset + 3 * threadIdx.y + 1] * - weights[3 * offset + 3 * threadIdx.z + 2] * p.q; + weights[3 * offset + 3 * threadIdx.z + 2] * part_q[id]; atomicAdd(&(charge_mesh[ind]), c); } else { auto const c = - Utils::bspline(cao_id_x, m_pos[0]) * + Utils::bspline(cao_id_x, m_pos[0]) * part_q[id] * Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * - Utils::bspline(static_cast(threadIdx.z), m_pos[2]) * p.q; + Utils::bspline(static_cast(threadIdx.z), m_pos[2]); atomicAdd(&(charge_mesh[ind]), c); } } -void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { - auto const cao = p.cao; +void assign_charges(P3MGpuData const params, + float const *const __restrict__ part_pos, + float const *const __restrict__ part_q) { + auto const cao = params.cao; auto const cao3 = int_pow<3>(cao); int parts_per_block = 1, n_blocks = 1; while ((parts_per_block + 1) * cao3 <= 1024) { parts_per_block++; } - if ((p.n_part % parts_per_block) == 0) - n_blocks = std::max(1, p.n_part / parts_per_block); + if ((params.n_part % parts_per_block) == 0) + n_blocks = std::max(1, params.n_part / parts_per_block); else - n_blocks = p.n_part / parts_per_block + 1; + n_blocks = params.n_part / parts_per_block + 1; dim3 block(static_cast(parts_per_block * cao), static_cast(cao), static_cast(cao)); @@ -363,36 +364,36 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { switch (cao) { case 1: (assign_charge_kernel<1, false>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; case 2: (assign_charge_kernel<2, false>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; case 3: (assign_charge_kernel< 3, true>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; case 4: (assign_charge_kernel< 4, true>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; case 5: (assign_charge_kernel< 5, true>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; case 6: (assign_charge_kernel< 6, true>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; case 7: (assign_charge_kernel< 7, true>)<<>>( - pdata, p, parts_per_block); + params, part_pos, part_q, parts_per_block); break; default: break; @@ -401,9 +402,10 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) { } template -__global__ void assign_forces_kernel(const CUDA_particle_data *const pdata, - const P3MGpuData par, - float *particle_force_gpu, +__global__ void 
assign_forces_kernel(P3MGpuData const params, + float const *const __restrict__ part_pos, + float const *const __restrict__ part_q, + float *const __restrict__ part_f, REAL_TYPE prefactor, int parts_per_block) { auto const part_in_block = static_cast(threadIdx.x) / cao; auto const cao_id_x = static_cast(threadIdx.x) - part_in_block * cao; @@ -411,18 +413,16 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata, auto const id = parts_per_block * static_cast(blockIdx.x * gridDim.y + blockIdx.y) + part_in_block; - if (id >= par.n_part) + if (id >= params.n_part) return; /* position relative to the closest grid point */ REAL_TYPE m_pos[3]; /* index of the nearest mesh point */ int nmp_x, nmp_y, nmp_z; - const CUDA_particle_data p = pdata[id]; - - m_pos[0] = p.p[0] * par.hi[0] - par.pos_shift; - m_pos[1] = p.p[1] * par.hi[1] - par.pos_shift; - m_pos[2] = p.p[2] * par.hi[2] - par.pos_shift; + m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift; + m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift; + m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift; nmp_x = static_cast(floorf(m_pos[0] + REAL_TYPE{0.5})); nmp_y = static_cast(floorf(m_pos[1] + REAL_TYPE{0.5})); @@ -432,15 +432,15 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata, m_pos[1] -= static_cast(nmp_y); m_pos[2] -= static_cast(nmp_z); - nmp_x = wrap_index(nmp_x + cao_id_x, par.mesh[0]); - nmp_y = wrap_index(nmp_y + static_cast(threadIdx.y), par.mesh[1]); - nmp_z = wrap_index(nmp_z + static_cast(threadIdx.z), par.mesh[2]); + nmp_x = wrap_index(nmp_x + cao_id_x, params.mesh[0]); + nmp_y = wrap_index(nmp_y + static_cast(threadIdx.y), params.mesh[1]); + nmp_z = wrap_index(nmp_z + static_cast(threadIdx.z), params.mesh[2]); - REAL_TYPE c; - const int index = linear_index_r(par, nmp_x, nmp_y, nmp_z); + auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z); extern __shared__ float weights[]; + REAL_TYPE c; if (shared) { auto const offset = static_cast(cao * part_in_block); if ((threadIdx.y < 3) && (threadIdx.z == 0)) { @@ -453,25 +453,28 @@ __global__ void assign_forces_kernel(const CUDA_particle_data *const pdata, c = -prefactor * weights[3 * offset + 3 * static_cast(cao_id_x) + 0] * weights[3 * offset + 3 * threadIdx.y + 1] * - weights[3 * offset + 3 * threadIdx.z + 2] * p.q; + weights[3 * offset + 3 * threadIdx.z + 2] * part_q[id]; } else { - c = -prefactor * Utils::bspline(cao_id_x, m_pos[0]) * + c = -prefactor * part_q[id] * Utils::bspline(cao_id_x, m_pos[0]) * Utils::bspline(static_cast(threadIdx.y), m_pos[1]) * - Utils::bspline(static_cast(threadIdx.z), m_pos[2]) * p.q; + Utils::bspline(static_cast(threadIdx.z), m_pos[2]); } - const REAL_TYPE *force_mesh_x = (REAL_TYPE *)par.force_mesh_x; - const REAL_TYPE *force_mesh_y = (REAL_TYPE *)par.force_mesh_y; - const REAL_TYPE *force_mesh_z = (REAL_TYPE *)par.force_mesh_z; + const REAL_TYPE *force_mesh_x = (REAL_TYPE *)params.force_mesh_x; + const REAL_TYPE *force_mesh_y = (REAL_TYPE *)params.force_mesh_y; + const REAL_TYPE *force_mesh_z = (REAL_TYPE *)params.force_mesh_z; - atomicAdd(&(particle_force_gpu[3 * id + 0]), c * force_mesh_x[index]); - atomicAdd(&(particle_force_gpu[3 * id + 1]), c * force_mesh_y[index]); - atomicAdd(&(particle_force_gpu[3 * id + 2]), c * force_mesh_z[index]); + atomicAdd(&(part_f[3 * id + 0]), c * force_mesh_x[index]); + atomicAdd(&(part_f[3 * id + 1]), c * force_mesh_y[index]); + atomicAdd(&(part_f[3 * id + 2]), c * force_mesh_z[index]); } -void 
assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p, - float *particle_force_gpu, REAL_TYPE prefactor) { - auto const cao = p.cao; +void assign_forces(P3MGpuData const params, + float const *const __restrict__ part_pos, + float const *const __restrict__ part_q, + float *const __restrict__ part_f, + REAL_TYPE const prefactor) { + auto const cao = params.cao; auto const cao3 = int_pow<3>(cao); int parts_per_block = 1, n_blocks = 1; @@ -502,36 +505,36 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p, switch (cao) { case 1: (assign_forces_kernel<1, false>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; case 2: (assign_forces_kernel<2, false>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; case 3: (assign_forces_kernel< 3, true>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; case 4: (assign_forces_kernel< 4, true>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; case 5: (assign_forces_kernel< 5, true>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; case 6: (assign_forces_kernel< 6, true>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; case 7: (assign_forces_kernel< 7, true>)<<>>( - pdata, p, particle_force_gpu, prefactor, parts_per_block); + params, part_pos, part_q, part_f, prefactor, parts_per_block); break; default: break; @@ -549,10 +552,10 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1) throw std::runtime_error("P3M: invalid mesh size"); - auto &espresso_system = EspressoSystemInterface::Instance(); + auto &gpu_particle_data = System::Instance().gpu; bool do_reinit = false, mesh_changed = false; - p3m_gpu_data.n_part = static_cast(gpu_get_particle_pointer().size()); + p3m_gpu_data.n_part = static_cast(gpu_particle_data.n_particles()); if (!p3m_gpu_data_initialized || p3m_gpu_data.alpha != alpha) { p3m_gpu_data.alpha = static_cast(alpha); @@ -573,7 +576,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { do_reinit = true; } - auto const box_l = espresso_system.box(); + auto const box_l = System::Instance().box(); if (!p3m_gpu_data_initialized || (p3m_gpu_data.box[0] != box_l[0]) || (p3m_gpu_data.box[1] != box_l[1]) || (p3m_gpu_data.box[2] != box_l[2])) { @@ -677,15 +680,16 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { * \brief The long-range part of the P3M algorithm. 
*/ void p3m_gpu_add_farfield_force(double prefactor) { - auto device_particles = gpu_get_particle_pointer(); - CUDA_particle_data *particles_gpu = device_particles.data(); - p3m_gpu_data.n_part = static_cast(device_particles.size()); - - float *particle_force_gpu = gpu_get_particle_force_pointer(); + auto &gpu = System::Instance().gpu; + p3m_gpu_data.n_part = static_cast(gpu.n_particles()); if (p3m_gpu_data.n_part == 0) return; + auto const forces_device = gpu.get_particle_forces_device(); + auto const positions_device = gpu.get_particle_positions_device(); + auto const charges_device = gpu.get_particle_charges_device(); + dim3 gridConv(static_cast(p3m_gpu_data.mesh[0]), static_cast(p3m_gpu_data.mesh[1]), 1u); dim3 threadsConv(static_cast(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u); @@ -699,7 +703,7 @@ void p3m_gpu_add_farfield_force(double prefactor) { sizeof(REAL_TYPE))); /* Interpolate the charges to the mesh */ - assign_charges(particles_gpu, p3m_gpu_data); + assign_charges(p3m_gpu_data, positions_device, charges_device); /* Do forward FFT of the charge mesh */ if (FFT_FORW_FFT(p3m_gpu_fft_plans.forw_plan, @@ -724,7 +728,8 @@ void p3m_gpu_add_farfield_force(double prefactor) { (REAL_TYPE *)p3m_gpu_data.force_mesh_z); /* Assign the forces from the mesh back to the particles */ - assign_forces(particles_gpu, p3m_gpu_data, particle_force_gpu, pref); + assign_forces(p3m_gpu_data, positions_device, charges_device, forces_device, + pref); } #endif // ELECTROSTATICS diff --git a/src/core/energy.cpp b/src/core/energy.cpp index af29f288d5b..517d943a737 100644 --- a/src/core/energy.cpp +++ b/src/core/energy.cpp @@ -22,19 +22,17 @@ * Energy calculation. */ -#include "EspressoSystemInterface.hpp" #include "Observable_stat.hpp" #include "communication.hpp" #include "constraints.hpp" -#include "cuda/interface.hpp" #include "energy_inline.hpp" #include "event.hpp" #include "forces.hpp" #include "integrate.hpp" #include "interactions.hpp" #include "nonbonded_interactions/nonbonded_interaction_data.hpp" - #include "short_range_loop.hpp" +#include "system/System.hpp" #include "electrostatics/coulomb.hpp" #include "magnetostatics/dipoles.hpp" @@ -52,14 +50,11 @@ std::shared_ptr calculate_energy() { } auto &obs_energy = *obs_energy_ptr; - -#ifdef CUDA - clear_energy_on_GPU(); +#if defined(CUDA) and (defined(ELECTROSTATICS) or defined(DIPOLES)) + auto &gpu_particle_data = System::Instance().gpu; + gpu_particle_data.clear_energy_on_device(); + gpu_particle_data.update(); #endif - - auto &espresso_system = EspressoSystemInterface::Instance(); - espresso_system.update(); - on_observable_calc(); auto const local_parts = cell_structure.local_particles(); @@ -104,8 +99,8 @@ std::shared_ptr calculate_energy() { Constraints::constraints.add_energy(local_parts, get_sim_time(), obs_energy); -#ifdef CUDA - auto const energy_host = copy_energy_from_GPU(); +#if defined(CUDA) and (defined(ELECTROSTATICS) or defined(DIPOLES)) + auto const energy_host = gpu_particle_data.copy_energy_to_host(); if (!obs_energy.coulomb.empty()) obs_energy.coulomb[1] += static_cast(energy_host.coulomb); if (!obs_energy.dipolar.empty()) diff --git a/src/core/event.cpp b/src/core/event.cpp index fa9c2b25400..9c6997b3196 100644 --- a/src/core/event.cpp +++ b/src/core/event.cpp @@ -32,7 +32,6 @@ #include "communication.hpp" #include "config/config.hpp" #include "cuda/init.hpp" -#include "cuda/interface.hpp" #include "cuda/utils.hpp" #include "electrostatics/coulomb.hpp" #include "electrostatics/icc.hpp" @@ -101,11 +100,6 @@ void 
on_integration_start(double time_step) { /* end sanity checks */ /********************************************/ -#ifdef CUDA - MPI_Bcast(gpu_get_global_particle_vars_pointer_host(), - sizeof(CUDA_global_part_vars), MPI_BYTE, 0, comm_cart); -#endif - /* Prepare the thermostat */ if (reinit_thermo) { thermo_init(time_step); diff --git a/src/core/forces.cpp b/src/core/forces.cpp index b2db558000e..a5e3eefaeae 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -24,8 +24,6 @@ * The corresponding header file is forces.hpp. */ -#include "EspressoSystemInterface.hpp" - #include "bond_breakage/bond_breakage.hpp" #include "cell_system/CellStructure.hpp" #include "cells.hpp" @@ -48,6 +46,7 @@ #include "npt.hpp" #include "rotation.hpp" #include "short_range_loop.hpp" +#include "system/System.hpp" #include "thermostat.hpp" #include "thermostats/langevin_inline.hpp" #include "virtual_sites.hpp" @@ -146,8 +145,10 @@ void init_forces_ghosts(const ParticleRange &particles) { void force_calc(CellStructure &cell_structure, double time_step, double kT) { ESPRESSO_PROFILER_CXX_MARK_FUNCTION; - auto &espresso_system = EspressoSystemInterface::Instance(); - espresso_system.update(); +#ifdef CUDA + auto &espresso_system = System::Instance(); + espresso_system.gpu.update(); +#endif #ifdef COLLISION_DETECTION prepare_local_collision_queue(); @@ -230,7 +231,7 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { } #ifdef CUDA - copy_forces_from_GPU(particles, this_node); + espresso_system.gpu.copy_forces_to_host(particles, this_node); #endif // VIRTUAL_SITES distribute forces diff --git a/src/core/magnetostatics/barnes_hut_gpu.cpp b/src/core/magnetostatics/barnes_hut_gpu.cpp index efc1d0dae02..50caca8c445 100644 --- a/src/core/magnetostatics/barnes_hut_gpu.cpp +++ b/src/core/magnetostatics/barnes_hut_gpu.cpp @@ -24,11 +24,11 @@ #include "magnetostatics/barnes_hut_gpu.hpp" #include "magnetostatics/barnes_hut_gpu_cuda.cuh" -#include "EspressoSystemInterface.hpp" #include "communication.hpp" -#include "cuda/interface.hpp" #include "cuda/utils.hpp" #include "errorhandling.hpp" +#include "system/GpuParticleData.hpp" +#include "system/System.hpp" DipolarBarnesHutGpu::DipolarBarnesHutGpu(double prefactor, double epssq, double itolsq) @@ -42,11 +42,11 @@ DipolarBarnesHutGpu::DipolarBarnesHutGpu(double prefactor, double epssq, if (m_epssq <= 0.) 
{ throw std::domain_error("Parameter 'epssq' must be > 0"); } - auto &system = EspressoSystemInterface::Instance(); - system.requestFGpu(); - system.requestTorqueGpu(); - system.requestRGpu(); - system.requestDipGpu(); + auto &gpu_particle_data = System::Instance().gpu; + gpu_particle_data.enable_property(GpuParticleData::prop::force); + gpu_particle_data.enable_property(GpuParticleData::prop::torque); + gpu_particle_data.enable_property(GpuParticleData::prop::pos); + gpu_particle_data.enable_property(GpuParticleData::prop::dip); if (this_node == 0) { setBHPrecision(static_cast(m_epssq), static_cast(m_itolsq)); } @@ -65,12 +65,14 @@ int call_kernel(void (*fp)(Args...), ArgRef &&...args) { } int DipolarBarnesHutGpu::initialize_data_structure() { - auto &system = EspressoSystemInterface::Instance(); - auto const n_part = static_cast(system.npart_gpu()); + auto &gpu = System::Instance().gpu; + auto const n_part = static_cast(gpu.n_particles()); auto const error_code = call_kernel(allocBHmemCopy, n_part, &m_bh_data); if (error_code == ES_OK) { - fill_bh_data(system.rGpuBegin(), system.dipGpuBegin(), &m_bh_data); + auto const positions_device = gpu.get_particle_positions_device(); + auto const dipoles_device = gpu.get_particle_dipoles_device(); + fill_bh_data(positions_device, dipoles_device, &m_bh_data); initBHgpu(m_bh_data.blocks); buildBoxBH(m_bh_data.blocks); buildTreeBH(m_bh_data.blocks); @@ -82,22 +84,24 @@ int DipolarBarnesHutGpu::initialize_data_structure() { } void DipolarBarnesHutGpu::add_long_range_forces() { - auto &system = EspressoSystemInterface::Instance(); - system.update(); + auto &gpu_particle_data = System::Instance().gpu; + gpu_particle_data.update(); if (this_node == 0) { if (initialize_data_structure() == ES_OK) { + auto forces_device = gpu_particle_data.get_particle_forces_device(); + auto torques_device = gpu_particle_data.get_particle_torques_device(); call_kernel(forceBH, &m_bh_data, static_cast(prefactor), - system.fGpuBegin(), system.torqueGpuBegin()); + forces_device, torques_device); } } } void DipolarBarnesHutGpu::long_range_energy() { - auto &system = EspressoSystemInterface::Instance(); - system.update(); + auto &gpu_particle_data = System::Instance().gpu; + gpu_particle_data.update(); if (this_node == 0) { if (initialize_data_structure() == ES_OK) { - auto energy = &(reinterpret_cast(system.eGpu())->dipolar); + auto energy = &(gpu_particle_data.get_energy_device()->dipolar); call_kernel(energyBH, &m_bh_data, static_cast(prefactor), energy); } } diff --git a/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp b/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp index ef04ac0b633..17c6ae397ad 100644 --- a/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp +++ b/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp @@ -24,10 +24,10 @@ #include "magnetostatics/dipolar_direct_sum_gpu.hpp" #include "magnetostatics/dipolar_direct_sum_gpu_cuda.cuh" -#include "EspressoSystemInterface.hpp" #include "communication.hpp" -#include "cuda/interface.hpp" #include "grid.hpp" +#include "system/GpuParticleData.hpp" +#include "system/System.hpp" static void get_simulation_box(float *box, int *per) { for (int i = 0; i < 3; i++) { @@ -38,41 +38,48 @@ static void get_simulation_box(float *box, int *per) { DipolarDirectSumGpu::DipolarDirectSumGpu(double prefactor) : prefactor{prefactor} { - auto &system = EspressoSystemInterface::Instance(); - system.requestFGpu(); - system.requestTorqueGpu(); - system.requestRGpu(); - system.requestDipGpu(); + auto &gpu_particle_data = 
System::Instance().gpu; + gpu_particle_data.enable_property(GpuParticleData::prop::force); + gpu_particle_data.enable_property(GpuParticleData::prop::torque); + gpu_particle_data.enable_property(GpuParticleData::prop::pos); + gpu_particle_data.enable_property(GpuParticleData::prop::dip); } void DipolarDirectSumGpu::add_long_range_forces() const { - auto &system = EspressoSystemInterface::Instance(); - system.update(); + auto &gpu = System::Instance().gpu; + gpu.update(); if (this_node != 0) { return; } float box[3]; int periodicity[3]; get_simulation_box(box, periodicity); + auto const npart = static_cast(gpu.n_particles()); + auto const forces_device = gpu.get_particle_forces_device(); + auto const torques_device = gpu.get_particle_torques_device(); + auto const positions_device = gpu.get_particle_positions_device(); + auto const dipoles_device = gpu.get_particle_dipoles_device(); DipolarDirectSum_kernel_wrapper_force( - static_cast(prefactor), static_cast(system.npart_gpu()), - system.rGpuBegin(), system.dipGpuBegin(), system.fGpuBegin(), - system.torqueGpuBegin(), box, periodicity); + static_cast(prefactor), npart, positions_device, dipoles_device, + forces_device, torques_device, box, periodicity); } void DipolarDirectSumGpu::long_range_energy() const { - auto &system = EspressoSystemInterface::Instance(); - system.update(); + auto &gpu = System::Instance().gpu; + gpu.update(); if (this_node != 0) { return; } float box[3]; int periodicity[3]; get_simulation_box(box, periodicity); - auto energy = &(reinterpret_cast(system.eGpu())->dipolar); - DipolarDirectSum_kernel_wrapper_energy( - static_cast(prefactor), static_cast(system.npart_gpu()), - system.rGpuBegin(), system.dipGpuBegin(), box, periodicity, energy); + auto const npart = static_cast(gpu.n_particles()); + auto const energy_device = &(gpu.get_energy_device()->dipolar); + auto const positions_device = gpu.get_particle_positions_device(); + auto const dipoles_device = gpu.get_particle_dipoles_device(); + DipolarDirectSum_kernel_wrapper_energy(static_cast(prefactor), npart, + positions_device, dipoles_device, box, + periodicity, energy_device); } #endif diff --git a/src/core/serialization/CUDA_particle_data.hpp b/src/core/serialization/CUDA_particle_data.hpp deleted file mode 100644 index 1ea146f0734..00000000000 --- a/src/core/serialization/CUDA_particle_data.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ -#ifndef UTILS_SERIALIZATION_CUDA_PARTICLE_DATA_HPP -#define UTILS_SERIALIZATION_CUDA_PARTICLE_DATA_HPP - -#include "cuda/interface.hpp" - -#include -#include -#include - -BOOST_IS_BITWISE_SERIALIZABLE(CUDA_particle_data) - -namespace boost { -namespace serialization { -template -void load(Archive &ar, CUDA_particle_data &p, - const unsigned int /* file_version */) { - ar >> make_array(reinterpret_cast(&p), sizeof(CUDA_particle_data)); -} - -template -void save(Archive &ar, CUDA_particle_data const &p, - const unsigned int /* file_version */) { - /* Cruel but effective */ - ar << make_array(reinterpret_cast(&p), - sizeof(CUDA_particle_data)); -} - -template -void serialize(Archive &ar, CUDA_particle_data &p, - const unsigned int file_version) { - split_free(ar, p, file_version); -} -} // namespace serialization -} // namespace boost - -#endif diff --git a/src/core/system/GpuParticleData.cpp b/src/core/system/GpuParticleData.cpp new file mode 100644 index 00000000000..83a8ed4fd50 --- /dev/null +++ b/src/core/system/GpuParticleData.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (C) 2014-2022 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */
+
+#include "config/config.hpp"
+
+#ifdef CUDA
+
+#include "GpuParticleData.hpp"
+
+#include "cells.hpp"
+#include "communication.hpp"
+#include "cuda/CudaHostAllocator.hpp"
+#include "grid.hpp"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+void GpuParticleData::enable_particle_transfer() {
+  if (m_need_particles_update and not m_communication_enabled) {
+    if (::this_node == 0) {
+      gpu_init_particle_comm();
+    }
+    m_communication_enabled = true;
+  }
+}
+
+void GpuParticleData::copy_particles_to_device() {
+  copy_particles_to_device(::cell_structure.local_particles(), ::this_node);
+}
+
+BOOST_IS_BITWISE_SERIALIZABLE(GpuParticleData::GpuParticle)
+
+namespace boost {
+namespace serialization {
+template <class Archive>
+void load(Archive &ar, GpuParticleData::GpuParticle &p, unsigned const) {
+  ar >> make_array(reinterpret_cast<char *>(&p),
+                   sizeof(GpuParticleData::GpuParticle));
+}
+template <class Archive>
+void save(Archive &ar, GpuParticleData::GpuParticle const &p, unsigned const) {
+  ar << make_array(reinterpret_cast<char const *>(&p),
+                   sizeof(GpuParticleData::GpuParticle));
+}
+} // namespace serialization
+} // namespace boost
+
+BOOST_SERIALIZATION_SPLIT_FREE(GpuParticleData::GpuParticle)
+
+static void pack_particles(ParticleRange const &particles,
+                           GpuParticleData::GpuParticle *buffer) {
+  auto const &box_l = ::box_geo;
+  unsigned long int i = 0u;
+  for (auto const &p : particles) {
+    buffer[i].p = static_cast<Utils::Vector3f>(folded_position(p.pos(), box_l));
+#ifdef DIPOLES
+    buffer[i].dip = static_cast<Utils::Vector3f>(p.calc_dip());
+#endif
+#ifdef ELECTROSTATICS
+    buffer[i].q = static_cast<float>(p.q());
+#endif
+    buffer[i].identity = p.id();
+    i++;
+  }
+}
+
+void GpuParticleData::gather_particle_data(
+    ParticleRange const &particles,
+    pinned_vector<GpuParticle> &particle_data_host, int this_node) {
+  auto const n_part = particles.size();
+
+  if (this_node > 0) {
+    static std::vector<GpuParticle> buffer;
+    buffer.resize(n_part);
+    /* pack local parts into buffer */
+    pack_particles(particles, buffer.data());
+
+    Utils::Mpi::gather_buffer(buffer, comm_cart);
+  } else {
+    particle_data_host.resize(n_part);
+
+    /* Pack own particles */
+    pack_particles(particles, particle_data_host.data());
+
+    Utils::Mpi::gather_buffer(particle_data_host, comm_cart);
+  }
+}
+
+/**
+ * @brief Add a flat force (and torque) array to a range of particles.
+ *
+ * @param particles The particles the forces (and torques) should be added to
+ * @param forces The forces as flat array of size 3 * particles.size()
+ * @param torques The torques as flat array of size 3 * particles.size(),
+ *        this is only touched if ROTATION is active.
+ */
+static void add_forces_and_torques(ParticleRange const &particles,
+                                   Utils::Span<const float> forces,
+                                   Utils::Span<const float> torques) {
+  unsigned long int i = 0u;
+  for (auto &p : particles) {
+    for (unsigned long int j = 0u; j < 3u; j++) {
+      p.force()[j] += static_cast<double>(forces[3ul * i + j]);
+#ifdef ROTATION
+      p.torque()[j] += static_cast<double>(torques[3ul * i + j]);
+#endif
+    }
+    i++;
+  }
+}
+
+/**
+ * @brief Distribute forces to the worker nodes, and add them to the particles.
+ *
+ * @param particles The particles for which the forces (and torques) should
+ *        be added to.
+ * @param host_forces The forces as flat array of size 3 * particles.size(),
+ *        only relevant on the head node.
+ * @param host_torques The torques as flat array of size 3 * particles.size(),
+ *        this is only touched if ROTATION is active. Only
+ *        relevant on the head node.
+ */ +void GpuParticleData::particles_scatter_forces( + ParticleRange const &particles, Utils::Span host_forces, + Utils::Span host_torques) const { + + auto const size = 3ul * particles.size(); + auto const n_elements = static_cast(size); + + if (::this_node > 0) { + static std::vector buffer_forces; + static std::vector buffer_torques; + + buffer_forces.resize(size); + Utils::Mpi::scatter_buffer(buffer_forces.data(), n_elements, ::comm_cart); +#ifdef ROTATION + buffer_torques.resize(size); + Utils::Mpi::scatter_buffer(buffer_torques.data(), n_elements, ::comm_cart); +#endif + add_forces_and_torques(particles, buffer_forces, buffer_torques); + } else { + Utils::Mpi::scatter_buffer(host_forces.data(), n_elements, ::comm_cart); +#ifdef ROTATION + Utils::Mpi::scatter_buffer(host_torques.data(), n_elements, ::comm_cart); +#endif + add_forces_and_torques(particles, host_forces, host_torques); + } +} + +#endif diff --git a/src/core/system/GpuParticleData.hpp b/src/core/system/GpuParticleData.hpp new file mode 100644 index 00000000000..4c7659e7156 --- /dev/null +++ b/src/core/system/GpuParticleData.hpp @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2014-2022 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once + +#include "config/config.hpp" + +#ifdef CUDA + +#include "ParticleRange.hpp" +#include "cuda/CudaHostAllocator.hpp" + +#include +#include + +#include +#include +#include + +/** + * @brief Particle data communication manager for the GPU. + * + * When data is synchronized between host and device memory, a subset + * of the @ref Particle struct is copied from each particle on the host + * to the corresponding @ref GpuParticle struct on the device via + * @ref GpuParticleData::update(). Once the transfer is complete, + * the particle AoS on the device is copied (or "split") to a SoA + * automatically. + * + * Note that once a particle member is requested, the corresponding device + * memory is allocated and populated at every time step, even when the GPU + * method that originally requested the data is disabled. + */ +class GpuParticleData { +public: + /** @brief Particle properties that need to be communicated to the GPU. */ + struct prop { + static constexpr std::size_t pos = 0; + static constexpr std::size_t force = 1; + static constexpr std::size_t torque = 2; + static constexpr std::size_t q = 3; + static constexpr std::size_t dip = 4; + using bitset = std::bitset<5>; + }; + + /** @brief Energies that are retrieved from the GPU. */ + struct GpuEnergy { + float coulomb, dipolar; + }; + + /** @brief Subset of @ref Particle which is copied to the GPU. */ + struct GpuParticle { + Utils::Vector3f p; +#ifdef DIPOLES + Utils::Vector3f dip; +#endif +#ifdef ELECTROSTATICS + float q; +#endif + int identity; + }; + +private: + // forward declare + struct Storage; + /** @brief Whether a device was found and data structures were allocated. 
*/ + bool m_communication_enabled = false; + /** @brief Whether to convert particle properties from AoS to SoA. */ + bool m_split_particle_struct = false; + /** @brief Whether particle transfer to the GPU was requested. */ + bool m_need_particles_update = false; + /** @brief Host and device containers. */ + std::unique_ptr m_data; + + void gpu_init_particle_comm(); + void enable_particle_transfer(); + void copy_particles_to_device(); + void copy_particles_to_device(ParticleRange const &particles, int this_node); + /** @brief Collect particles from all nodes to the head node. */ + void gather_particle_data(ParticleRange const &particles, + pinned_vector &particle_data_host, + int this_node); + void particles_scatter_forces(ParticleRange const &particles, + Utils::Span host_forces, + Utils::Span host_torques) const; + +public: + GpuParticleData(); + ~GpuParticleData(); + + void update() { + if (m_need_particles_update and m_communication_enabled) { + copy_particles_to_device(); + } + } + void init() { update(); } + void enable_property(std::size_t property); + void clear_energy_on_device(); + void copy_forces_to_host(ParticleRange const &particles, int this_node); + std::size_t n_particles() const; + + GpuEnergy copy_energy_to_host() const; + GpuEnergy *get_energy_device() const; + float *get_particle_positions_device() const; + float *get_particle_forces_device() const; +#ifdef ROTATION + float *get_particle_torques_device() const; +#endif +#ifdef DIPOLES + float *get_particle_dipoles_device() const; +#endif +#ifdef ELECTROSTATICS + float *get_particle_charges_device() const; +#endif +}; + +#endif // CUDA diff --git a/src/core/system/GpuParticleData_cuda.cu b/src/core/system/GpuParticleData_cuda.cu new file mode 100644 index 00000000000..b129fc3d465 --- /dev/null +++ b/src/core/system/GpuParticleData_cuda.cu @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2014-2022 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +/** + * @file + * CUDA kernels to convert the particles AoS to a SoA on the device. + */ + +#include "config/config.hpp" + +#include "GpuParticleData.hpp" + +#include "ParticleRange.hpp" +#include "errorhandling.hpp" + +#include "cuda/CudaDeviceAllocator.hpp" +#include "cuda/init.hpp" +#include "cuda/utils.cuh" + +#include + +#include +#include + +#include + +#include +#include +#include + +#if defined(OMPI_MPI_H) || defined(_MPI_H) +#error CU-file includes mpi.h! This should not happen! +#endif + +template +using device_vector = thrust::device_vector>; + +template +T *raw_data_pointer(thrust::device_vector &vec) { + return thrust::raw_pointer_cast(vec.data()); +} + +template std::size_t byte_size(SpanLike const &v) { + return v.size() * sizeof(typename SpanLike::value_type); +} + +/** + * @brief Resize a @ref device_vector. 
+ * + * Due to a bug in thrust (https://github.com/thrust/thrust/issues/939), + * resizing or appending to default constructed containers causes undefined + * behavior by dereferencing a null-pointer for certain types. This + * function is used instead of the resize member function to side-step + * the problem. This is done by replacing the existing vector by a new + * one constructed with the desired size if resizing from capacity zero. + * Behaves as-if vec.resize(n) was called. + * This is fixed in Thrust 1.11, shipped in CUDA 11.3 + * (https://github.com/NVIDIA/thrust/commit/1c4f25d9). + * + * @tparam T Type contained in the vector. + * @param vec Vector To resize. + * @param n Desired new size of the element. + */ +template +void resize_or_replace(device_vector &vec, std::size_t n) { + if (vec.capacity() == 0) { + vec = device_vector(n); + } else { + vec.resize(n); + } +} + +/** @brief Host and device containers for particle data. */ +struct GpuParticleData::Storage { + /** @brief Which particle properties are needed by GPU methods. */ + GpuParticleData::prop::bitset m_need; + GpuParticleData::GpuEnergy *energy_device = nullptr; + std::size_t current_size = 0ul; + pinned_vector particle_data_host; + device_vector particle_data_device; + pinned_vector particle_forces_host; + device_vector particle_forces_device; +#ifdef ROTATION + pinned_vector particle_torques_host; + device_vector particle_torques_device; +#endif + float *particle_pos_device = nullptr; +#ifdef DIPOLES + float *particle_dip_device = nullptr; +#endif +#ifdef ELECTROSTATICS + float *particle_q_device = nullptr; +#endif + + void realloc_device_memory(); + void split_particle_struct(); + void copy_particles_to_device(); + void copy_particle_forces_to_host() { + if (not particle_forces_device.empty()) { + thrust::copy(particle_forces_device.begin(), particle_forces_device.end(), + particle_forces_host.begin()); + } + } +#ifdef ROTATION + void copy_particle_torques_to_host() { + if (not particle_torques_device.empty()) { + thrust::copy(particle_torques_device.begin(), + particle_torques_device.end(), + particle_torques_host.begin()); + } + } +#endif + Utils::Span get_particle_forces_host_span() { + return {particle_forces_host.data(), particle_forces_host.size()}; + } +#ifdef ROTATION + Utils::Span get_particle_torques_host_span() { + return {particle_torques_host.data(), particle_torques_host.size()}; + } +#endif +}; + +GpuParticleData::GpuParticleData() { + m_data = std::make_unique(); +} + +GpuParticleData::~GpuParticleData() {} + +std::size_t GpuParticleData::n_particles() const { + return m_data->particle_data_device.size(); +} + +float *GpuParticleData::get_particle_positions_device() const { + return m_data->particle_pos_device; +} + +float *GpuParticleData::get_particle_forces_device() const { + return raw_data_pointer(m_data->particle_forces_device); +} + +#ifdef ROTATION +float *GpuParticleData::get_particle_torques_device() const { + return raw_data_pointer(m_data->particle_torques_device); +} +#endif + +#ifdef DIPOLES +float *GpuParticleData::get_particle_dipoles_device() const { + return m_data->particle_dip_device; +} +#endif + +#ifdef ELECTROSTATICS +float *GpuParticleData::get_particle_charges_device() const { + return m_data->particle_q_device; +} +#endif + +GpuParticleData::GpuEnergy *GpuParticleData::get_energy_device() const { + return m_data->energy_device; +} + +void GpuParticleData::enable_property(std::size_t property) { + m_need_particles_update = true; + m_data->m_need[property] = 
true; + if (property != prop::force and property != prop::torque) { + m_split_particle_struct = true; + } + enable_particle_transfer(); +} + +/** + * @brief Setup and call particle reallocation from the host. + */ +void GpuParticleData::gpu_init_particle_comm() { + try { + cuda_check_device(); + } catch (cuda_runtime_error const &err) { + fprintf(stderr, "ERROR: %s\n", err.what()); + errexit(); + } + m_data->realloc_device_memory(); +} + +void GpuParticleData::Storage::copy_particles_to_device() { + // resize buffers + auto const n_part = particle_data_host.size(); + resize_or_replace(particle_data_device, n_part); + particle_forces_host.resize(3ul * n_part); + resize_or_replace(particle_forces_device, 3ul * n_part); +#ifdef ROTATION + particle_torques_host.resize(3ul * n_part); + resize_or_replace(particle_torques_device, 3ul * n_part); +#endif + + // zero out device memory for forces and torques + cudaMemsetAsync(raw_data_pointer(particle_forces_device), 0x0, + byte_size(particle_forces_device), stream[0]); +#ifdef ROTATION + cudaMemsetAsync(raw_data_pointer(particle_torques_device), 0x0, + byte_size(particle_torques_device), stream[0]); +#endif + + // copy particles to device + cudaMemcpyAsync(raw_data_pointer(particle_data_device), + particle_data_host.data(), byte_size(particle_data_host), + cudaMemcpyHostToDevice, stream[0]); +} + +void GpuParticleData::copy_particles_to_device(ParticleRange const &particles, + int this_node) { + if (m_communication_enabled) { + gather_particle_data(particles, m_data->particle_data_host, this_node); + if (this_node == 0) { + m_data->copy_particles_to_device(); + if (m_split_particle_struct) { + m_data->realloc_device_memory(); + m_data->split_particle_struct(); + } + } + } +} + +void GpuParticleData::copy_forces_to_host(ParticleRange const &particles, + int this_node) { + if (m_communication_enabled) { + // copy results from device memory to host memory + if (this_node == 0) { + m_data->copy_particle_forces_to_host(); +#ifdef ROTATION + m_data->copy_particle_torques_to_host(); +#endif + } + + auto forces_buffer = m_data->get_particle_forces_host_span(); +#ifdef ROTATION + auto torques_buffer = m_data->get_particle_torques_host_span(); +#else + auto torques_buffer = Utils::Span{nullptr, std::size_t{0ul}}; +#endif + + // add forces and torques to the particles + particles_scatter_forces(particles, forces_buffer, torques_buffer); + } +} + +void GpuParticleData::clear_energy_on_device() { + if (m_communication_enabled) { + if (m_data->energy_device == nullptr) { + cuda_safe_mem(cudaMalloc(&m_data->energy_device, sizeof(GpuEnergy))); + } + cuda_safe_mem(cudaMemset(m_data->energy_device, 0, sizeof(GpuEnergy))); + } +} + +GpuParticleData::GpuEnergy GpuParticleData::copy_energy_to_host() const { + GpuEnergy energy_host{}; + if (m_communication_enabled) { + cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device, + sizeof(GpuEnergy), cudaMemcpyDeviceToHost)); + } + return energy_host; +} + +// Position only +__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles, + float *r, std::size_t n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) + return; + + auto const &p = particles[idx]; + idx *= 3u; + r[idx + 0u] = p.p[0u]; + r[idx + 1u] = p.p[1u]; + r[idx + 2u] = p.p[2u]; +} + +#ifdef ELECTROSTATICS +// Position and charge +__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles, + float *r, float *q, std::size_t n) { + auto const idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) + return; + + 
auto const &p = particles[idx]; + r[3u * idx + 0u] = p.p[0u]; + r[3u * idx + 1u] = p.p[1u]; + r[3u * idx + 2u] = p.p[2u]; + q[idx] = p.q; +} + +// Charge only +__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles, + float *q, std::size_t n) { + auto const idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) + return; + + auto const &p = particles[idx]; + q[idx] = p.q; +} +#endif + +#ifdef DIPOLES +// Dipole moment +__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles, + float *dip, std::size_t n) { + auto idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) + return; + + auto const &p = particles[idx]; + + idx *= 3u; + + dip[idx + 0u] = p.dip[0u]; + dip[idx + 1u] = p.dip[1u]; + dip[idx + 2u] = p.dip[2u]; +} +#endif + +void GpuParticleData::Storage::split_particle_struct() { + auto const n_part = particle_data_device.size(); + if (n_part == 0ul) + return; + + using prop = GpuParticleData::prop; + dim3 const threadsPerBlock{512u, 1u, 1u}; + dim3 const numBlocks{static_cast(n_part / threadsPerBlock.x + 1ul)}; + +#ifdef ELECTROSTATICS + if (m_need[prop::q] and m_need[prop::pos]) { + split_kernel_rq<<>>( + raw_data_pointer(particle_data_device), particle_pos_device, + particle_q_device, n_part); + } else if (m_need[prop::q]) { + split_kernel_q<<>>( + raw_data_pointer(particle_data_device), particle_q_device, n_part); + } else +#endif + if (m_need[prop::pos]) { + split_kernel_r<<>>( + raw_data_pointer(particle_data_device), particle_pos_device, n_part); + } +#ifdef DIPOLES + if (m_need[prop::dip]) { + split_kernel_dip<<>>( + raw_data_pointer(particle_data_device), particle_dip_device, n_part); + } +#endif +} + +void GpuParticleData::Storage::realloc_device_memory() { + using prop = GpuParticleData::prop; + auto const new_size = particle_data_device.size(); + auto const resize_needed = new_size != current_size; + if (m_need[prop::pos] and (resize_needed or particle_pos_device == nullptr)) { + if (particle_pos_device != nullptr) { + cuda_safe_mem(cudaFree(particle_pos_device)); + } + cuda_safe_mem( + cudaMalloc(&particle_pos_device, 3ul * new_size * sizeof(float))); + } +#ifdef DIPOLES + if (m_need[prop::dip] and (resize_needed or particle_dip_device == nullptr)) { + if (particle_dip_device != nullptr) { + cuda_safe_mem(cudaFree(particle_dip_device)); + } + cuda_safe_mem( + cudaMalloc(&particle_dip_device, 3ul * new_size * sizeof(float))); + } +#endif +#ifdef ELECTROSTATICS + if (m_need[prop::q] and (resize_needed or particle_q_device == nullptr)) { + if (particle_q_device != nullptr) { + cuda_safe_mem(cudaFree(particle_q_device)); + } + cuda_safe_mem(cudaMalloc(&particle_q_device, new_size * sizeof(float))); + } +#endif + current_size = new_size; +} diff --git a/src/core/SystemInterface.cpp b/src/core/system/System.cpp similarity index 75% rename from src/core/SystemInterface.cpp rename to src/core/system/System.cpp index 47df16f2115..44f083d7d54 100644 --- a/src/core/SystemInterface.cpp +++ b/src/core/system/System.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2022 The ESPResSo project + * Copyright (C) 2014-2022 The ESPResSo project * * This file is part of ESPResSo. * @@ -16,5 +16,12 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -#include "SystemInterface.hpp" -#include "config/config.hpp" + +#include "System.hpp" +#include "grid.hpp" + +#include + +System *System::m_instance = nullptr; + +Utils::Vector3d System::box() const { return ::box_geo.length(); } diff --git a/src/core/system/System.hpp b/src/core/system/System.hpp new file mode 100644 index 00000000000..a102f7c1c89 --- /dev/null +++ b/src/core/system/System.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014-2022 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once + +#include "config/config.hpp" + +#include "GpuParticleData.hpp" + +#include + +#include + +class System { +public: + static System &Instance() { + if (!m_instance) + m_instance = new System; + + return *m_instance; + } + +#ifdef CUDA + GpuParticleData gpu; +#endif // ifdef CUDA + + Utils::Vector3d box() const; + +protected: + static System *m_instance; +}; diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt index f623143c189..a94a59072d8 100644 --- a/src/core/unit_tests/CMakeLists.txt +++ b/src/core/unit_tests/CMakeLists.txt @@ -32,9 +32,8 @@ unit_test(NAME EspressoSystemStandAlone_parallel_test SRC unit_test(NAME EspressoSystemStandAlone_serial_test SRC EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 1) -unit_test(NAME EspressoSystemInterface_test SRC - EspressoSystemInterface_test.cpp DEPENDS espresso::core Boost::mpi - NUM_PROC 2) +unit_test(NAME EspressoSystem_test SRC EspressoSystem_test.cpp DEPENDS + espresso::core Boost::mpi NUM_PROC 2) unit_test(NAME MpiCallbacks_test SRC MpiCallbacks_test.cpp DEPENDS espresso::utils Boost::mpi MPI::MPI_CXX NUM_PROC 2) unit_test(NAME ParticleIterator_test SRC ParticleIterator_test.cpp DEPENDS diff --git a/src/core/unit_tests/EspressoSystemInterface_test.cpp b/src/core/unit_tests/EspressoSystemInterface_test.cpp deleted file mode 100644 index 564ec771e52..00000000000 --- a/src/core/unit_tests/EspressoSystemInterface_test.cpp +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (C) 2021-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#define BOOST_TEST_NO_MAIN -#define BOOST_TEST_MODULE EspressoSystemInterface test -#define BOOST_TEST_ALTERNATIVE_INIT_API -#define BOOST_TEST_DYN_LINK -#include - -#include "ParticleFactory.hpp" - -#include "EspressoSystemInterface.hpp" -#include "Particle.hpp" -#include "cells.hpp" -#include "communication.hpp" -#include "particle_node.hpp" - -#include "config/config.hpp" - -#include - -#include -#include - -namespace espresso { -static auto &system = EspressoSystemInterface::Instance(); -} - -inline void check_uninitialized_device_pointers() { - BOOST_CHECK_EQUAL(espresso::system.eGpu(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.rGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.dipGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.fGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.torqueGpuBegin(), nullptr); - BOOST_CHECK_EQUAL(espresso::system.qGpuBegin(), nullptr); -} - -#ifdef CUDA - -#include "cuda/init.hpp" -#include "cuda/utils.hpp" - -/* Decorator to run a unit test depending on GPU availability. */ -boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { - bool has_compatible_gpu = false; - try { - cuda_check_device(); - has_compatible_gpu = true; - } catch (cuda_runtime_error const &) { - } - return has_compatible_gpu; -} - -BOOST_FIXTURE_TEST_CASE(check_with_gpu, ParticleFactory, - *boost::unit_test::precondition(has_gpu)) { - auto const rank = boost::mpi::communicator().rank(); - - check_uninitialized_device_pointers(); - BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - - auto const p_id = 1; - create_particle({0., 0., 0.}, p_id, 0); - - BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - espresso::system.update(); - BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - espresso::system.requestParticleStructGpu(); - espresso::system.update(); - BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), (rank == 0) ? 
1 : 0); - - // check position split - BOOST_TEST(espresso::system.hasRGpu()); - espresso::system.requestRGpu(); - espresso::system.update(); - if (rank == 0) { - BOOST_TEST(espresso::system.rGpuBegin() != nullptr); - } else { - BOOST_TEST(espresso::system.rGpuBegin() == nullptr); - } - - // check force split - BOOST_TEST(espresso::system.hasFGpu()); - espresso::system.requestFGpu(); - espresso::system.update(); - if (rank == 0) { - BOOST_TEST(espresso::system.fGpuBegin() != nullptr); - } else { - BOOST_TEST(espresso::system.fGpuBegin() == nullptr); - } - - // check torque split -#ifdef ROTATION - BOOST_CHECK(espresso::system.hasTorqueGpu()); - espresso::system.requestTorqueGpu(); - espresso::system.update(); - if (rank == 0) { - BOOST_TEST(espresso::system.torqueGpuBegin() != nullptr); - } else { - BOOST_TEST(espresso::system.torqueGpuBegin() == nullptr); - } -#else - BOOST_CHECK(!espresso::system.hasTorqueGpu()); - BOOST_CHECK_THROW(espresso::system.requestTorqueGpu(), std::runtime_error); -#endif - - // check charge split -#ifdef ELECTROSTATICS - BOOST_CHECK(espresso::system.hasQGpu()); - espresso::system.requestQGpu(); - espresso::system.update(); - if (rank == 0) { - BOOST_TEST(espresso::system.qGpuBegin() != nullptr); - } else { - BOOST_TEST(espresso::system.qGpuBegin() == nullptr); - } -#else - BOOST_CHECK(!espresso::system.hasQGpu()); - BOOST_CHECK_THROW(espresso::system.requestQGpu(), std::runtime_error); -#endif - - // check dipole split -#ifdef DIPOLES - BOOST_CHECK(espresso::system.hasDipGpu()); - espresso::system.requestDipGpu(); - espresso::system.update(); - if (rank == 0) { - BOOST_TEST(espresso::system.dipGpuBegin() != nullptr); - } else { - BOOST_TEST(espresso::system.dipGpuBegin() == nullptr); - } -#else - BOOST_CHECK(!espresso::system.hasDipGpu()); - BOOST_CHECK_THROW(espresso::system.requestDipGpu(), std::runtime_error); -#endif - - // clear device memory - remove_particle(p_id); - espresso::system.update(); - BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); -} - -#else // CUDA - -BOOST_AUTO_TEST_CASE(check_without_cuda) { - check_uninitialized_device_pointers(); - BOOST_CHECK_EQUAL(espresso::system.npart_gpu(), 0); - BOOST_TEST(!espresso::system.hasRGpu()); - BOOST_TEST(!espresso::system.hasDipGpu()); - BOOST_TEST(!espresso::system.hasFGpu()); - BOOST_TEST(!espresso::system.hasTorqueGpu()); - BOOST_TEST(!espresso::system.hasQGpu()); - BOOST_CHECK_THROW(espresso::system.requestRGpu(), std::runtime_error); - BOOST_CHECK_THROW(espresso::system.requestDipGpu(), std::runtime_error); - BOOST_CHECK_THROW(espresso::system.requestFGpu(), std::runtime_error); - BOOST_CHECK_THROW(espresso::system.requestTorqueGpu(), std::runtime_error); - BOOST_CHECK_THROW(espresso::system.requestQGpu(), std::runtime_error); -} - -#endif // CUDA - -int main(int argc, char **argv) { - auto mpi_env = mpi_init(argc, argv); - - // initialize the MpiCallbacks framework - Communication::init(mpi_env); - - espresso::system.init(); - - return boost::unit_test::unit_test_main(init_unit_test, argc, argv); -} diff --git a/src/core/unit_tests/EspressoSystem_test.cpp b/src/core/unit_tests/EspressoSystem_test.cpp new file mode 100644 index 00000000000..71bbbca6f0c --- /dev/null +++ b/src/core/unit_tests/EspressoSystem_test.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2021-2022 The ESPResSo project + * + * This file is part of ESPResSo. 
+ * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#define BOOST_TEST_NO_MAIN +#define BOOST_TEST_MODULE System test +#define BOOST_TEST_ALTERNATIVE_INIT_API +#define BOOST_TEST_DYN_LINK +#include + +#include "config/config.hpp" + +#ifdef CUDA + +#include "ParticleFactory.hpp" + +#include "Particle.hpp" +#include "cells.hpp" +#include "communication.hpp" +#include "particle_node.hpp" +#include "system/GpuParticleData.hpp" +#include "system/System.hpp" + +#include "cuda/init.hpp" +#include "cuda/utils.hpp" + +#include + +#include +#include + +namespace espresso { +static auto &system = System::Instance(); +} + +/* Decorator to run a unit test depending on GPU availability. */ +boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) { + bool has_compatible_gpu = false; + try { + cuda_check_device(); + has_compatible_gpu = true; + } catch (cuda_runtime_error const &) { + } + return has_compatible_gpu; +} + +BOOST_FIXTURE_TEST_CASE(check_with_gpu, ParticleFactory, + *boost::unit_test::precondition(has_gpu)) { + auto const rank = boost::mpi::communicator().rank(); + auto &gpu = espresso::system.gpu; + + // check uninitialized device pointers + BOOST_CHECK_EQUAL(gpu.get_energy_device(), nullptr); + BOOST_CHECK_EQUAL(gpu.get_particle_positions_device(), nullptr); +#ifdef DIPOLES + BOOST_CHECK_EQUAL(gpu.get_particle_dipoles_device(), nullptr); +#endif + BOOST_CHECK_EQUAL(gpu.get_particle_forces_device(), nullptr); +#ifdef ROTATION + BOOST_CHECK_EQUAL(gpu.get_particle_torques_device(), nullptr); +#endif +#ifdef ELECTROSTATICS + BOOST_CHECK_EQUAL(gpu.get_particle_charges_device(), nullptr); +#endif + BOOST_CHECK_EQUAL(gpu.n_particles(), 0); + + auto const p_id = 1; + create_particle({0., 0., 0.}, p_id, 0); + + BOOST_CHECK_EQUAL(gpu.n_particles(), 0); + gpu.update(); + BOOST_CHECK_EQUAL(gpu.n_particles(), 0); + + // check position split + gpu.enable_property(GpuParticleData::prop::pos); + gpu.update(); + BOOST_CHECK_EQUAL(gpu.n_particles(), (rank == 0) ? 
1 : 0); + if (rank == 0) { + BOOST_TEST(gpu.get_particle_positions_device() != nullptr); + } else { + BOOST_TEST(gpu.get_particle_positions_device() == nullptr); + } + + // check force split + gpu.enable_property(GpuParticleData::prop::force); + gpu.update(); + if (rank == 0) { + BOOST_TEST(gpu.get_particle_forces_device() != nullptr); + } else { + BOOST_TEST(gpu.get_particle_forces_device() == nullptr); + } + + // check torque split +#ifdef ROTATION + gpu.enable_property(GpuParticleData::prop::torque); + gpu.update(); + if (rank == 0) { + BOOST_TEST(gpu.get_particle_torques_device() != nullptr); + } else { + BOOST_TEST(gpu.get_particle_torques_device() == nullptr); + } +#endif + + // check charge split +#ifdef ELECTROSTATICS + gpu.enable_property(GpuParticleData::prop::q); + gpu.update(); + if (rank == 0) { + BOOST_TEST(gpu.get_particle_charges_device() != nullptr); + } else { + BOOST_TEST(gpu.get_particle_charges_device() == nullptr); + } +#endif + + // check dipole split +#ifdef DIPOLES + gpu.enable_property(GpuParticleData::prop::dip); + gpu.update(); + if (rank == 0) { + BOOST_TEST(gpu.get_particle_dipoles_device() != nullptr); + } else { + BOOST_TEST(gpu.get_particle_dipoles_device() == nullptr); + } +#endif + + // clear device memory + remove_particle(p_id); + gpu.update(); + BOOST_CHECK_EQUAL(gpu.n_particles(), 0); +} + +int main(int argc, char **argv) { + auto mpi_env = mpi_init(argc, argv); + + // initialize the MpiCallbacks framework + Communication::init(mpi_env); + + espresso::system.gpu.init(); + + return boost::unit_test::unit_test_main(init_unit_test, argc, argv); +} + +#else // CUDA +int main(int argc, char **argv) {} +#endif From 511876fd02c75a3a2ba8e2bb01c428e6fab199f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Thu, 8 Jun 2023 20:19:53 +0200 Subject: [PATCH 4/5] script_interface: Rewrite System class --- src/core/EspressoSystemStandAlone.cpp | 2 + src/core/electrostatics/mmm1d_gpu.cpp | 4 +- src/core/electrostatics/mmm1d_gpu_cuda.cu | 10 +- src/core/electrostatics/p3m_gpu.cpp | 2 +- src/core/electrostatics/p3m_gpu.hpp | 1 + src/core/electrostatics/p3m_gpu_cuda.cu | 7 +- src/core/energy.cpp | 2 +- src/core/forces.cpp | 2 +- src/core/magnetostatics/barnes_hut_gpu.cpp | 8 +- .../magnetostatics/dipolar_direct_sum_gpu.cpp | 6 +- src/core/system/System.cpp | 14 ++- src/core/system/System.hpp | 19 ++-- src/core/unit_tests/EspressoSystem_test.cpp | 2 +- src/python/espressomd/system.py | 107 +++++------------- .../electrostatics/CoulombP3MGPU.hpp | 1 - src/script_interface/system/CMakeLists.txt | 1 - src/script_interface/system/Globals.cpp | 81 ------------- src/script_interface/system/Globals.hpp | 62 ---------- src/script_interface/system/System.cpp | 78 ++++++++++++- src/script_interface/system/System.hpp | 12 +- src/script_interface/system/SystemFacade.hpp | 73 ++++++++++++ src/script_interface/system/initialize.cpp | 4 +- 22 files changed, 236 insertions(+), 262 deletions(-) delete mode 100644 src/script_interface/system/Globals.cpp delete mode 100644 src/script_interface/system/Globals.hpp create mode 100644 src/script_interface/system/SystemFacade.hpp diff --git a/src/core/EspressoSystemStandAlone.cpp b/src/core/EspressoSystemStandAlone.cpp index 024fd403b33..3e502b9a630 100644 --- a/src/core/EspressoSystemStandAlone.cpp +++ b/src/core/EspressoSystemStandAlone.cpp @@ -24,6 +24,7 @@ #include "event.hpp" #include "grid.hpp" #include "integrate.hpp" +#include "system/System.hpp" #include "virtual_sites.hpp" #include 
"virtual_sites/VirtualSitesOff.hpp" @@ -46,6 +47,7 @@ EspressoSystemStandAlone::EspressoSystemStandAlone(int argc, char **argv) { #ifdef VIRTUAL_SITES set_virtual_sites(std::make_shared()); #endif + ::System::set_system(std::make_shared<::System::System>()); } void EspressoSystemStandAlone::set_box_l(Utils::Vector3d const &box_l) const { diff --git a/src/core/electrostatics/mmm1d_gpu.cpp b/src/core/electrostatics/mmm1d_gpu.cpp index 1764a4a2376..0be54f7992b 100644 --- a/src/core/electrostatics/mmm1d_gpu.cpp +++ b/src/core/electrostatics/mmm1d_gpu.cpp @@ -52,7 +52,7 @@ CoulombMMM1DGpu::CoulombMMM1DGpu(double prefactor, double maxPWerror, throw std::domain_error("Parameter 'bessel_cutoff' must be > 0"); } - auto &gpu_particle_data = System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.enable_property(GpuParticleData::prop::force); gpu_particle_data.enable_property(GpuParticleData::prop::pos); gpu_particle_data.enable_property(GpuParticleData::prop::q); @@ -75,7 +75,7 @@ void CoulombMMM1DGpu::sanity_checks_cell_structure() const { } void CoulombMMM1DGpu::tune() { - System::Instance().gpu.init(); + System::get_system().gpu.init(); if (this_node == 0) { setup(); tune(maxPWerror, far_switch_radius, bessel_cutoff); diff --git a/src/core/electrostatics/mmm1d_gpu_cuda.cu b/src/core/electrostatics/mmm1d_gpu_cuda.cu index a7c6216b14d..d1ab96b952e 100644 --- a/src/core/electrostatics/mmm1d_gpu_cuda.cu +++ b/src/core/electrostatics/mmm1d_gpu_cuda.cu @@ -146,8 +146,8 @@ static auto numBlocks(std::size_t n_part) { } void CoulombMMM1DGpu::setup() { - auto &gpu_particle_data = System::Instance().gpu; - auto const box_z = static_cast(System::Instance().box()[2]); + auto &gpu_particle_data = System::get_system().gpu; + auto const box_z = static_cast(System::get_system().box()[2]); auto const n_part = gpu_particle_data.n_particles(); if (not m_is_tuned and n_part != 0) { set_params(box_z, prefactor, maxPWerror, far_switch_radius, bessel_cutoff); @@ -518,7 +518,7 @@ void CoulombMMM1DGpu::add_long_range_forces() { throw std::runtime_error("MMM1D was not initialized correctly"); } - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; auto const positions_device = gpu.get_particle_positions_device(); auto const charges_device = gpu.get_particle_charges_device(); auto const forces_device = gpu.get_particle_forces_device(); @@ -551,7 +551,7 @@ void CoulombMMM1DGpu::add_long_range_energy() { throw std::runtime_error("MMM1D was not initialized correctly"); } - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; auto const positions_device = gpu.get_particle_positions_device(); auto const charges_device = gpu.get_particle_charges_device(); auto const energy_device = gpu.get_energy_device(); @@ -573,7 +573,7 @@ float CoulombMMM1DGpu::force_benchmark() { float elapsedTime; float *dev_f_benchmark; - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; auto const positions_device = gpu.get_particle_positions_device(); auto const charges_device = gpu.get_particle_charges_device(); auto const n_part = gpu.n_particles(); diff --git a/src/core/electrostatics/p3m_gpu.cpp b/src/core/electrostatics/p3m_gpu.cpp index ff5c9bfc9d9..8e223f6b477 100644 --- a/src/core/electrostatics/p3m_gpu.cpp +++ b/src/core/electrostatics/p3m_gpu.cpp @@ -50,7 +50,7 @@ void CoulombP3MGPU::init() { void CoulombP3MGPU::init_cpu_kernels() { CoulombP3M::init(); } void CoulombP3MGPU::request_gpu() const { - auto &gpu_particle_data = 
System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.enable_property(GpuParticleData::prop::force); gpu_particle_data.enable_property(GpuParticleData::prop::q); gpu_particle_data.enable_property(GpuParticleData::prop::pos); diff --git a/src/core/electrostatics/p3m_gpu.hpp b/src/core/electrostatics/p3m_gpu.hpp index 8cfdcd24da8..0107384ac75 100644 --- a/src/core/electrostatics/p3m_gpu.hpp +++ b/src/core/electrostatics/p3m_gpu.hpp @@ -39,6 +39,7 @@ struct CoulombP3MGPU : public CoulombP3M { void init(); void on_activation() { + request_gpu(); CoulombP3M::on_activation(); if (is_tuned()) { init_cpu_kernels(); diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu index f17bd96d99d..0fabc2aadcd 100644 --- a/src/core/electrostatics/p3m_gpu_cuda.cu +++ b/src/core/electrostatics/p3m_gpu_cuda.cu @@ -552,8 +552,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1) throw std::runtime_error("P3M: invalid mesh size"); - auto &gpu_particle_data = System::Instance().gpu; - + auto &gpu_particle_data = System::get_system().gpu; bool do_reinit = false, mesh_changed = false; p3m_gpu_data.n_part = static_cast(gpu_particle_data.n_particles()); @@ -576,7 +575,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { do_reinit = true; } - auto const box_l = System::Instance().box(); + auto const box_l = System::get_system().box(); if (!p3m_gpu_data_initialized || (p3m_gpu_data.box[0] != box_l[0]) || (p3m_gpu_data.box[1] != box_l[1]) || (p3m_gpu_data.box[2] != box_l[2])) { @@ -680,7 +679,7 @@ void p3m_gpu_init(int cao, const int mesh[3], double alpha) { * \brief The long-range part of the P3M algorithm. */ void p3m_gpu_add_farfield_force(double prefactor) { - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; p3m_gpu_data.n_part = static_cast(gpu.n_particles()); if (p3m_gpu_data.n_part == 0) diff --git a/src/core/energy.cpp b/src/core/energy.cpp index 517d943a737..bd0206d9590 100644 --- a/src/core/energy.cpp +++ b/src/core/energy.cpp @@ -51,7 +51,7 @@ std::shared_ptr calculate_energy() { auto &obs_energy = *obs_energy_ptr; #if defined(CUDA) and (defined(ELECTROSTATICS) or defined(DIPOLES)) - auto &gpu_particle_data = System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.clear_energy_on_device(); gpu_particle_data.update(); #endif diff --git a/src/core/forces.cpp b/src/core/forces.cpp index a5e3eefaeae..fac6d78fc31 100644 --- a/src/core/forces.cpp +++ b/src/core/forces.cpp @@ -146,7 +146,7 @@ void force_calc(CellStructure &cell_structure, double time_step, double kT) { ESPRESSO_PROFILER_CXX_MARK_FUNCTION; #ifdef CUDA - auto &espresso_system = System::Instance(); + auto &espresso_system = System::get_system(); espresso_system.gpu.update(); #endif diff --git a/src/core/magnetostatics/barnes_hut_gpu.cpp b/src/core/magnetostatics/barnes_hut_gpu.cpp index 50caca8c445..62409463a7e 100644 --- a/src/core/magnetostatics/barnes_hut_gpu.cpp +++ b/src/core/magnetostatics/barnes_hut_gpu.cpp @@ -42,7 +42,7 @@ DipolarBarnesHutGpu::DipolarBarnesHutGpu(double prefactor, double epssq, if (m_epssq <= 0.) 
{ throw std::domain_error("Parameter 'epssq' must be > 0"); } - auto &gpu_particle_data = System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.enable_property(GpuParticleData::prop::force); gpu_particle_data.enable_property(GpuParticleData::prop::torque); gpu_particle_data.enable_property(GpuParticleData::prop::pos); @@ -65,7 +65,7 @@ int call_kernel(void (*fp)(Args...), ArgRef &&...args) { } int DipolarBarnesHutGpu::initialize_data_structure() { - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; auto const n_part = static_cast(gpu.n_particles()); auto const error_code = call_kernel(allocBHmemCopy, n_part, &m_bh_data); @@ -84,7 +84,7 @@ int DipolarBarnesHutGpu::initialize_data_structure() { } void DipolarBarnesHutGpu::add_long_range_forces() { - auto &gpu_particle_data = System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.update(); if (this_node == 0) { if (initialize_data_structure() == ES_OK) { @@ -97,7 +97,7 @@ void DipolarBarnesHutGpu::add_long_range_forces() { } void DipolarBarnesHutGpu::long_range_energy() { - auto &gpu_particle_data = System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.update(); if (this_node == 0) { if (initialize_data_structure() == ES_OK) { diff --git a/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp b/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp index 17c6ae397ad..d2603adc95a 100644 --- a/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp +++ b/src/core/magnetostatics/dipolar_direct_sum_gpu.cpp @@ -38,7 +38,7 @@ static void get_simulation_box(float *box, int *per) { DipolarDirectSumGpu::DipolarDirectSumGpu(double prefactor) : prefactor{prefactor} { - auto &gpu_particle_data = System::Instance().gpu; + auto &gpu_particle_data = System::get_system().gpu; gpu_particle_data.enable_property(GpuParticleData::prop::force); gpu_particle_data.enable_property(GpuParticleData::prop::torque); gpu_particle_data.enable_property(GpuParticleData::prop::pos); @@ -46,7 +46,7 @@ DipolarDirectSumGpu::DipolarDirectSumGpu(double prefactor) } void DipolarDirectSumGpu::add_long_range_forces() const { - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; gpu.update(); if (this_node != 0) { return; @@ -65,7 +65,7 @@ void DipolarDirectSumGpu::add_long_range_forces() const { } void DipolarDirectSumGpu::long_range_energy() const { - auto &gpu = System::Instance().gpu; + auto &gpu = System::get_system().gpu; gpu.update(); if (this_node != 0) { return; diff --git a/src/core/system/System.cpp b/src/core/system/System.cpp index 44f083d7d54..945b485b5c8 100644 --- a/src/core/system/System.cpp +++ b/src/core/system/System.cpp @@ -22,6 +22,18 @@ #include -System *System::m_instance = nullptr; +#include + +namespace System { + +static std::shared_ptr instance; + +void set_system(std::shared_ptr new_instance) { + instance = new_instance; +} + +System &get_system() { return *instance; } Utils::Vector3d System::box() const { return ::box_geo.length(); } + +} // namespace System diff --git a/src/core/system/System.hpp b/src/core/system/System.hpp index a102f7c1c89..590be3b084e 100644 --- a/src/core/system/System.hpp +++ b/src/core/system/System.hpp @@ -26,22 +26,21 @@ #include #include +#include + +namespace System { class System { public: - static System &Instance() { - if (!m_instance) - m_instance = new System; - - return *m_instance; - } - #ifdef CUDA GpuParticleData gpu; #endif // ifdef CUDA Utils::Vector3d 
box() const; - -protected: - static System *m_instance; }; + +System &get_system(); + +void set_system(std::shared_ptr new_instance); + +} // namespace System diff --git a/src/core/unit_tests/EspressoSystem_test.cpp b/src/core/unit_tests/EspressoSystem_test.cpp index 71bbbca6f0c..73d242bfa08 100644 --- a/src/core/unit_tests/EspressoSystem_test.cpp +++ b/src/core/unit_tests/EspressoSystem_test.cpp @@ -45,7 +45,7 @@ #include namespace espresso { -static auto &system = System::Instance(); +static auto system = ::System::System{}; } /* Decorator to run a unit test depending on GPU availability. */ diff --git a/src/python/espressomd/system.py b/src/python/espressomd/system.py index 2964f81df03..f58968ea770 100644 --- a/src/python/espressomd/system.py +++ b/src/python/espressomd/system.py @@ -45,7 +45,7 @@ @script_interface_register -class _Globals(ScriptInterfaceHelper): +class _System(ScriptInterfaceHelper): """ Wrapper class required for technical reasons only. @@ -55,23 +55,17 @@ class _Globals(ScriptInterfaceHelper): System class, and adding that object as the first element of the ordered dict that is used during serialization. When the System class is reloaded, the ordered dict is walked through and objects are deserialized in the same - order. Since many objects depend on the box length, the `_Globals` has + order. Since many objects depend on the box length, the `_System` has to be deserialized first. This guarantees the box geometry is already set in the core before e.g. particles and bonds are deserialized. """ - _so_name = "System::Globals" + _so_name = "System::System" _so_creation_policy = "GLOBAL" - - def __setattr__(self, attr, value): - if attr == "periodicity": - utils.check_type_or_throw_except( - value, 3, bool, "Attribute 'periodicity' must be a list of 3 bools") - if attr == "box_l": - utils.check_type_or_throw_except( - value, 3, float, "Attribute 'box_l' must be a list of 3 floats") - super().__setattr__(attr, value) - utils.handle_errors(f"while assigning system parameter '{attr}'") + _so_bind_methods = ( + "setup_type_map", + "number_of_particles", + "rotate_system") @script_interface_register @@ -99,6 +93,13 @@ class System(ScriptInterfaceHelper): non_bonded_inter: :class:`espressomd.interactions.NonBondedInteractions` part: :class:`espressomd.particle_data.ParticleList` thermostat: :class:`espressomd.thermostat.Thermostat` + box_l: (3,) array_like of :obj:`float` + Dimensions of the simulation box. + periodicity: (3,) array_like of :obj:`bool` + System periodicity in ``[x, y, z]``, ``False`` for no periodicity + in this direction, ``True`` for periodicity + min_global_cut : :obj:`float` + Minimal interaction cutoff. 
Methods ------- @@ -151,24 +152,19 @@ class System(ScriptInterfaceHelper): How much to rotate """ - _so_name = "System::System" + _so_name = "System::SystemFacade" _so_creation_policy = "GLOBAL" - _so_bind_methods = ( - "setup_type_map", - "number_of_particles", - "rotate_system") - - def __getattr__(self, attr): - if attr in self.__dict__.get("_globals_parameters", []): - return self._globals.__getattr__(attr) - else: - return super().__getattr__(attr) + _so_bind_methods = _System._so_bind_methods def __setattr__(self, attr, value): - if attr in self.__dict__.get("_globals_parameters", []): - self._globals.__setattr__(attr, value) - else: - super().__setattr__(attr, value) + if attr == "periodicity": + utils.check_type_or_throw_except( + value, 3, bool, "Attribute 'periodicity' must be a list of 3 bools") + if attr == "box_l": + utils.check_type_or_throw_except( + value, 3, float, "Attribute 'box_l' must be a list of 3 floats") + super().__setattr__(attr, value) + utils.handle_errors(f"while assigning system parameter '{attr}'") def __init__(self, **kwargs): if "sip" in kwargs: @@ -187,10 +183,11 @@ def __init__(self, **kwargs): if has_features("VIRTUAL_SITES"): setable_properties.append("_active_virtual_sites_handle") - self._globals = _Globals() - self._globals_parameters = self._globals._valid_parameters() + self.call_method("set_system_handle", obj=_System(**kwargs)) self.integrator = integrate.IntegratorHandle() - self.box_l = kwargs.pop("box_l") + for key in ("box_l", "periodicity", "min_global_cut"): + if key in kwargs: + del kwargs[key] for arg in kwargs: if arg not in setable_properties: raise ValueError( @@ -233,11 +230,10 @@ def __reduce__(self): def _restore_object(cls, so_callback, so_callback_args, state): so = so_callback(*so_callback_args) so.__setstate__(state) - so._globals_parameters = so._globals._valid_parameters() return so def __getstate__(self): - checkpointable_properties = ["_globals", "integrator"] + checkpointable_properties = ["integrator"] if has_features("VIRTUAL_SITES"): checkpointable_properties.append("_active_virtual_sites_handle") checkpointable_properties += [ @@ -252,59 +248,18 @@ def __getstate__(self): checkpointable_properties += ["ekcontainer", "ekreactions"] odict = collections.OrderedDict() + odict["_system_handle"] = self.call_method("get_system_handle") for property_name in checkpointable_properties: odict[property_name] = System.__getattribute__(self, property_name) return odict def __setstate__(self, params): # note: this class is initialized twice by pickle + self.call_method("set_system_handle", obj=params.pop("_system_handle")) for property_name in params.keys(): System.__setattr__(self, property_name, params[property_name]) self.call_method("lock_system_creation") - @property - def box_l(self): - """ - Dimensions of the simulation box. - - Type: (3,) array_like of :obj:`float` - - """ - return self._globals.box_l - - @box_l.setter - def box_l(self, value): - self._globals.box_l = value - - @property - def periodicity(self): - """ - System periodicity in ``[x, y, z]``, ``False`` for no periodicity - in this direction, ``True`` for periodicity - - Type: (3,) array_like of :obj:`bool` - - """ - return self._globals.periodicity - - @periodicity.setter - def periodicity(self, value): - self._globals.periodicity = value - - @property - def min_global_cut(self): - """ - Minimal interaction cutoff. 
- - Type: :obj:`float` - - """ - return self._globals.min_global_cut - - @min_global_cut.setter - def min_global_cut(self, value): - self._globals.min_global_cut = value - @property def force_cap(self): """ diff --git a/src/script_interface/electrostatics/CoulombP3MGPU.hpp b/src/script_interface/electrostatics/CoulombP3MGPU.hpp index 6dab09b46ac..c05e130efa6 100644 --- a/src/script_interface/electrostatics/CoulombP3MGPU.hpp +++ b/src/script_interface/electrostatics/CoulombP3MGPU.hpp @@ -91,7 +91,6 @@ class CoulombP3MGPU : public Actor { get_value(params, "timings"), get_value(params, "verbose"), get_value(params, "check_complex_residuals")); }); - m_actor->request_gpu(); set_charge_neutrality_tolerance(params); } }; diff --git a/src/script_interface/system/CMakeLists.txt b/src/script_interface/system/CMakeLists.txt index 256046ab897..d57ff870555 100644 --- a/src/script_interface/system/CMakeLists.txt +++ b/src/script_interface/system/CMakeLists.txt @@ -21,5 +21,4 @@ target_sources( espresso_script_interface PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/initialize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CudaInitHandle.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Globals.cpp ${CMAKE_CURRENT_SOURCE_DIR}/System.cpp) diff --git a/src/script_interface/system/Globals.cpp b/src/script_interface/system/Globals.cpp deleted file mode 100644 index c1bf5f186c2..00000000000 --- a/src/script_interface/system/Globals.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2013-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include "Globals.hpp" - -#include "core/event.hpp" -#include "core/grid.hpp" -#include "core/nonbonded_interactions/nonbonded_interaction_data.hpp" - -#include - -#include -#include -#include - -namespace ScriptInterface { -namespace System { - -static Utils::Vector3b get_periodicity() { - return {::box_geo.periodic(0), ::box_geo.periodic(1), ::box_geo.periodic(2)}; -} - -static void set_periodicity(Utils::Vector3b const &value) { - for (int i = 0; i < 3; ++i) { - ::box_geo.set_periodic(i, value[i]); - } - on_periodicity_change(); -} - -Globals::Globals() { - add_parameters({ - {"box_l", - [this](Variant const &v) { - context()->parallel_try_catch([&]() { - auto const new_value = get_value(v); - if (not(new_value > Utils::Vector3d::broadcast(0.))) { - throw std::domain_error("Attribute 'box_l' must be > 0"); - } - box_geo.set_length(new_value); - on_boxl_change(); - }); - }, - []() { return ::box_geo.length(); }}, - {"min_global_cut", - [this](Variant const &v) { - context()->parallel_try_catch([&]() { - auto const new_value = get_value(v); - if (new_value < 0. 
and new_value != INACTIVE_CUTOFF) { - throw std::domain_error("Attribute 'min_global_cut' must be >= 0"); - } - set_min_global_cut(new_value); - }); - }, - []() { return ::get_min_global_cut(); }}, - {"periodicity", - [this](Variant const &v) { - context()->parallel_try_catch( - [&]() { set_periodicity(get_value(v)); }); - }, - []() { return get_periodicity(); }}, - }); -} - -} // namespace System -} // namespace ScriptInterface diff --git a/src/script_interface/system/Globals.hpp b/src/script_interface/system/Globals.hpp deleted file mode 100644 index 0d31911bd10..00000000000 --- a/src/script_interface/system/Globals.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2013-2022 The ESPResSo project - * - * This file is part of ESPResSo. - * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_SYSTEM_GLOBALS_HPP -#define ESPRESSO_SRC_SCRIPT_INTERFACE_SYSTEM_GLOBALS_HPP - -#include "script_interface/ScriptInterface.hpp" -#include "script_interface/auto_parameters/AutoParameters.hpp" - -#include -#include -#include - -namespace ScriptInterface { -namespace System { - -class Globals : public AutoParameters { -public: - Globals(); - -private: - void do_construct(VariantMap const ¶ms) override { - /* When reloading the system state from a checkpoint file, - * the order of global variables instantiation matters. - * The @c box_l must be set before any other global variable. - * All these globals re-initialize the cell system, and we - * cannot use the default-constructed @c box_geo when e.g. - * long-range interactions exist in the system, otherwise - * runtime errors about the local geometry being smaller - * than the interaction range would be raised. 
- */ - if (not params.empty()) { - auto const keys = - std::vector{"box_l", "periodicity", "min_global_cut"}; - assert(params.size() == keys.size()); - for (auto const &key : keys) { - do_set_parameter(key, params.at(key)); - } - } - } -}; - -} // namespace System -} // namespace ScriptInterface - -#endif diff --git a/src/script_interface/system/System.cpp b/src/script_interface/system/System.cpp index 20e490baea1..061a2a531c4 100644 --- a/src/script_interface/system/System.cpp +++ b/src/script_interface/system/System.cpp @@ -19,17 +19,20 @@ #include "System.hpp" -#include "config/config.hpp" - #include "core/cells.hpp" #include "core/event.hpp" #include "core/grid.hpp" +#include "core/nonbonded_interactions/nonbonded_interaction_data.hpp" #include "core/object-in-fluid/oif_global_forces.hpp" #include "core/particle_node.hpp" #include "core/rotate_system.hpp" +#include "core/system/System.hpp" #include +#include +#include +#include #include #include @@ -38,12 +41,83 @@ namespace System { static bool system_created = false; +static Utils::Vector3b get_periodicity() { + return {::box_geo.periodic(0), ::box_geo.periodic(1), ::box_geo.periodic(2)}; +} + +static void set_periodicity(Utils::Vector3b const &value) { + for (int i = 0; i < 3; ++i) { + ::box_geo.set_periodic(i, value[i]); + } + on_periodicity_change(); +} + System::System() { add_parameters({ + {"box_l", + [this](Variant const &v) { + context()->parallel_try_catch([&]() { + auto const new_value = get_value(v); + if (not(new_value > Utils::Vector3d::broadcast(0.))) { + throw std::domain_error("Attribute 'box_l' must be > 0"); + } + box_geo.set_length(new_value); + on_boxl_change(); + }); + }, + []() { return ::box_geo.length(); }}, + {"min_global_cut", + [this](Variant const &v) { + context()->parallel_try_catch([&]() { + auto const new_value = get_value(v); + if (new_value < 0. and new_value != INACTIVE_CUTOFF) { + throw std::domain_error("Attribute 'min_global_cut' must be >= 0"); + } + set_min_global_cut(new_value); + }); + }, + []() { return ::get_min_global_cut(); }}, + {"periodicity", + [this](Variant const &v) { + context()->parallel_try_catch( + [&]() { set_periodicity(get_value(v)); }); + }, + []() { return get_periodicity(); }}, {"max_oif_objects", ::max_oif_objects}, }); } +void System::do_construct(VariantMap const ¶ms) { + /* When reloading the system state from a checkpoint file, + * the order of global variables instantiation matters. + * The @c box_l must be set before any other global variable. + * All these globals re-initialize the cell system, and we + * cannot use the default-constructed @c box_geo when e.g. + * long-range interactions exist in the system, otherwise + * runtime errors about the local geometry being smaller + * than the interaction range would be raised. 
+ */ + auto const valid_params = get_valid_parameters(); + std::set const skip{{"box_l", "periodicity", "min_global_cut"}}; + std::set const allowed{valid_params.begin(), valid_params.end()}; + + m_instance = std::make_shared<::System::System>(); + ::System::set_system(m_instance); + + do_set_parameter("box_l", params.at("box_l")); + if (params.count("periodicity")) { + do_set_parameter("periodicity", params.at("periodicity")); + } + if (params.count("min_global_cut")) { + do_set_parameter("min_global_cut", params.at("min_global_cut")); + } + for (auto const &[name, value] : params) { + if (skip.count(name) == 0 and allowed.count(name) != 0) { + do_set_parameter(name, value); + } + } +} + /** Rescale all particle positions in direction @p dir by a factor @p scale. * @param dir direction to scale (0/1/2 = x/y/z, 3 = x+y+z isotropically) * @param scale factor by which to rescale (>1: stretch, <1: contract) diff --git a/src/script_interface/system/System.hpp b/src/script_interface/system/System.hpp index 13ab4a44fe4..b7f42ca603b 100644 --- a/src/script_interface/system/System.hpp +++ b/src/script_interface/system/System.hpp @@ -17,25 +17,29 @@ * along with this program. If not, see . */ -#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_SYSTEM_SYSTEM_HPP -#define ESPRESSO_SRC_SCRIPT_INTERFACE_SYSTEM_SYSTEM_HPP +#pragma once + +#include "core/system/System.hpp" #include "script_interface/ScriptInterface.hpp" #include "script_interface/auto_parameters/AutoParameters.hpp" +#include #include namespace ScriptInterface { namespace System { class System : public AutoParameters { + std::shared_ptr<::System::System> m_instance; + public: System(); + + void do_construct(VariantMap const ¶ms) override; Variant do_call_method(std::string const &name, VariantMap const ¶meters) override; }; } // namespace System } // namespace ScriptInterface - -#endif diff --git a/src/script_interface/system/SystemFacade.hpp b/src/script_interface/system/SystemFacade.hpp new file mode 100644 index 00000000000..e4100342cff --- /dev/null +++ b/src/script_interface/system/SystemFacade.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013-2022 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once + +#include "System.hpp" + +#include "script_interface/ScriptInterface.hpp" + +#include +#include + +namespace ScriptInterface { +namespace System { + +/** + * @brief Facade for the real @ref System class. + * All calls are forwarded to the real class. + * This construct is needed to satisfy the requirements + * of the checkpointing mechanism, which instantiates + * the @c SystemFacade after all other script interface + * objects have been instantiated. 
+ */ +class SystemFacade : public ObjectHandle { + std::shared_ptr m_instance; + +public: + SystemFacade() { + // create a dummy system to be able to read the list of parameters + m_instance = std::make_shared(); + } + void do_construct(VariantMap const &) override{}; + Variant get_parameter(const std::string &name) const override { + return m_instance->get_parameter(name); + } + void do_set_parameter(const std::string &name, const Variant &v) override { + m_instance->do_set_parameter(name, v); + } + Utils::Span valid_parameters() const override { + return m_instance->valid_parameters(); + } + Variant do_call_method(std::string const &name, + VariantMap const ¶ms) override { + if (name == "get_system_handle") { + return m_instance; + } + if (name == "set_system_handle") { + m_instance = std::dynamic_pointer_cast( + get_value(params, "obj")); + return {}; + } + return m_instance->do_call_method(name, params); + } +}; + +} // namespace System +} // namespace ScriptInterface diff --git a/src/script_interface/system/initialize.cpp b/src/script_interface/system/initialize.cpp index f83a34b0a74..a89cd46d335 100644 --- a/src/script_interface/system/initialize.cpp +++ b/src/script_interface/system/initialize.cpp @@ -20,15 +20,15 @@ #include "initialize.hpp" #include "CudaInitHandle.hpp" -#include "Globals.hpp" #include "System.hpp" +#include "SystemFacade.hpp" namespace ScriptInterface { namespace System { void initialize(Utils::Factory *om) { om->register_new("System::CudaInitHandle"); - om->register_new("System::Globals"); + om->register_new("System::SystemFacade"); om->register_new("System::System"); } From 91550b824e02f42c87e205189650ac759d06a251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Mon, 12 Jun 2023 14:10:35 +0200 Subject: [PATCH 5/5] Deallocate resources before normal program termination At normal program termination, thrust vectors might enter their destructors when the CUDA primary context manager has already expired, making it impossible to request the CUDA driver to free device memory. To avoid this condition, deallocation methods are wrapped in callbacks that are called by a cleanup class before normal program termination. --- src/core/cuda/CudaDeviceAllocator.hpp | 70 --------- src/core/electrostatics/mmm1d_gpu.cpp | 2 +- src/core/system/GpuParticleData.cpp | 9 ++ src/core/system/GpuParticleData.hpp | 10 +- src/core/system/GpuParticleData_cuda.cu | 81 +++++++--- src/core/system/ResourceCleanup.hpp | 156 +++++++++++++++++++ src/core/system/System.cpp | 2 + src/core/system/System.hpp | 11 +- src/core/unit_tests/CMakeLists.txt | 2 + src/core/unit_tests/EspressoSystem_test.cpp | 14 +- src/core/unit_tests/ResourceCleanup_test.cpp | 87 +++++++++++ src/python/espressomd/system.py | 9 ++ src/script_interface/system/System.cpp | 11 ++ 13 files changed, 362 insertions(+), 102 deletions(-) delete mode 100644 src/core/cuda/CudaDeviceAllocator.hpp create mode 100644 src/core/system/ResourceCleanup.hpp create mode 100644 src/core/unit_tests/ResourceCleanup_test.cpp diff --git a/src/core/cuda/CudaDeviceAllocator.hpp b/src/core/cuda/CudaDeviceAllocator.hpp deleted file mode 100644 index 2b1517e2734..00000000000 --- a/src/core/cuda/CudaDeviceAllocator.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010-2022 The ESPResSo project - * - * This file is part of ESPResSo. 
- * - * ESPResSo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * ESPResSo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#pragma once - -#ifndef __CUDACC__ -#error "This header can only be used with a CUDA-capable compiler." -#endif - -#include -#include -#include -#include - -#include - -/** - * @brief Allocator that uses CUDA to allocate GPU memory. - * - * This allocator uses thrust::malloc and thrust::free to - * manage device memory. To be able to have static containers - * for device memory, exceptions in deallocations are suppressed. - * This is needed because static containers may be destroyed after - * the CUDA system was teared down, leading to device_free to fail. - * - * @tparam T Type to allocate memory for. - */ -template struct CudaDeviceAllocator { - using value_type = T; - using pointer = thrust::device_ptr; - using const_pointer = thrust::device_ptr; - using reference = thrust::device_reference; - using const_reference = thrust::device_reference; - - CudaDeviceAllocator() noexcept = default; - template - explicit CudaDeviceAllocator(const CudaDeviceAllocator &) {} - template bool operator==(const CudaDeviceAllocator &) const { - return true; - } - template bool operator!=(const CudaDeviceAllocator &) const { - return false; - } - - pointer allocate(const std::size_t n) const { - return thrust::device_malloc(n); - } - void deallocate(pointer p, std::size_t) const noexcept { - try { - thrust::device_free(p); - } catch (thrust::system::system_error const &) { - } - } -}; diff --git a/src/core/electrostatics/mmm1d_gpu.cpp b/src/core/electrostatics/mmm1d_gpu.cpp index 0be54f7992b..9d73f35c67e 100644 --- a/src/core/electrostatics/mmm1d_gpu.cpp +++ b/src/core/electrostatics/mmm1d_gpu.cpp @@ -75,7 +75,7 @@ void CoulombMMM1DGpu::sanity_checks_cell_structure() const { } void CoulombMMM1DGpu::tune() { - System::get_system().gpu.init(); + System::get_system().gpu.update(); if (this_node == 0) { setup(); tune(maxPWerror, far_switch_radius, bessel_cutoff); diff --git a/src/core/system/GpuParticleData.cpp b/src/core/system/GpuParticleData.cpp index 83a8ed4fd50..d27bfc53b66 100644 --- a/src/core/system/GpuParticleData.cpp +++ b/src/core/system/GpuParticleData.cpp @@ -52,6 +52,15 @@ void GpuParticleData::copy_particles_to_device() { copy_particles_to_device(::cell_structure.local_particles(), ::this_node); } +bool GpuParticleData::has_compatible_device() const { + auto result = false; + if (::this_node == 0) { + result = has_compatible_device_impl(); + } + boost::mpi::broadcast(::comm_cart, result, 0); + return result; +} + BOOST_IS_BITWISE_SERIALIZABLE(GpuParticleData::GpuParticle) namespace boost { diff --git a/src/core/system/GpuParticleData.hpp b/src/core/system/GpuParticleData.hpp index 4c7659e7156..8d5cc109bdd 100644 --- a/src/core/system/GpuParticleData.hpp +++ b/src/core/system/GpuParticleData.hpp @@ -78,7 +78,7 @@ class GpuParticleData { private: // forward declare - struct Storage; + class Storage; /** @brief Whether a device was found and data structures were allocated. 
*/ bool m_communication_enabled = false; /** @brief Whether to convert particle properties from AoS to SoA. */ @@ -86,8 +86,9 @@ class GpuParticleData { /** @brief Whether particle transfer to the GPU was requested. */ bool m_need_particles_update = false; /** @brief Host and device containers. */ - std::unique_ptr m_data; + std::shared_ptr m_data; + bool has_compatible_device_impl() const; void gpu_init_particle_comm(); void enable_particle_transfer(); void copy_particles_to_device(); @@ -101,7 +102,7 @@ class GpuParticleData { Utils::Span host_torques) const; public: - GpuParticleData(); + GpuParticleData() = default; ~GpuParticleData(); void update() { @@ -109,11 +110,12 @@ class GpuParticleData { copy_particles_to_device(); } } - void init() { update(); } + void init(); void enable_property(std::size_t property); void clear_energy_on_device(); void copy_forces_to_host(ParticleRange const &particles, int this_node); std::size_t n_particles() const; + bool has_compatible_device() const; GpuEnergy copy_energy_to_host() const; GpuEnergy *get_energy_device() const; diff --git a/src/core/system/GpuParticleData_cuda.cu b/src/core/system/GpuParticleData_cuda.cu index b129fc3d465..f03e3556f90 100644 --- a/src/core/system/GpuParticleData_cuda.cu +++ b/src/core/system/GpuParticleData_cuda.cu @@ -24,11 +24,12 @@ #include "config/config.hpp" #include "GpuParticleData.hpp" +#include "ResourceCleanup.hpp" +#include "System.hpp" #include "ParticleRange.hpp" #include "errorhandling.hpp" -#include "cuda/CudaDeviceAllocator.hpp" #include "cuda/init.hpp" #include "cuda/utils.cuh" @@ -47,11 +48,7 @@ #error CU-file includes mpi.h! This should not happen! #endif -template -using device_vector = thrust::device_vector>; - -template -T *raw_data_pointer(thrust::device_vector &vec) { +template T *raw_data_pointer(thrust::device_vector &vec) { return thrust::raw_pointer_cast(vec.data()); } @@ -60,7 +57,7 @@ template std::size_t byte_size(SpanLike const &v) { } /** - * @brief Resize a @ref device_vector. + * @brief Resize a @c thrust::device_vector. * * Due to a bug in thrust (https://github.com/thrust/thrust/issues/939), * resizing or appending to default constructed containers causes undefined @@ -68,36 +65,46 @@ template std::size_t byte_size(SpanLike const &v) { * function is used instead of the resize member function to side-step * the problem. This is done by replacing the existing vector by a new * one constructed with the desired size if resizing from capacity zero. - * Behaves as-if vec.resize(n) was called. + * Behaves as-if @c vec.resize(n) was called. * This is fixed in Thrust 1.11, shipped in CUDA 11.3 * (https://github.com/NVIDIA/thrust/commit/1c4f25d9). * * @tparam T Type contained in the vector. - * @param vec Vector To resize. - * @param n Desired new size of the element. + * @param vec Vector to resize. + * @param n Desired new size of the vector. */ template -void resize_or_replace(device_vector &vec, std::size_t n) { +void resize_or_replace(thrust::device_vector &vec, std::size_t n) { if (vec.capacity() == 0) { - vec = device_vector(n); + vec = thrust::device_vector(n); } else { vec.resize(n); } } +template void free_device_vector(thrust::device_vector &vec) { + vec.clear(); + thrust::device_vector().swap(vec); +} + /** @brief Host and device containers for particle data. 
*/ -struct GpuParticleData::Storage { +class GpuParticleData::Storage { + void free_device_memory(); + using DeviceMemory = ResourceCleanup::Attorney<&Storage::free_device_memory>; + friend DeviceMemory; + +public: /** @brief Which particle properties are needed by GPU methods. */ GpuParticleData::prop::bitset m_need; GpuParticleData::GpuEnergy *energy_device = nullptr; std::size_t current_size = 0ul; pinned_vector particle_data_host; - device_vector particle_data_device; + thrust::device_vector particle_data_device; pinned_vector particle_forces_host; - device_vector particle_forces_device; + thrust::device_vector particle_forces_device; #ifdef ROTATION pinned_vector particle_torques_host; - device_vector particle_torques_device; + thrust::device_vector particle_torques_device; #endif float *particle_pos_device = nullptr; #ifdef DIPOLES @@ -107,6 +114,13 @@ struct GpuParticleData::Storage { float *particle_q_device = nullptr; #endif + static auto make_shared() { + auto obj = std::make_shared(); + System::get_system().cleanup_queue.push(obj); + return obj; + } + + ~Storage() { free_device_memory(); } void realloc_device_memory(); void split_particle_struct(); void copy_particles_to_device(); @@ -135,8 +149,8 @@ struct GpuParticleData::Storage { #endif }; -GpuParticleData::GpuParticleData() { - m_data = std::make_unique(); +void GpuParticleData::init() { + m_data = GpuParticleData::Storage::make_shared(); } GpuParticleData::~GpuParticleData() {} @@ -184,6 +198,16 @@ void GpuParticleData::enable_property(std::size_t property) { enable_particle_transfer(); } +bool GpuParticleData::has_compatible_device_impl() const { + auto result = true; + try { + cuda_check_device(); + } catch (cuda_runtime_error const &err) { + result = false; + } + return result; +} + /** * @brief Setup and call particle reallocation from the host. */ @@ -397,3 +421,24 @@ void GpuParticleData::Storage::realloc_device_memory() { #endif current_size = new_size; } + +void GpuParticleData::Storage::free_device_memory() { + auto const free_device_pointer = [](float *&ptr) { + if (ptr != nullptr) { + cuda_safe_mem(cudaFree(reinterpret_cast(ptr))); + ptr = nullptr; + } + }; + free_device_vector(particle_data_device); + free_device_vector(particle_forces_device); +#ifdef ROTATION + free_device_vector(particle_torques_device); +#endif + free_device_pointer(particle_pos_device); +#ifdef DIPOLES + free_device_pointer(particle_dip_device); +#endif +#ifdef ELECTROSTATICS + free_device_pointer(particle_q_device); +#endif +} diff --git a/src/core/system/ResourceCleanup.hpp b/src/core/system/ResourceCleanup.hpp new file mode 100644 index 00000000000..09ba360261a --- /dev/null +++ b/src/core/system/ResourceCleanup.hpp @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2023 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#pragma once + +#include +#include +#include + +/** + * @brief Queue to deallocate resources before normal program termination. + * + * Some resources can only be deallocated when a static global object is + * still alive, for example a MPI manager, a GPU context or a file handle. + * This class can be registered as a callback for normal program termination; + * any registered resource that didn't expire will be forcefully deallocated. + * + * To this end, the "client" class needs to implement cleanup methods that + * deallocate managed resources, to be called by the class destructor when + * the instance lifetime expires, or by an "attorney" class at normal + * program termination, whichever comes first. The attorney class creates + * a callback that is pushed in a queue that gets processed before normal + * program termination at the user's discretion, or when the queue itself + * expires, if the client class instance hasn't expired already. + * + * Please note the cleanup methods need to be able to run twice, since + * the client class destructor will be called eventually, possibly after + * @c __run_exit_handlers() is called. The attorney-client idiom is used to + * make the private deallocation methods accessible to the cleanup callbacks. + */ +class ResourceCleanup { + struct Callback { + virtual ~Callback() = default; + }; + + std::list> m_resources; + +public: + ResourceCleanup() = default; + ~ResourceCleanup() { + while (not m_resources.empty()) { + m_resources.pop_back(); + } + } + + /** + * @brief Attorney for a resource deallocator. + * Usage: + * @code{.cpp} + * #include "system/ResourceCleanup.hpp" + * #include + * #include + * #include + * + * class MyClass; + * static std::shared_ptr resource_queue = nullptr; + * static std::shared_ptr resource = nullptr; + * + * class MyClass { + * private: + * thrust::device_ptr data_dev; + * void free_device_memory() { thrust::device_free(data_dev); } + * using Cleanup = ResourceCleanup::Attorney<&MyClass::free_device_memory>; + * friend Cleanup; + * + * public: + * MyClass(int n) { data_dev = thrust::device_malloc(n); } + * ~MyClass() { free_device_memory(); } + * template + * static auto make_shared(Args ...args) { + * auto obj = std::make_shared(args...); + * // comment the next line of code to trigger the CUDA runtime error + * // "device free failed: cudaErrorCudartUnloading: driver shutting down" + * ::resource_queue->push(obj); + * return obj; + * } + * }; + * + * int main() { + * ::resource_queue = std::make_shared(); + * ::resource = MyClass::make_shared(5); + * // The CUDA primary context expires before normal program termination. + * // Yet thrust vectors require a primary context to deallocate device + * // memory. Force deallocation before normal program termination. + * ::resource_queue.reset(); + * } + * @endcode + * @tparam deallocator Member function that deallocates managed resources. 
+ */ + template class Attorney { + struct detail { + template struct get_class_from_member_function_pointer; + template + struct get_class_from_member_function_pointer { + using type = C; + }; + }; + using Deallocator = decltype(deallocator); + static_assert(std::is_member_function_pointer_v); + using Container = + typename detail::template get_class_from_member_function_pointer< + Deallocator>::type; + static_assert(std::is_invocable_v, + "deallocator must have signature void(S::*)()"); + static_assert( + std::is_same_v, void>, + "deallocator must return void"); + + struct CallbackImpl : public Callback { + std::weak_ptr weak_ref; + CallbackImpl(std::shared_ptr const &ref) : weak_ref{ref} {} + ~CallbackImpl() override { + if (auto const ref = weak_ref.lock()) { + (ref.get()->*deallocator)(); + } + } + }; + + static std::unique_ptr + make_callback(std::shared_ptr const &container) { + return std::make_unique(container); + } + friend class ResourceCleanup; + }; + + [[nodiscard]] auto size() const { return m_resources.size(); } + [[nodiscard]] auto empty() const { return m_resources.empty(); } + + /** + * @brief Register a resource for cleanup. + * Internally, a weak pointer is created and stored in a callback. + * @tparam Attorney Specialization of @ref ResourceCleanup::Attorney + * that wraps the class @c Container deallocator. + * @tparam Container Class that manages resources. + */ + template + void push(std::shared_ptr const &resource) { + m_resources.emplace_back(Attorney::make_callback(resource)); + } +}; diff --git a/src/core/system/System.cpp b/src/core/system/System.cpp index 945b485b5c8..d392e36ad96 100644 --- a/src/core/system/System.cpp +++ b/src/core/system/System.cpp @@ -28,6 +28,8 @@ namespace System { static std::shared_ptr instance; +void reset_system() { instance.reset(); } + void set_system(std::shared_ptr new_instance) { instance = new_instance; } diff --git a/src/core/system/System.hpp b/src/core/system/System.hpp index 590be3b084e..f1ea2426281 100644 --- a/src/core/system/System.hpp +++ b/src/core/system/System.hpp @@ -22,6 +22,7 @@ #include "config/config.hpp" #include "GpuParticleData.hpp" +#include "ResourceCleanup.hpp" #include @@ -34,13 +35,19 @@ class System { public: #ifdef CUDA GpuParticleData gpu; -#endif // ifdef CUDA +#endif + ResourceCleanup cleanup_queue; Utils::Vector3d box() const; + void init() { +#ifdef CUDA + gpu.init(); +#endif + } }; System &get_system(); - void set_system(std::shared_ptr new_instance); +void reset_system(); } // namespace System diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt index a94a59072d8..95b6c389a20 100644 --- a/src/core/unit_tests/CMakeLists.txt +++ b/src/core/unit_tests/CMakeLists.txt @@ -34,6 +34,8 @@ unit_test(NAME EspressoSystemStandAlone_serial_test SRC MPI::MPI_CXX NUM_PROC 1) unit_test(NAME EspressoSystem_test SRC EspressoSystem_test.cpp DEPENDS espresso::core Boost::mpi NUM_PROC 2) +unit_test(NAME ResourceCleanup_test SRC ResourceCleanup_test.cpp DEPENDS + espresso::core Boost::mpi NUM_PROC 2) unit_test(NAME MpiCallbacks_test SRC MpiCallbacks_test.cpp DEPENDS espresso::utils Boost::mpi MPI::MPI_CXX NUM_PROC 2) unit_test(NAME ParticleIterator_test SRC ParticleIterator_test.cpp DEPENDS diff --git a/src/core/unit_tests/EspressoSystem_test.cpp b/src/core/unit_tests/EspressoSystem_test.cpp index 73d242bfa08..a67c19d3947 100644 --- a/src/core/unit_tests/EspressoSystem_test.cpp +++ b/src/core/unit_tests/EspressoSystem_test.cpp @@ -44,10 +44,6 @@ #include #include -namespace espresso { 
-static auto system = ::System::System{};
-}
-
 /* Decorator to run a unit test depending on GPU availability. */
 boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
   bool has_compatible_gpu = false;
@@ -62,7 +58,11 @@ boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
 BOOST_FIXTURE_TEST_CASE(check_with_gpu, ParticleFactory,
                         *boost::unit_test::precondition(has_gpu)) {
   auto const rank = boost::mpi::communicator().rank();
-  auto &gpu = espresso::system.gpu;
+
+  auto system = std::make_shared<::System::System>();
+  System::set_system(system);
+  system->init();
+  auto &gpu = system->gpu;
 
   // check uninitialized device pointers
   BOOST_CHECK_EQUAL(gpu.get_energy_device(), nullptr);
@@ -142,6 +142,8 @@ BOOST_FIXTURE_TEST_CASE(check_with_gpu, ParticleFactory,
   remove_particle(p_id);
   gpu.update();
   BOOST_CHECK_EQUAL(gpu.n_particles(), 0);
+
+  System::reset_system();
 }
 
 int main(int argc, char **argv) {
@@ -150,8 +152,6 @@ int main(int argc, char **argv) {
 
   // initialize the MpiCallbacks framework
   Communication::init(mpi_env);
 
-  espresso::system.gpu.init();
-
   return boost::unit_test::unit_test_main(init_unit_test, argc, argv);
 }
diff --git a/src/core/unit_tests/ResourceCleanup_test.cpp b/src/core/unit_tests/ResourceCleanup_test.cpp
new file mode 100644
index 00000000000..ae14ccb6ec4
--- /dev/null
+++ b/src/core/unit_tests/ResourceCleanup_test.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2023 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_NO_MAIN
+#define BOOST_TEST_MODULE ResourceCleanup test
+#define BOOST_TEST_ALTERNATIVE_INIT_API
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include "config/config.hpp"
+
+#include "communication.hpp"
+#include "cuda/utils.hpp"
+#include "system/GpuParticleData.hpp"
+#include "system/ResourceCleanup.hpp"
+#include "system/System.hpp"
+
+#include <boost/mpi.hpp>
+
+#include <memory>
+#include <vector>
+
+class MyClass {
+  std::vector<int> m_data;
+  void deallocate() { m_data.clear(); }
+  using Cleanup = ResourceCleanup::Attorney<&MyClass::deallocate>;
+  friend Cleanup;
+
+public:
+  MyClass() { m_data = std::vector<int>(5); }
+  ~MyClass() { deallocate(); }
+  auto size() const { return m_data.size(); }
+  template <class... Args> static auto make_shared(Args... args) {
+    auto obj = std::make_shared<MyClass>(args...);
+    System::get_system().cleanup_queue.push<MyClass::Cleanup>(obj);
+    return obj;
+  }
+};
+
+BOOST_AUTO_TEST_CASE(checks) {
+  auto system = std::make_shared<::System::System>();
+  System::set_system(system);
+
+  BOOST_REQUIRE_EQUAL(system->cleanup_queue.size(), 0);
+  BOOST_REQUIRE_EQUAL(system->cleanup_queue.empty(), true);
+  system->init();
+
+#ifdef CUDA
+  if (system->gpu.has_compatible_device()) {
+    // allocate device memory to populate the cleanup queue
+    system->gpu.enable_property(GpuParticleData::prop::pos);
+    system->gpu.update();
+    BOOST_REQUIRE_EQUAL(system->cleanup_queue.size(), 1);
+    BOOST_REQUIRE_EQUAL(system->cleanup_queue.empty(), false);
+  }
+#endif
+
+  auto const obj = MyClass::make_shared();
+  BOOST_REQUIRE_EQUAL(system->cleanup_queue.empty(), false);
+  BOOST_REQUIRE_EQUAL(obj->size(), 5);
+  system.reset();
+  System::reset_system();
+  BOOST_REQUIRE_EQUAL(obj->size(), 0);
+}
+
+int main(int argc, char **argv) {
+  auto mpi_env = mpi_init(argc, argv);
+  Communication::init(mpi_env);
+
+  return boost::unit_test::unit_test_main(init_unit_test, argc, argv);
+}
diff --git a/src/python/espressomd/system.py b/src/python/espressomd/system.py
index f58968ea770..0df3bd0aaeb 100644
--- a/src/python/espressomd/system.py
+++ b/src/python/espressomd/system.py
@@ -169,6 +169,7 @@ def __setattr__(self, attr, value):
     def __init__(self, **kwargs):
         if "sip" in kwargs:
             super().__init__(**kwargs)
+            self._setup_atexit()
             return
         super().__init__()
 
@@ -220,6 +221,14 @@ def __init__(self, **kwargs):
 
         # lock class
         self.call_method("lock_system_creation")
+        self._setup_atexit()
+
+    def _setup_atexit(self):
+        import atexit
+
+        def session_shutdown():
+            self.call_method("session_shutdown")
+        atexit.register(session_shutdown)
 
     def __reduce__(self):
         so_callback, so_callback_args = super().__reduce__()
diff --git a/src/script_interface/system/System.cpp b/src/script_interface/system/System.cpp
index 061a2a531c4..9644822d1b8 100644
--- a/src/script_interface/system/System.cpp
+++ b/src/script_interface/system/System.cpp
@@ -103,6 +103,7 @@ void System::do_construct(VariantMap const &params) {
 
   m_instance = std::make_shared<::System::System>();
   ::System::set_system(m_instance);
+  m_instance->init();
 
   do_set_parameter("box_l", params.at("box_l"));
   if (params.count("periodicity")) {
@@ -198,6 +199,16 @@ Variant System::do_call_method(std::string const &name,
                      get_value<double>(parameters, "alpha"));
     return {};
   }
+  if (name == "session_shutdown") {
+    if (m_instance) {
+      if (&::System::get_system() == m_instance.get()) {
+        ::System::reset_system();
+      }
+      assert(m_instance.use_count() == 1l);
+      m_instance.reset();
+    }
+    return {};
+  }
   return {};
 }
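
For reference, the registration pattern introduced by ResourceCleanup.hpp and exercised in
ResourceCleanup_test.cpp boils down to the minimal sketch below. It is illustrative only and not
part of the patch: the class name MyResource and the free-standing queue object are made up, and
the ResourceCleanup interface is assumed to be the one added above.

// Illustrative sketch, not part of the patch; assumes the interface of
// src/core/system/ResourceCleanup.hpp as introduced in this commit.
#include "system/ResourceCleanup.hpp"

#include <cassert>
#include <memory>
#include <vector>

class MyResource {
  std::vector<int> m_data;
  void deallocate() { m_data.clear(); }
  // The attorney grants ResourceCleanup access to the private deallocator.
  using Cleanup = ResourceCleanup::Attorney<&MyResource::deallocate>;
  friend Cleanup;

public:
  MyResource() : m_data(5) {}
  ~MyResource() { deallocate(); } // cleanup must tolerate running twice
  auto size() const { return m_data.size(); }

  template <class... Args>
  static auto make_shared(ResourceCleanup &queue, Args... args) {
    auto obj = std::make_shared<MyResource>(args...);
    queue.push<MyResource::Cleanup>(obj); // stores a weak_ptr in a callback
    return obj;
  }
};

int main() {
  auto queue = std::make_unique<ResourceCleanup>();
  auto const obj = MyResource::make_shared(*queue);
  assert(obj->size() == 5u);
  // Destroying the queue while obj is still alive runs the callback, which
  // locks the weak_ptr and invokes the private deallocate() method.
  queue.reset();
  assert(obj->size() == 0u);
}

Because each callback only holds a weak pointer, a resource that already expired is skipped,
while a resource that is still alive has its private cleanup method invoked through the attorney
before the queue itself is destroyed.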